In [69]:
import torchhd
import pandas as pd
from ucimlrepo import fetch_ucirepo
import torch
from torch.utils.data import DataLoader, TensorDataset

In [70]:
# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets 

In [71]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [72]:
y

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
48837,<=50K.
48838,<=50K.
48839,<=50K.
48840,<=50K.


### Metadata

In [73]:
# metadata
print(adult.metadata)
feature_names = X.columns
print(feature_names)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

### Get numerical and categorical columns

In [74]:
numerical = list(X.select_dtypes(include='number').columns)
categorical = list(set(X.columns[1:])-set(numerical))
numerical, categorical

(['age',
  'fnlwgt',
  'education-num',
  'capital-gain',
  'capital-loss',
  'hours-per-week'],
 ['race',
  'relationship',
  'native-country',
  'education',
  'sex',
  'occupation',
  'marital-status',
  'workclass'])

### Convert categorical columns to categorical

In [75]:
for column in categorical:
    print(column)
    X[column]= X[column].astype('category')

race
relationship
native-country
education
sex
occupation
marital-status
workclass


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column]= X[column].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column]= X[column].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column]= X[column].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row

In [76]:
X.select_dtypes(include='category').describe()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
count,47879,48842,48842,47876,48842,48842,48842,48568
unique,9,16,7,15,6,5,2,42
top,Private,HS-grad,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
freq,33906,15784,22379,6172,19716,41762,32650,43832


In [77]:
X.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,38.643585,189664.1,10.078089,1079.067626,87.502314,40.422382
std,13.71051,105604.0,2.570973,7452.019058,403.004552,12.391444
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117550.5,9.0,0.0,0.0,40.0
50%,37.0,178144.5,10.0,0.0,0.0,40.0
75%,48.0,237642.0,12.0,0.0,0.0,45.0
max,90.0,1490400.0,16.0,99999.0,4356.0,99.0


In [78]:
y['income']=y['income'].apply(lambda s : s.replace('.', ''))
y['income']=y['income'].astype('category')
y['income']=y['income'].cat.codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income']=y['income'].apply(lambda s : s.replace('.', ''))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income']=y['income'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['income']=y['income'].cat.codes


In [79]:
y

Unnamed: 0,income
0,0
1,0
2,0
3,0
4,0
...,...
48837,0
48838,0
48839,0
48840,0


## Data splits

In [80]:
seed=42
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

## Standardize numerical features

In [81]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train[numerical]=scaler.fit_transform(X_train[numerical])

In [82]:
X_test[numerical] = scaler.transform(X_test[numerical])
X_test.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,9769.0,9769.0,9769.0,9769.0,9769.0,9769.0
mean,0.003976,-0.016201,0.000806,-0.008233,0.002801,0.008711
std,0.99008,0.989734,1.011133,0.9676,1.002387,0.999136
min,-1.5747,-1.613192,-3.53873,-0.145524,-0.216668,-3.179147
25%,-0.77398,-0.690619,-0.420108,-0.145524,-0.216668,-0.032339
50%,-0.118846,-0.114461,-0.03028,-0.145524,-0.216668,-0.032339
75%,0.681874,0.431296,0.749376,-0.145524,-0.216668,0.371098
max,3.739167,11.114471,2.308687,13.187777,9.465272,4.728217


## Ensure Reproducibility 

In [83]:
torch.manual_seed(seed);

## Make Dataloader

In [None]:
def get_default_device():
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")

def create_dataloader(x, y, batch_size : int = 32) -> DataLoader:
    
    x_copy = x.copy()
    y_copy = y.copy()
    
    for col in categorical:
        x_copy[col] = pd.Categorical(x_copy[col]).codes + 1 #Deal with NaN by increasing

    x_tens = torch.tensor(x_copy[categorical + numerical].values, dtype=torch.float32)
    y_tens = torch.tensor(y_copy.values, dtype=torch.long).squeeze()    
    
    dataset = TensorDataset(x_tens, y_tens)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, generator=torch.Generator(get_default_device()))

## Set Up And Train Model

In [None]:
#Get the best performance
torch.set_num_threads(10)
if torch.cuda.is_available():
    torch.set_default_device('cuda')

### AdaptHD

In [None]:
model = torchhd.classifiers.AdaptHD(n_features=X.shape[1], n_dimensions=10000, n_classes=2)
model.fit(create_dataloader(X_train, y_train))

fit: 100%|██████████| 120/120 [26:18<00:00, 13.15s/it]


AdaptHD(
  (keys): Random(14, 10000)
  (levels): Level(100, 10000)
  (model): Centroid(in_features=10000, out_features=3)
)

In [None]:
model.accuracy(create_dataloader(X_test, y_test))

0.8420513870406388

### OnlineHD

In [None]:
model = torchhd.classifiers.OnlineHD(n_features=X.shape[1], n_dimensions=10000, n_classes=2)
model.fit(create_dataloader(X_train, y_train))

In [None]:
model.accuracy(create_dataloader(X_test, y_test))

## Create New Model

In [117]:
class FHRRMixedEncoder(torch.nn.Module):
    """Will do an FHRR encoding that takes treats both the numerical and categorical data correctly"""
    def __init__(self, n_numerical_features : int, 
                 n_dimensions : int, 
                 n_categorical_bin_list : list[int], 
                 device : torch.device = None, 
                 distribution : str = 'sinc'):
        super().__init__()
        self._numerical_encoding = torchhd.embeddings.FractionalPower(n_numerical_features, n_dimensions, distribution, device=device)
        self._list_categorical_encoding = [torchhd.embeddings.Random(i, n_dimensions, vsa='FHRR') for i in n_categorical_bin_list]
        self._hyperdim = n_dimensions
        self._num_cat_features = len(self._list_categorical_encoding)
    
    def forward(self, samples : torch.Tensor) -> torch.Tensor:
        """Will encode the given data into a hypervector. The categorical features is expected to take up the first n features

        Args:
            samples (torch.Tensor): A (batch, num_features) matrix of samples to encode

        Returns:
            torch.Tensor: A (batch, hyperdim) matrix of encoded samples
        """
        reshape_at_end = False
        if len(samples.shape) == 1:
            samples = samples.view(1, -1)
            reshape_at_end = True
        elif len(samples.shape) > 2:
            raise ValueError(f'Input has shape of: {samples.shape} which is not supported')
        
        b, _ = samples.shape
        
        hvec = torch.ones((b, self._hyperdim), dtype=torch.cfloat)
        
        #Iterate over the columns of samples or in other words each feature
        for cat_feature, cat_encoder in zip(samples.T[:self._num_cat_features], self._list_categorical_encoding): 
                hvec *= cat_encoder(cat_feature.long())
                
        hvec *= self._numerical_encoding(samples[:, self._num_cat_features:])

        if reshape_at_end:
            hvec = hvec.squeeze()

        return hvec
class FHRRAdaptModel(torchhd.classifiers.AdaptHD):
    def __init__(self, n_numerical_features : int,
                 n_categorical_bin_list : list[int], 
                 n_dimensions : int, 
                 n_classes : int, *, 
                 n_levels : int= 100, 
                 min_level : int= -1, 
                 max_level : int= 1, 
                 epochs : int= 120, 
                 lr : float= 0.035,
                 device : torch.device= None, 
                 dtype : torch.dtype= None):
        super().__init__(n_numerical_features, n_dimensions, n_classes, n_levels=n_levels, min_level=min_level, max_level=max_level, epochs=epochs, lr=lr, device=device, dtype=dtype)
        
        self._encoder = FHRRMixedEncoder(n_numerical_features, n_dimensions, n_categorical_bin_list, device)
        self.model.to(torch.cfloat)
        
    def encoder(self, samples : torch.Tensor):
        return self._encoder(samples) 
        
class FHRROnlineModel(torchhd.classifiers.OnlineHD):
    def __init__(self, n_numerical_features : int, 
                 n_categorical_bin_list : list[int],
                 n_dimensions : int, 
                 n_classes : int, *, 
                 epochs : int = 120, 
                 lr :float = 0.035, 
                 device : torch.device = None,
                 dtype : torch.dtype= None):
        
        super().__init__(n_numerical_features, n_dimensions, n_classes, epochs=epochs, lr=lr, device=device, dtype=dtype)
        
        self.encoder = FHRRMixedEncoder(n_numerical_features, n_dimensions, n_categorical_bin_list, device)
        self.model.to(dtype=torch.cfloat)        

## Train New Model

In [None]:
num_unique_classes = [X[col].nunique() + 1 for col in categorical] #+1 for NaN class

[6, 7, 43, 17, 3, 16, 8, 10]


### AdaptHD with FHRR

In [118]:
n_numerical_features = X.shape[1] - len(num_unique_classes)
model = FHRRAdaptModel(n_numerical_features=n_numerical_features,
                        n_categorical_bin_list=num_unique_classes,
                        n_dimensions=10000,
                        n_classes=2)
model.fit(create_dataloader(X_train, y_train))

fit:   0%|          | 0/120 [00:00<?, ?it/s]

fit: 100%|██████████| 120/120 [05:55<00:00,  2.96s/it]


FHRRAdaptModel(
  (keys): Random(6, 10000)
  (levels): Level(100, 10000)
  (model): Centroid(in_features=10000, out_features=2)
  (_encoder): FHRRMixedEncoder(
    (_numerical_encoding): FractionalPower()
  )
)

In [119]:
model.accuracy(create_dataloader(X_test, y_test))

0.596478656976149

### OnlineHD with FHRR

In [115]:
n_numerical_features = X.shape[1] - len(num_unique_classes)
model = FHRROnlineModel(n_numerical_features=n_numerical_features,
                        n_categorical_bin_list=num_unique_classes,
                        n_dimensions=10000,
                        n_classes=2)
model.fit(create_dataloader(X_train, y_train))

fit:   0%|          | 0/120 [00:00<?, ?it/s]

fit: 100%|██████████| 120/120 [06:09<00:00,  3.08s/it]


FHRROnlineModel(
  (encoder): FHRRMixedEncoder(
    (_numerical_encoding): FractionalPower()
  )
  (model): Centroid(in_features=10000, out_features=2)
)

In [116]:
model.accuracy(create_dataloader(X_test, y_test))

0.5963762923533626