In [58]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from tab_transformer_pytorch import FTTransformer

In [59]:
df_train = pd.read_csv('/home/warin/projects/CAT-Transformer/datasets/income/train.csv')
df_test = pd.read_csv('/home/warin/projects/CAT-Transformer/datasets/income/test.csv')
df_val = pd.read_csv('/home/warin/projects/CAT-Transformer/datasets/income/validation.csv')

In [60]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,27,4,153475,9,13,4,12,1,4,0,0,0,40,39,0
1,38,4,122076,15,10,2,14,0,4,1,7298,0,43,39,1
2,39,5,206362,8,11,2,3,0,4,1,0,0,40,39,0
3,32,4,137076,15,10,2,4,0,4,1,15024,0,60,39,1
4,46,4,198660,11,9,0,8,1,4,1,0,0,40,39,0


In [61]:
cat_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']

for x in cat_columns:
    print(len(df_train[x].value_counts()))

10
16
7
16
6
5
2
42


In [62]:
class SingleTaskDataset(Dataset):
    def __init__(self, df : pd.DataFrame, cat_columns, num_columns,task1_column):
        self.n = df.shape[0]
        
        self.task1_labels = df[task1_column].astype(np.int64).values

        self.cate = df[cat_columns].astype(np.int64).values
        self.num = df[num_columns].astype(np.float32).values


    def __len__(self):
        return self.n
    
    def __getitem__(self, idx):
        # Retrieve features and labels from the dataframe using column names
        cat_features = self.cate[idx]
        num_features = self.num[idx]
        labels_task1 = self.task1_labels[idx]

        return cat_features, num_features, labels_task1
        # return self.x[index], self.task1_labels[index], self.task2_labels[index]

cat_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
num_columns = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
train_dataset = SingleTaskDataset(df_train, cat_columns, num_columns, 'income')
val_dataset = SingleTaskDataset(df_val, cat_columns, num_columns, 'income')
test_dataset = SingleTaskDataset(df_test, cat_columns, num_columns, 'income')

batch_size = 256

# Wrapping with DataLoader for easy batch extraction
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [63]:
class UncertaintyLoss(nn.Module):
    def __init__(self, num_tasks):
        super(UncertaintyLoss, self).__init__()
        self.num_tasks = num_tasks

        self.loss_fns = [nn.CrossEntropyLoss() for x in range(num_tasks)] 

    def forward(self, prediction, labels_task1):

        #task 1
        target = labels_task1
        # prediction = predictions[0]
        loss_fn = self.loss_fns[0]
        task_loss = loss_fn(prediction, target)
        
        return task_loss
    
# Training and Testing Loops
def train(dataloader, model, loss_function, optimizer, device_in_use):
    model.train()

    total_loss = 0

    total_correct_1 = 0
    total_samples_1 = 0
    all_targets_1 = []
    all_predictions_1 = []

    total_correct_2 = 0
    total_samples_2 = 0
    all_targets_2 = []
    all_predictions_2 = []

    for (cat, num,labels_task1) in dataloader:
        cat, num,labels_task1 = cat.to(device_in_use), num.to(device_in_use),labels_task1.to(device_in_use)


        task_predictions = model(cat,num) #contains a list of the tensor outputs for each task
        task_predictions = task_predictions.squeeze(1)
        print("prediction shape",task_predictions.shape)
        print("labels shape", labels_task1.shape)
        print(task_predictions)
        print(labels_task1)
        loss = loss_function(task_predictions, labels_task1)
        total_loss += loss.item()

        #computing accuracy for first target
        y_pred_softmax_1 = torch.softmax(task_predictions[0], dim=1)
        _, y_pred_labels_1 = torch.max(y_pred_softmax_1, dim=1)
        total_correct_1 += (y_pred_labels_1 == labels_task1).sum().item()
        total_samples_1 += labels_task1.size(0)
        all_targets_1.extend(labels_task1.cpu().numpy())
        all_predictions_1.extend(y_pred_labels_1.cpu().numpy())

        # #computing accuaracy for second target
        # y_pred_softmax_2 = torch.softmax(task_predictions[1], dim=1)
        # _, y_pred_labels_2 = torch.max(y_pred_softmax_2, dim=1)
        # total_correct_2 += (y_pred_labels_2 == labels_task2).sum().item()
        # total_samples_2 += labels_task2.size(0)
        # all_targets_2.extend(labels_task2.cpu().numpy())
        # all_predictions_2.extend(y_pred_labels_2.cpu().numpy())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    avg_loss = total_loss/len(dataloader)
    accuracy_1 = total_correct_1 / total_samples_1
    # accuracy_2 = total_correct_2 / total_samples_2

    # # precision = precision_score(all_targets, all_predictions, average='weighted')
    # recall = recall_score(all_targets, all_predictions, average='weighted')
    # f1 = f1_score(all_targets, all_predictions, average='weighted')

    return avg_loss, accuracy_1

def test(dataloader, model, loss_function, device_in_use):
  model.eval()
  total_loss = 0
  
  total_correct_1 = 0
  total_samples_1 = 0
  all_targets_1 = []
  all_predictions_1 = []

  total_correct_2 = 0
  total_samples_2 = 0
  all_targets_2 = []
  all_predictions_2 = []

  with torch.no_grad():
    for (cat, num,labels_task1) in dataloader:
        cat, num,labels_task1 = cat.to(device_in_use), num.to(device_in_use),labels_task1.to(device_in_use)


        task_predictions = model(cat,num) #contains a list of the tensor outputs for each task

        loss = loss_function(task_predictions, labels_task1)
        total_loss += loss.item()

        #computing accuracy for first target
        y_pred_softmax_1 = torch.softmax(task_predictions[0], dim=1)
        _, y_pred_labels_1 = torch.max(y_pred_softmax_1, dim=1)
        total_correct_1 += (y_pred_labels_1 == labels_task1).sum().item()
        total_samples_1 += labels_task1.size(0)
        all_targets_1.extend(labels_task1.cpu().numpy())
        all_predictions_1.extend(y_pred_labels_1.cpu().numpy())

        # #computing accuaracy for second target
        # y_pred_softmax_2 = torch.softmax(task_predictions[1], dim=1)
        # _, y_pred_labels_2 = torch.max(y_pred_softmax_2, dim=1)
        # total_correct_2 += (y_pred_labels_2 == labels_task2).sum().item()
        # total_samples_2 += labels_task2.size(0)
        # all_targets_2.extend(labels_task2.cpu().numpy())
        # all_predictions_2.extend(y_pred_labels_2.cpu().numpy())

    avg = total_loss/len(dataloader)
    accuracy_1 = total_correct_1 / total_samples_1
    # accuracy_2 = total_correct_2 / total_samples_2
    # recall = recall_score(all_targets, all_predictions, average='weighted')
    f1_1 = f1_score(all_targets_1, all_predictions_1, average='weighted')
    # f1_2 = f1_score(all_targets_2, all_predictions_2, average="weighted")

    return avg, accuracy_1, all_predictions_1, all_targets_1, f1_1

In [14]:
x_categ = torch.randint(0, 5, (1, 5))
x_numer = torch.randn(1, 10)

x_categ

tensor([[4, 2, 4, 4, 2]])

In [64]:
model = FTTransformer(categories=(10,16,7,16,6,5,2,42),
                      num_continuous=len(num_columns), 
                      dim = 32,                           # dimension, paper set at 32
                      dim_out = 1,                        # binary prediction, but could be anything
                      depth = 6,                          # depth, paper recommended 6
                      heads = 8,                          # heads, paper recommends 8
                      attn_dropout = 0.1,                 # post-attention dropout
                      ff_dropout = 0.1 )

optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)
loss_function = UncertaintyLoss(1)
for epoch in range(5):
    train_loss, train_accuracy = train(train_dataloader, model, loss_function, optimizer, 'cpu')
        
    # Validation loop
    val_loss, val_accuracy, _, _, _ = test(val_dataloader, model, loss_function, 'cpu')

IndexError: index out of range in self