# 1. Import

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from transformers import BertModel, BertTokenizer

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

In [2]:
## Parameters

# Preprocessing Parameters
validation_size = 0.2
RAND_STATE = 5780
shuffle_split = True

# 2. Data Loading

In [3]:
import numpy as np

def load_npz(file_path):
    with np.load(file_path) as data:
        return {key: data[key] for key in data}

train_data = load_npz(r'.\data\train.npz')
test_data = load_npz(r'.\data\test.npz')
train_emb1, train_emb2, train_labels = train_data['emb1'], train_data['emb2'], train_data['preference']
test_emb1, test_emb2 = test_data['emb1'], test_data['emb2']

# 3. Exploration

In [4]:
train_data

{'uid': array([    0,     1,     2, ..., 18747, 18748, 18749], dtype=int64),
 'emb1': array([[-0.05075016, -0.03491386, -0.05787281, ...,  0.00020284,
          0.02388327, -0.02491781],
        [-0.12402835, -0.07631648, -0.05782915, ...,  0.02713838,
          0.01394665,  0.0186507 ],
        [-0.06794146, -0.0385992 ,  0.04476113, ...,  0.07999779,
          0.04943484,  0.00783883],
        ...,
        [ 0.02096516, -0.00752076, -0.06958353, ...,  0.01346127,
          0.01917063, -0.06059628],
        [-0.00901941,  0.01330765, -0.02343761, ..., -0.02690429,
          0.0084649 ,  0.01999134],
        [-0.05510234,  0.00251053, -0.01775946, ...,  0.00322949,
         -0.02700103,  0.01986161]], dtype=float32),
 'emb2': array([[-0.03255587,  0.01327268, -0.00508326, ..., -0.01196616,
         -0.03564733, -0.03713938],
        [-0.00014027,  0.03904634,  0.0592997 , ...,  0.00117963,
          0.04012304,  0.07394706],
        [-0.068197  , -0.0943828 ,  0.04236921, ...,  0.02259

In [7]:
for key, value in train_data.items():
    print(f"Length of '{key}': {len(value) if isinstance(value, np.ndarray) else 'Not an array'}")

Length of 'uid': 18750
Length of 'emb1': 18750
Length of 'emb2': 18750
Length of 'preference': 18750


In [6]:
train_data.keys()

dict_keys(['uid', 'emb1', 'emb2', 'preference'])

In [7]:
# x1
print(train_data['emb1'][0].shape) # (384,)
# x2
print(train_data['emb2'][0].shape) # (384,)
# y
print(train_data['preference'][0]) # 1
# train_data['emb1'][0]

(384,)
(384,)
1


# 4. Preprocessing

In [8]:
def train_validation_split(Xs, Ys, validation_size: float=0.2):
    Xs_tr, Xs_va, Ys_tr, Ys_va = train_test_split(Xs, Ys, test_size=validation_size, random_state=RAND_STATE, shuffle=shuffle_split, stratify=Ys)
    return torch.Tensor(Xs_tr), torch.Tensor(Xs_va), torch.Tensor(Ys_tr).long(), torch.Tensor(Ys_va).long()

In [9]:
print(train_data['emb1'].shape) # (n x d): (18750, 384)
print(train_data['emb2'].shape) # (n x d): (18750, 384)

# Concatenate the input in to a single long vector
Xs = np.concatenate((train_data['emb1'], train_data['emb2']), axis=1)
Ys = train_data['preference']

# Train Validation Split
Xs_tr, Xs_va, Ys_tr, Ys_va = train_validation_split(Xs, Ys, validation_size)

# Convert to Torch
print(f'Xs_tr.shape: {Xs_tr.shape}') # (15000, 768)
print(f'Ys_tr.shape: {Ys_tr.shape}') # (15000,)
print(f'Xs_va.shape: {Xs_va.shape}') # (3750, 768)
print(f'Ys_va.shape: {Ys_va.shape}') # (3750,)

(18750, 384)
(18750, 384)
Xs_tr.shape: torch.Size([15000, 768])
Ys_tr.shape: torch.Size([15000])
Xs_va.shape: torch.Size([3750, 768])
Ys_va.shape: torch.Size([3750])


In [10]:
Xs_tr

tensor([[-0.0525,  0.0358, -0.1479,  ..., -0.0207, -0.0516, -0.0119],
        [-0.0813, -0.0833,  0.0329,  ...,  0.0286,  0.0443,  0.0085],
        [-0.0362, -0.0262,  0.0381,  ...,  0.0300,  0.0761,  0.0385],
        ...,
        [ 0.0056, -0.0284, -0.0130,  ...,  0.0343,  0.0795,  0.0295],
        [-0.0359, -0.0749, -0.0522,  ..., -0.0366, -0.0189, -0.0320],
        [-0.0801, -0.0949,  0.0201,  ...,  0.0674, -0.0343,  0.0003]])

In [11]:
Ys_tr

tensor([1, 1, 1,  ..., 1, 0, 0])

# 5. Model

In [11]:
# Parameters
embedding_dim = 768
hidden_dim = 128
output_dim = 2
num_layers = 3

In [12]:
# FFNN Model
class FFNN(nn.Module):
    def __init__(self, embedding_dim: int, hidden_dim: int, output_dim: int, num_layers: int = 1) -> None:
        super().__init__()
        assert num_layers > 0

        self.num_layers = num_layers
        self.layers = nn.ModuleList()
        self.layers.append(nn.Linear(embedding_dim, hidden_dim))
        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        x = embeddings
        for layer in self.layers:
            x = F.relu(layer(x))
        output = self.output_layer(x)
        return output

In [13]:
class FFNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers=1):
        super().__init__()
        assert num_layers > 0

        self.layers = nn.ModuleList([nn.Linear(embedding_dim, hidden_dim)])
        self.batch_norms = nn.ModuleList([nn.BatchNorm1d(hidden_dim)])

        for _ in range(num_layers - 1):
            self.layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.batch_norms.append(nn.BatchNorm1d(hidden_dim))

        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, embeddings):
        x = embeddings
        for layer, batch_norm in zip(self.layers, self.batch_norms):
            x = F.relu(layer(x))
            x = batch_norm(x)
        output = self.output_layer(x)
        return output

In [14]:
# Test
ffnn = FFNN(embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim, num_layers=num_layers)
ffnn

FFNN(
  (layers): ModuleList(
    (0): Linear(in_features=768, out_features=128, bias=True)
    (1-2): 2 x Linear(in_features=128, out_features=128, bias=True)
  )
  (batch_norms): ModuleList(
    (0-2): 3 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (output_layer): Linear(in_features=128, out_features=2, bias=True)
)

In [15]:
class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.hidden_layers = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Forward pass through the RNN layer
        out, _ = self.hidden_layers(x)

        # Take the output from the last time step and pass it through the fully connected layer
        out = self.output_layer(out)
        return out

In [16]:
rnn = RNN(embedding_dim, hidden_dim, output_dim, num_layers)
rnn

RNN(
  (hidden_layers): RNN(768, 128, num_layers=3, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=2, bias=True)
)

In [17]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.hidden_layers = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Forward pass through the RNN layer
        out, _ = self.hidden_layers(x)

        # Take the output from the last time step and pass it through the fully connected layer
        out = self.output_layer(out)
        return out

In [18]:
lstm = LSTM(embedding_dim, hidden_dim, output_dim, num_layers)
lstm

LSTM(
  (hidden_layers): LSTM(768, 128, num_layers=3, batch_first=True)
  (output_layer): Linear(in_features=128, out_features=2, bias=True)
)

In [19]:
class BERT(nn.Module):
    def __init__(self, embedding_dim, output_dim=2):
        super(BERT, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.classifier = nn.Linear(embedding_dim, output_dim)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

In [20]:
class BERT(nn.Module):
    def __init__(self, embedding_dim, output_dim=2):
        super(BERT, self).__init__()
        self.reduction_layer = nn.Linear(embedding_dim, 512)
        self.classifier = nn.Linear(512, output_dim)

    # def forward(self, x):
    #     reduced_x = self.reduction_layer(x)
    #     # You can add activation functions and other layers here if needed
    #     output = self.classifier(reduced_x)
    #     return output
    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

In [21]:
from transformers import BertModel
import torch.nn as nn
import torch.nn.functional as F

class BERT(nn.Module):
    def __init__(self, output_dim=2, dropout_rate=0.1):
        super(BERT, self).__init__()
        # Initialize the BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        
        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_rate)

        # Classifier layer
        self.classifier = nn.Linear(self.bert.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask=None):
        # BERT model outputs
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        
        # Pooler output for classification
        pooled_output = outputs.pooler_output

        # Apply dropout
        dropped_out = self.dropout(pooled_output)

        # Pass through the classifier
        output = self.classifier(dropped_out)

        return output


In [22]:
# bert = BERT(embedding_dim, hidden_dim, output_dim, num_layers)
bert = BERT(output_dim)
bert

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

# 6. Model Training

In [52]:
# Parameters
epochs = 10
batch_size = 128
lr = 0.00001
rho1 = 0.99
rho2 = 0.999
grad_clip_max_norm = 1
sgd_optimizer = torch.optim.SGD(ffnn.parameters(), lr=lr)
adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=lr)
binary_cross_entropy_loss_fn = torch.nn.BCELoss()
cross_entropy_loss_fn = torch.nn.CrossEntropyLoss()

In [24]:
# evaluate a trained model on MNIST data
#
# dataloader    dataloader of examples to evaluate on
# model         trained PyTorch model
# loss_fn       loss function (e.g. torch.nn.CrossEntropyLoss)
#
# returns       tuple of (loss, accuracy), both python floats
@torch.no_grad()
def evaluate_model(Xs_va, Ys_va, model, loss_fn):
	model.eval()
	total_loss = 0.0
	total_correct = 0
	total_samples = 0

	# Create DataLoader for batching
	validation_dataset = TensorDataset(Xs_va, Ys_va)
	validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)	

	for X, Y in validation_loader:
		Y_pred_prob = model(X)
		loss = loss_fn(Y_pred_prob, Y)
		total_loss += loss.item()
	
		Y_pred = torch.argmax(Y_pred_prob, dim=1)
		total_correct += torch.sum(Y_pred == Y).item()
		total_samples += Y.size(0)
	
	average_loss = total_loss / len(validation_loader)
	accuracy = total_correct / total_samples
	
	return average_loss, accuracy

In [25]:
def train(Xs_tr, Ys_tr, Xs_va, Ys_va, model, loss_fn, optimizer, epochs, grad_clip_max_norm):
	# Create DataLoader for batching
	train_dataset = TensorDataset(Xs_tr, Ys_tr)
	train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
	
	for epoch in range(epochs):
		# Set to training mode
		model.train()
		
		for i, (X, Y) in enumerate(train_loader):
			total_loss = 0.0

			# Zero gradients for every batch
			optimizer.zero_grad()

			# Make predictions for this batch
			Y_pred_prob = model(X)

			# Compute the loss and its gradients
			loss = loss_fn(Y_pred_prob, Y)
			loss.backward()

			if grad_clip_max_norm is not None:
				nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_max_norm)

			# Adjust learning weights
			optimizer.step()

			# Gather data and report
			total_loss += loss.item()
		
		# Evaluate the model
		validation_loss, validation_accuracy = evaluate_model(Xs_va, Ys_va, model, loss_fn)
		print(f"Epoch {epoch+1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {round(validation_loss,3)}, Validation Accuracy: {round(validation_accuracy,3)}")


In [53]:
# FFNN
train(Xs_tr, Ys_tr, Xs_va, Ys_va, ffnn, cross_entropy_loss_fn, adam_optimizer, epochs, grad_clip_max_norm)

Epoch 1/10, Training Loss: 0.019268997013568878, Validation Loss: 0.664, Validation Accuracy: 0.871
Epoch 2/10, Training Loss: 0.006506010424345732, Validation Loss: 0.644, Validation Accuracy: 0.871
Epoch 3/10, Training Loss: 0.010621930472552776, Validation Loss: 0.669, Validation Accuracy: 0.873
Epoch 4/10, Training Loss: 0.0071776434779167175, Validation Loss: 0.663, Validation Accuracy: 0.875
Epoch 5/10, Training Loss: 0.0022742196451872587, Validation Loss: 0.681, Validation Accuracy: 0.87
Epoch 6/10, Training Loss: 0.000254071899689734, Validation Loss: 0.639, Validation Accuracy: 0.876
Epoch 7/10, Training Loss: 0.04095885530114174, Validation Loss: 0.664, Validation Accuracy: 0.874
Epoch 8/10, Training Loss: 0.004265537019819021, Validation Loss: 0.668, Validation Accuracy: 0.873
Epoch 9/10, Training Loss: 0.00638449564576149, Validation Loss: 0.648, Validation Accuracy: 0.876
Epoch 10/10, Training Loss: 0.01604512147605419, Validation Loss: 0.691, Validation Accuracy: 0.873


In [27]:
# RNN
train(Xs_tr, Ys_tr, Xs_va, Ys_va, rnn, cross_entropy_loss_fn, adam_optimizer, epochs, grad_clip_max_norm)

Epoch 1/10, Training Loss: 0.7021800875663757, Validation Loss: 0.694, Validation Accuracy: 0.501
Epoch 2/10, Training Loss: 0.6896276473999023, Validation Loss: 0.694, Validation Accuracy: 0.496
Epoch 3/10, Training Loss: 0.6949682235717773, Validation Loss: 0.694, Validation Accuracy: 0.495
Epoch 4/10, Training Loss: 0.6878487467765808, Validation Loss: 0.694, Validation Accuracy: 0.5
Epoch 5/10, Training Loss: 0.6873521208763123, Validation Loss: 0.694, Validation Accuracy: 0.494
Epoch 6/10, Training Loss: 0.6904792785644531, Validation Loss: 0.694, Validation Accuracy: 0.505
Epoch 7/10, Training Loss: 0.6857607364654541, Validation Loss: 0.694, Validation Accuracy: 0.498
Epoch 8/10, Training Loss: 0.6856743693351746, Validation Loss: 0.694, Validation Accuracy: 0.498
Epoch 9/10, Training Loss: 0.6999891400337219, Validation Loss: 0.694, Validation Accuracy: 0.495
Epoch 10/10, Training Loss: 0.6845292448997498, Validation Loss: 0.694, Validation Accuracy: 0.502


In [28]:
# LSTM
train(Xs_tr, Ys_tr, Xs_va, Ys_va, lstm, cross_entropy_loss_fn, adam_optimizer, epochs, grad_clip_max_norm)

Epoch 1/10, Training Loss: 0.6923296451568604, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 2/10, Training Loss: 0.6969357132911682, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 3/10, Training Loss: 0.685727059841156, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 4/10, Training Loss: 0.6915701031684875, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 5/10, Training Loss: 0.6898727416992188, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 6/10, Training Loss: 0.6936323642730713, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 7/10, Training Loss: 0.6918451189994812, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 8/10, Training Loss: 0.6927154660224915, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 9/10, Training Loss: 0.6960940361022949, Validation Loss: 0.693, Validation Accuracy: 0.5
Epoch 10/10, Training Loss: 0.6906998753547668, Validation Loss: 0.693, Validation Accuracy: 0.5


In [29]:
# # BERT
# train(Xs_tr, Ys_tr, Xs_va, Ys_va, bert, cross_entropy_loss_fn, adam_optimizer, epochs, grad_clip_max_norm)

In [12]:
# # time too long to run
# from sklearn.metrics import classification_report, accuracy_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.svm import SVC

# scaler = StandardScaler()

# # Fit only on training data
# Xs_tr_scaled = scaler.fit_transform(Xs_tr)
# Xs_va_scaled = scaler.transform(Xs_va)

# # Initialize the SVM classifier
# clf = SVC(kernel='linear')

# # Train the classifier
# clf.fit(Xs_tr_scaled, Ys_tr)

# # Predict on the validation set
# Ys_va_pred = clf.predict(Xs_va_scaled)

# # Evaluate the model
# print("Accuracy:", accuracy_score(Ys_va, Ys_va_pred))
# print(classification_report(Ys_va, Ys_va_pred))

# 7. Prediction & Submission

In [54]:
def make_prediction(Xs_te, model):
    Y_preds_prob = model(Xs_te)
    Y_preds = torch.argmax(Y_preds_prob, axis = 1)
    return Y_preds

In [55]:
def make_submission(uid, Y_preds):
    df = pd.DataFrame({'uid': uid, 'preference': Y_preds})
    df.to_csv('submission.csv', index = False)

In [56]:
Xs_te = np.concatenate((test_data['emb1'], test_data['emb2']), axis=1)
Xs_te = torch.Tensor(Xs_te)
Y_preds = make_prediction(Xs_te, ffnn)
make_submission(test_data['uid'], np.array(Y_preds))