# 1. Import

In [711]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from typing import Optional
import itertools

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch.utils.data import DataLoader, TensorDataset

# 2. Data Loading

In [712]:
import numpy as np

def load_npz(file_path):
    with np.load(file_path) as data:
        return {key: data[key] for key in data}

train_data = load_npz(r'.\data\train.npz')
test_data = load_npz(r'.\data\test.npz')
train_emb1, train_emb2, train_labels = train_data['emb1'], train_data['emb2'], train_data['preference']
test_emb1, test_emb2 = test_data['emb1'], test_data['emb2']

# 3. Exploration

In [713]:
train_data

{'uid': array([    0,     1,     2, ..., 18747, 18748, 18749], dtype=int64),
 'emb1': array([[-0.05075016, -0.03491386, -0.05787281, ...,  0.00020284,
          0.02388327, -0.02491781],
        [-0.12402835, -0.07631648, -0.05782915, ...,  0.02713838,
          0.01394665,  0.0186507 ],
        [-0.06794146, -0.0385992 ,  0.04476113, ...,  0.07999779,
          0.04943484,  0.00783883],
        ...,
        [ 0.02096516, -0.00752076, -0.06958353, ...,  0.01346127,
          0.01917063, -0.06059628],
        [-0.00901941,  0.01330765, -0.02343761, ..., -0.02690429,
          0.0084649 ,  0.01999134],
        [-0.05510234,  0.00251053, -0.01775946, ...,  0.00322949,
         -0.02700103,  0.01986161]], dtype=float32),
 'emb2': array([[-0.03255587,  0.01327268, -0.00508326, ..., -0.01196616,
         -0.03564733, -0.03713938],
        [-0.00014027,  0.03904634,  0.0592997 , ...,  0.00117963,
          0.04012304,  0.07394706],
        [-0.068197  , -0.0943828 ,  0.04236921, ...,  0.02259

In [714]:
train_data.keys()

dict_keys(['uid', 'emb1', 'emb2', 'preference'])

In [715]:
# x1
print(train_data['emb1'][0].shape) # (384,)
# x2
print(train_data['emb2'][0].shape) # (384,)
# y
print(train_data['preference'][0]) # 1
# train_data['emb1'][0]

(384,)
(384,)
1


# 4. Preprocessing

In [716]:
## Parameters

# Preprocessing Parameters
validation_size = 0.2
RAND_STATE = 5780
shuffle_split = True
standardized = False

In [717]:
def train_validation_split(Xs, Ys, validation_size: float=0.2):
    Xs_tr, Xs_va, Ys_tr, Ys_va = train_test_split(Xs, Ys, test_size=validation_size, random_state=RAND_STATE, shuffle=shuffle_split, stratify=Ys)
    return torch.Tensor(Xs_tr), torch.Tensor(Xs_va), torch.Tensor(Ys_tr).long(), torch.Tensor(Ys_va).long()

In [718]:
def standardization(Xs):
    scaler = StandardScaler()
    Xs_scaled = scaler.fit_transform(Xs)
    return torch.Tensor(Xs_scaled)

In [719]:
print(train_data['emb1'].shape) # (n x d): (18750, 384)
print(train_data['emb2'].shape) # (n x d): (18750, 384)

# Concatenate the input in to a single long vector
Xs = np.concatenate((train_data['emb1'], train_data['emb2']), axis=1)
Ys = train_data['preference']

# Train Validation Split
Xs_tr, Xs_va, Ys_tr, Ys_va = train_validation_split(Xs, Ys, validation_size)

if standardized:
    Xs_tr = standardization(Xs_tr)
    Xs_va = standardization(Xs_va)

# Convert to Torch
print(f'Xs_tr.shape: {Xs_tr.shape}') 
print(f'Ys_tr.shape: {Ys_tr.shape}')
print(f'Xs_va.shape: {Xs_va.shape}')
print(f'Ys_va.shape: {Ys_va.shape}')

(18750, 384)
(18750, 384)
Xs_tr.shape: torch.Size([15000, 768])
Ys_tr.shape: torch.Size([15000])
Xs_va.shape: torch.Size([3750, 768])
Ys_va.shape: torch.Size([3750])


# 5. Model

In [720]:
# Parameters
embedding_dim = 768
hidden_dim = 128
output_dim = 2
num_layers = 1
activation = "relu"

# Improvement
dropout_rate = 0.5
include_batch_norm = True
initialize_weights = False

In [721]:
# FFNN Model
class FFNN(nn.Module):
    def __init__(
        self, 
        embedding_dim: int, 
        hidden_dim: int,
        output_dim: int,
        activation: str = "relu",
        num_layers: int = 1,
        include_batch_norm: bool = False,
        initialize_weights: bool = False,
        dropout_rate: Optional[float] = None
    ) -> None:
        
        super().__init__()
        assert num_layers > 0

        # FFNN architecture attributes
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.activation = activation
        self.num_layers = num_layers

        # Layer attributes
        self.input_layer = nn.Linear(self.embedding_dim, self.hidden_dim)
        self.hidden_layers = nn.ModuleList()
        for _ in range(self.num_layers - 1):
            self.hidden_layers.append(nn.Linear(self.hidden_dim, self.hidden_dim))
        self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)

        # Weight initialization attributes
        self.initialize_weights = initialize_weights
        if initialize_weights:
            init.xavier_normal_(self.input_layer.weight)
            for hidden_layer in self.hidden_layers:
                init.xavier_normal_(hidden_layer.weight)
            init.xavier_normal_(self.output_layer.weight)

        # FFNN performance improvement attributes
        self.dropout_rate = dropout_rate
        if dropout_rate is not None:
            self.dropout = nn.Dropout(p=self.dropout_rate)
        else:
            self.dropout = None
        self.include_batch_norm = include_batch_norm
        if include_batch_norm:
            self.batch_norm = nn.BatchNorm1d(self.hidden_dim)

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        x = self.input_layer(embeddings)
        for hidden_layer in self.hidden_layers:
            # Forward layer
            x = hidden_layer(x)

            # Batch normalization layer
            if self.include_batch_norm:
                x = self.batch_norm(x)

            # Non-linear layer
            if self.activation == "relu":
                x = F.relu(x)
            elif self.activation == "tanh":
                x = F.tanh(x)
            elif self.activation == "sigmoid":
                x = F.sigmoid(x)

            # Drop out regularization layer
            if self.dropout_rate is not None:
                x = self.dropout(x)
        output = self.output_layer(x)
        return output

In [722]:
# Test
ffnn = FFNN(
    embedding_dim=embedding_dim, 
    hidden_dim=hidden_dim,
    output_dim=output_dim,
    activation=activation,
    num_layers=num_layers, 
    include_batch_norm=include_batch_norm,
    initialize_weights=initialize_weights,
)
ffnn

FFNN(
  (input_layer): Linear(in_features=768, out_features=128, bias=True)
  (hidden_layers): ModuleList()
  (output_layer): Linear(in_features=128, out_features=2, bias=True)
  (batch_norm): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

# 6. Model Training

In [723]:
# Parameters
epochs = 10
batch_size = 100
alpha = 0.1
beta = 0.9
rho1 = 0.99
rho2 = 0.999
# grad_clip_max_norm = 1

# Optimizers
sgd_optimizer = torch.optim.SGD(ffnn.parameters(), lr=alpha)
adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)
adamw_optimizer = torch.optim.AdamW(ffnn.parameters(), lr=alpha)
rmsprop_optimizer = torch.optim.RMSprop(ffnn.parameters(), lr=alpha)

# Loss functions
binary_cross_entropy_loss_fn = torch.nn.BCELoss()
cross_entropy_loss_fn = torch.nn.CrossEntropyLoss()

In [724]:
# evaluate a trained model on MNIST data
#
# dataloader    dataloader of examples to evaluate on
# model         trained PyTorch model
# loss_fn       loss function (e.g. torch.nn.CrossEntropyLoss)
#
# returns       tuple of (loss, accuracy), both python floats
@torch.no_grad()
def evaluate_model(Xs_va, Ys_va, model, loss_fn):
	model.eval()
	total_loss = 0.0
	total_correct = 0
	total_samples = 0

	# Create DataLoader for batching
	validation_dataset = TensorDataset(Xs_va, Ys_va)
	validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True)	

	for X, Y in validation_loader:
		Y_pred_prob = model(X)
		loss = loss_fn(Y_pred_prob, Y)
		total_loss += loss.item()
	
		Y_pred = torch.argmax(Y_pred_prob, dim=1)
		total_correct += torch.sum(Y_pred == Y).item()
		total_samples += Y.size(0)
	
	average_loss = total_loss / len(validation_loader)
	accuracy = total_correct / total_samples
	
	return average_loss, accuracy

In [725]:
def train(Xs_tr, Ys_tr, Xs_va, Ys_va, model, loss_fn, optimizer, epochs, batch_size, grad_clip_max_norm: Optional[float] = None):
	validation_losses = []
	validation_accuracies = []

	# Create DataLoader for batching
	train_dataset = TensorDataset(Xs_tr, Ys_tr)
	train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
	
	for epoch in range(epochs):
		# Set to training mode
		model.train()
		
		for i, (X, Y) in enumerate(train_loader):
			total_loss = 0.0

			# Zero gradients for every batch
			optimizer.zero_grad()

			# Make predictions for this batch
			Y_pred_prob = model(X)

			# Compute the loss and its gradients
			loss = loss_fn(Y_pred_prob, Y)
			loss.backward()

			if grad_clip_max_norm is not None:
				nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_max_norm)

			# Adjust learning weights
			optimizer.step()

			# Gather data and report
			total_loss += loss.item()
		
		# Evaluate the model
		validation_loss, validation_accuracy = evaluate_model(Xs_va, Ys_va, model, loss_fn)
		validation_losses.append(validation_loss)
		validation_accuracies.append(validation_accuracy)
		# print(f"Epoch {epoch+1}/{epochs}, Training Loss: {loss.item()}, Validation Loss: {round(validation_loss,3)}, Validation Accuracy: {round(validation_accuracy,3)}")
	
	best_validation_loss = min(validation_losses)
	best_validation_accuracy = max(validation_accuracies)
	# print(f"Minimum Loss: {min_validation_loss}, Max Accuracy: {max_validation_accuracy}")
	return best_validation_loss, best_validation_accuracy


In [727]:
# FFNN
train(
    Xs_tr, 
    Ys_tr, 
    Xs_va, 
    Ys_va, 
    ffnn, 
    cross_entropy_loss_fn, 
    adam_optimizer, 
    epochs,
    batch_size,
    grad_clip_max_norm = None
)

(0.2736740257394941, 0.8877333333333334)

# 7. Hyperparameter Tuning

In [None]:
# Architecture Parameters
embedding_dim = 768
hidden_dim = 128
output_dim = 2
num_layers = 1
activation = "relu"

# Architecture Improvement Parameters
dropout_rate = 0.5
include_batch_norm = True
initialize_weights = False

# Training Parameters
epochs = 10
batch_size = 100
alpha = 0.1
beta = 0.9
rho1 = 0.99
rho2 = 0.999
grad_clip_max_norm = 1

# Optimizers
sgd_optimizer = torch.optim.SGD(ffnn.parameters(), lr=alpha)
adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)
adamw_optimizer = torch.optim.AdamW(ffnn.parameters(), lr=alpha)
rmsprop_optimizer = torch.optim.RMSprop(ffnn.parameters(), lr=alpha)

# Loss functions
cross_entropy_loss_fn = torch.nn.CrossEntropyLoss()

In [698]:
# FFNN
param_grid = {
    'hidden_dims': [32, 64, 128, 256, 512],
    'activations': ["relu", "tanh", "sigmoid"],
    'num_layers': [1, 2, 3, 4, 5],
    'include_batch_norm': [True, False],
    'initialize_weights': [True, False],
    'dropout_rates': [None, 0.1, 0.2, 0.3, 0.4, 0.5],
    'batch_sizes': [64, 100, 128, 256, 512],
    'grad_clip_max_norms': [None, 1, 2, 3, 4, 5, 6]
}

grid_search_combinations = list(itertools.product(*param_grid.values()))

results = []
for (
    hidden_dim, 
    activation, 
    num_layer, 
    include_batch_norm, 
    initialize_weights, 
    dropout_rate, 
    batch_size, 
    grad_clip_max_norm
    ) in grid_search_combinations:
    
    # FFNN Architecture
    ffnn = FFNN(
        embedding_dim=embedding_dim, 
        hidden_dim=hidden_dim,
        output_dim=output_dim, 
        num_layers=num_layers, 
        include_batch_norm=include_batch_norm,
        initialize_weights=initialize_weights,
        dropout_rate=dropout_rate
    )

    # Optimizer
    adam_optimizer = torch.optim.Adam(ffnn.parameters(), lr=alpha)  

    # Training
    best_validation_loss, best_validation_accuracy = train(
        Xs_tr, 
        Ys_tr, 
        Xs_va, 
        Ys_va, 
        ffnn, 
        cross_entropy_loss_fn, 
        adam_optimizer, 
        epochs,
        batch_size,
        grad_clip_max_norm
    )

    # Result
    result = dict(
        zip(
            param_grid.keys(), 
            (hidden_dim, activation, num_layer, include_batch_norm, initialize_weights, dropout_rate)
        )
    )
    result["best_validation_loss"] = best_validation_loss
    result["best_validation_accuracy"] = best_validation_accuracy
    results.append(result)

100%|██████████| 10/10 [00:03<00:00,  2.87it/s]
100%|██████████| 10/10 [00:03<00:00,  2.89it/s]
100%|██████████| 10/10 [00:03<00:00,  2.79it/s]
100%|██████████| 10/10 [00:03<00:00,  2.74it/s]
100%|██████████| 10/10 [00:03<00:00,  2.72it/s]
100%|██████████| 10/10 [00:03<00:00,  2.91it/s]
100%|██████████| 10/10 [00:03<00:00,  2.84it/s]
100%|██████████| 10/10 [00:03<00:00,  2.82it/s]
100%|██████████| 10/10 [00:03<00:00,  2.76it/s]
100%|██████████| 10/10 [00:03<00:00,  2.94it/s]
100%|██████████| 10/10 [00:03<00:00,  2.92it/s]
100%|██████████| 10/10 [00:03<00:00,  2.93it/s]
100%|██████████| 10/10 [00:03<00:00,  2.91it/s]
100%|██████████| 10/10 [00:03<00:00,  2.89it/s]
100%|██████████| 10/10 [00:03<00:00,  2.94it/s]
100%|██████████| 10/10 [00:03<00:00,  2.92it/s]
100%|██████████| 10/10 [00:03<00:00,  2.66it/s]
100%|██████████| 10/10 [00:03<00:00,  2.78it/s]
100%|██████████| 10/10 [00:03<00:00,  2.82it/s]
100%|██████████| 10/10 [00:03<00:00,  2.88it/s]
100%|██████████| 10/10 [00:03<00:00,  2.

KeyboardInterrupt: 

In [699]:
result_df = pd.DataFrame(results)
result_df

Unnamed: 0,hidden_dims,activations,num_layers,include_batch_norm,initialize_weights,dropout_rates,best_validation_loss,best_validation_accuracy
0,32,relu,1,True,True,0.1,0.691413,0.524267
1,32,relu,1,True,True,0.2,0.694580,0.513333
2,32,relu,1,True,True,0.3,0.704639,0.457067
3,32,relu,1,True,True,0.4,0.692862,0.512800
4,32,relu,1,True,True,0.5,0.697453,0.500800
...,...,...,...,...,...,...,...,...
2462,1024,sigmoid,1,False,True,0.3,0.692566,0.511200
2463,1024,sigmoid,1,False,True,0.4,0.694075,0.499467
2464,1024,sigmoid,1,False,True,0.5,0.697679,0.483200
2465,1024,sigmoid,1,False,True,0.6,0.696693,0.486933


In [700]:
result_df.loc[result_df.best_validation_accuracy == result_df.best_validation_accuracy.max(), :]

Unnamed: 0,hidden_dims,activations,num_layers,include_batch_norm,initialize_weights,dropout_rates,best_validation_loss,best_validation_accuracy
1576,256,tanh,6,False,True,0.5,0.680581,0.6096


In [701]:
result_df.best_validation_accuracy.max()

0.6096

In [710]:
result_df.to_csv("architecture_result.csv")

# 8. Submission

In [368]:
def make_prediction(Xs_te, model):
    Y_preds_prob = model(Xs_te)
    Y_preds = torch.argmax(Y_preds_prob, axis = 1)
    return Y_preds

In [369]:
def make_submission(uid, Y_preds):
    df = pd.DataFrame({'uid': uid, 'preference': Y_preds})
    df.to_csv('submission.csv', index = False)

In [370]:
Xs_te = np.concatenate((test_data['emb1'], test_data['emb2']), axis=1)
Xs_te = torch.Tensor(Xs_te)
Y_preds = make_prediction(Xs_te, ffnn)
make_submission(test_data['uid'], np.array(Y_preds))

# 8. Appendix

In [502]:
class RNN(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.hidden_layers = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Forward pass through the RNN layer
        out, _ = self.hidden_layers(x)

        # Take the output from the last time step and pass it through the fully connected layer
        out = self.output_layer(out)
        return out

rnn = RNN(embedding_dim, hidden_dim, output_dim, num_layers)
rnn

In [None]:
# RNN
train(Xs_tr, Ys_tr, Xs_va, Ys_va, rnn, cross_entropy_loss_fn, adam_optimizer, epochs, grad_clip_max_norm)

In [None]:
class LSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.hidden_layers = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Forward pass through the RNN layer
        out, _ = self.hidden_layers(x)

        # Take the output from the last time step and pass it through the fully connected layer
        out = self.output_layer(out)
        return out

lstm = LSTM(embedding_dim, hidden_dim, output_dim, num_layers)
lstm

In [None]:
# LSTM
train(Xs_tr, Ys_tr, Xs_va, Ys_va, lstm, cross_entropy_loss_fn, adam_optimizer, epochs, grad_clip_max_norm)