# Test repository (work in progress)

In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

### 1. Preprocessing

In [2]:
from load import load_data
# load_data() returns two objects, unpacked here into train_df and submission_df.
# NOTE(review): presumably pandas DataFrames (known ratings / pairs to predict) — confirm in load.py.
train_df, submission_df = load_data()

In [3]:
from preprocess import preprocess

# preprocess() takes the (train, submission) pair as a single tuple argument and
# returns twelve values; they are unpacked one per line to keep the order readable.
(
    A_tilde,
    standardized_train_ratings,
    train_users,
    train_items,
    means,
    stds,
    val_users,
    val_items,
    orig_val_ratings,
    standardized_val_ratings,
    submission_users,
    submission_items,
) = preprocess((train_df, submission_df))

### 2. Train

In [9]:
# --- Model and optimizer hyperparameters ---
# L, K, INIT_EMBS_STD, DROPOUT and NUM_HEADS are forwarded to the model constructor;
# LR and WEIGHT_DECAY are forwarded to the Adam optimizer.
L = 1
K = 30
INIT_EMBS_STD = 0.025
LR = 0.05
WEIGHT_DECAY = 1e-04
DROPOUT = 0.5
NUM_HEADS = 3

# --- Train loop hyperparameters ---
# Both are forwarded to train_model().
EPOCHS = 10
STOP_THRESHOLD = 1e-06

In [12]:
import models 
import importlib 
importlib.reload(models)
from models import ConcatNonLinear
from config import DEVICE

# Build the model from the hyperparameters above, then move it to the configured device.
model = ConcatNonLinear(A_tilde, K, L, INIT_EMBS_STD, DROPOUT, NUM_HEADS)
model = model.to(DEVICE)

# Adam over all model parameters, trained against a mean-squared-error objective.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
loss_fn = nn.MSELoss()

In [13]:
from train import train_model
# All arguments except hyper_verbose are passed positionally, so their order must
# match the signature of train_model in train.py. Three lists are returned.
train_rmse, val_rmse_std, val_rmse_orig = train_model(
    model,
    optimizer,
    loss_fn,
    train_users,
    train_items,
    standardized_train_ratings,
    val_users,
    val_items,
    orig_val_ratings,
    standardized_val_ratings,
    means,
    stds,
    EPOCHS,
    STOP_THRESHOLD,
    True,  # positional flag; its parameter name is not visible from this notebook
    hyper_verbose=True,
)

  0 / 10
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Wha1.shape=torch.Size([11000, 1])
Wha2.shape=torch.Size([11000, 1])
e.shape=torch.Size([11000, 11000])
Epoch 0 - Train loss: 1.0277 - Val loss standardized: 1.0319 - V

### 3. Check training results

In [None]:
# Training stats

print("Min training loss:", min(train_rmse))
print("Min validation loss (standardized):", min(val_rmse_std))
print("Min validation loss (original scale):", min(val_rmse_orig))
print("Min validation loss at epoch:", val_rmse_std.index(min(val_rmse_std)))

# Cap every curve at PLOT_CAP so that a few large values do not squash the rest
# of the plot. Only the plotted copies are capped; the original lists are untouched.
PLOT_CAP = 1
train_rmse_plot = [min(PLOT_CAP, x) for x in train_rmse]
val_rmse_std_plot = [min(PLOT_CAP, x) for x in val_rmse_std]
val_rmse_orig_plot = [min(PLOT_CAP, x) for x in val_rmse_orig]

# Plot train and val curves
plt.plot(train_rmse_plot, label='train')
plt.plot(val_rmse_std_plot, label='val std')
plt.plot(val_rmse_orig_plot, label='val orig')
plt.xlabel('epoch')
plt.ylabel('loss (capped at {})'.format(PLOT_CAP))

# Annotate the best original-scale validation value. The label shows the true
# (uncapped) minimum; the anchor point sits on the plotted (capped) curve so the
# text stays inside the axes even when every value exceeds the cap.
best_val_orig = min(val_rmse_orig)
best_val_epoch = val_rmse_orig.index(best_val_orig)
plt.annotate(round(best_val_orig, 4), (best_val_epoch, val_rmse_orig_plot[best_val_epoch]), textcoords="offset points", xytext=(0,-10), ha='center')
plt.legend()
plt.show()

### 4. Post-processing

In [None]:
# Restore the checkpoint stored at best_val_model.pth.
# NOTE(review): presumably written by train_model for the best validation epoch — confirm in train.py.
# map_location remaps the saved tensors onto the configured device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one.
best_state_dict = torch.load("../data/logs/best_val_model.pth", map_location=DEVICE)
model.load_state_dict(best_state_dict)
model.eval()

# Get predictions for submission.
# Inference only: no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
    final_ratings = model.get_ratings(submission_users, submission_items).cpu().detach().numpy()

In [None]:
# Summary statistics of the predictions as returned by the model
for stat_label, stat_value in (
    ("min:", final_ratings.min()),
    ("max:", final_ratings.max()),
    ("mean:", final_ratings.mean()),
):
    print(stat_label, stat_value)

# Histogram of all predicted values (100 bins)
plt.hist(final_ratings.ravel(), bins=100)
plt.show()

In [None]:
from config import N_u, N_v
# Reverse standardization, step 1: scatter the flat predictions into a dense
# (N_u, N_v) grid. Cells that are not part of the submission stay at 0.
dense_shape = (N_u, N_v)
final_ratings_matrix = np.zeros(dense_shape)
final_ratings_matrix[submission_users, submission_items] = final_ratings

# Reverse standardization (no mask needed)
def reverse_standardization(submission_matrix, means, stds):
    """Undo standardization element-wise: ``submission_matrix * stds + means``.

    All three inputs are expected to broadcast against each other; the caller
    in this notebook passes matrices described as (n_users, n_items).
    """
    rescaled = submission_matrix * stds
    return rescaled + means

# NOTE: both names below are overwritten in place, so re-running this cell feeds
# already de-standardized ratings through the reversal a second time.
final_ratings_matrix = reverse_standardization(
    submission_matrix=final_ratings_matrix,
    means=means,
    stds=stds,
)

# Pull the submission entries back out of the dense matrix as a flat array
final_ratings = final_ratings_matrix[submission_users, submission_items]

In [None]:
# Clip at 1 and 5

# Summary statistics of final_ratings before clipping
for stat_label, stat_value in (
    ("min:", final_ratings.min()),
    ("max:", final_ratings.max()),
    ("mean:", final_ratings.mean()),
):
    print(stat_label, stat_value.item())

# How many predictions fall outside the range [1, 5]
below_mask = final_ratings < 1
above_mask = final_ratings > 5
count_under_1 = below_mask.sum().item()
count_over_5 = above_mask.sum().item()
print("count_over_5:", count_over_5)
print("count_under_1:", count_under_1)

# Force every prediction into the range [1, 5]
final_ratings = np.clip(final_ratings, 1, 5)

In [None]:
# Summary statistics of final_ratings after clipping
clipped_summary = {
    "min:": final_ratings.min().item(),
    "max:": final_ratings.max().item(),
    "mean:": final_ratings.mean().item(),
}
for stat_label, stat_value in clipped_summary.items():
    print(stat_label, stat_value)

# Histogram of the clipped predictions (100 bins)
plt.hist(final_ratings.ravel(), bins=100)
plt.show()

### 5. Submission

In [None]:
# save submission
def to_submission_format(users, movies, predictions):
    """Build the submission DataFrame with columns 'Id' and 'Prediction'.

    Each Id has the form 'r<user + 1>_c<movie + 1>', i.e. the zero-based
    indices passed in are shifted to one-based ones. `predictions` is used
    as the 'Prediction' column unchanged.
    """
    row_ids = []
    for user, movie in zip(users, movies):
        row_ids.append('r{}_c{}'.format(user + 1, movie + 1))
    return pd.DataFrame(data={'Id': row_ids, 'Prediction': predictions})

# Assemble the submission table and write it to disk without the index column
submission_path = '../data/submission_data/submission.csv'
submission = to_submission_format(
    submission_users,
    submission_items,
    final_ratings,
)
submission.to_csv(submission_path, index=False)