In [None]:
import random
import sys
from collections import deque

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm, trange

sys.path.append('../')
import src.dataset

In [None]:
# Load the three data splits from the project-local `src.dataset` module.
# NOTE(review): the returned objects are presumably SciPy sparse matrices of
# identical shape (later cells call count_nonzero/nonzero/todense on them and
# add X_train + X_valid) — confirm against src/dataset.py.
X_train, X_valid, X_test = src.dataset.load_datasets()

In [None]:
# For final submission: set this to True to train on train + validation data.
# NOTE(review): the merge rebinds X_train, so re-running this cell with the
# flag enabled adds X_valid a second time — re-run the loading cell first.
# With the flag enabled, compute_loss() below evaluates on entries that are
# part of the training matrix, so it is no longer a held-out estimate.
train_on_full_dataset = False
if train_on_full_dataset:
    X_train = X_train + X_valid

In [None]:
# (row, col) positions of the non-zero entries of the training matrix.
# Despite its name this list describes X_train; a later cell groups it by row
# to build the loss mask for training. nonzero() is evaluated only once.
valid_indices = list(set(zip(*X_train.nonzero())))

# Keep the summary as the last expression of the cell so that the notebook
# actually displays it (as the first statement its value was discarded).
X_train.count_nonzero(), X_valid.count_nonzero(), X_test.count_nonzero()

In [None]:
class Autoencoder(nn.Module):
    """Autoencoder with a variational (Gaussian) bottleneck.

    The network maps an input row of width ``item_dim`` through two hidden
    layers to the parameters of a diagonal Gaussian of size ``latent_dim``,
    draws a latent code from it and decodes that code back to ``item_dim``
    scores.

    Args:
        item_dim: Width of the input and of the reconstructed output.
        hidden_dim: Width of the hidden encoder/decoder layers.
        latent_dim: Size of the latent code. The bottleneck layer emits
            ``2 * latent_dim`` values (means followed by log-variances).
        user_dim: Stored as an attribute only; no layer depends on it.

    The defaults reproduce the original fixed architecture
    (1000 -> 100 -> 100 -> 2*50 -> 100 -> 1000).
    """

    def __init__(self, item_dim=1000, hidden_dim=100, latent_dim=50, user_dim=10000):
        super().__init__()
        self.user_dim = user_dim
        self.item_dim = item_dim
        self.hidden_dim = hidden_dim
        self.latent_dim = latent_dim

        # Encoder
        self.layer_1 = nn.Linear(item_dim, hidden_dim)
        self.layer_1a = nn.Linear(hidden_dim, hidden_dim)

        # Bottleneck: produces means and log-variances side by side
        self.vae = nn.Linear(hidden_dim, 2 * latent_dim)

        # Decoder
        self.layer_3 = nn.Linear(latent_dim, hidden_dim)
        self.cls_layer = nn.Linear(hidden_dim, item_dim)

    def forward(self, data):
        """Reconstruct ``data`` and compute the KL penalty of the bottleneck.

        Args:
            data: Float tensor whose last dimension has size ``item_dim``.

        Returns:
            Tuple ``(y_score, KL)``: ``y_score`` has the same shape as
            ``data``; ``KL`` is a scalar tensor, the KL divergence between the
            encoded Gaussian and a standard normal, summed over the latent
            dimensions and averaged over the remaining ones.
        """
        # Encoder layers
        net_data = F.relu(self.layer_1(data))
        net_data = F.dropout(net_data, training=self.training)

        net_data = F.relu(self.layer_1a(net_data))
        net_data = F.dropout(net_data, training=self.training)

        # VAE bottleneck: first half are the means, second half the
        # log-variances (the standard deviation is exp(0.5 * log-variance)).
        mus_q, log_sigmas_q = torch.split(self.vae(net_data), self.latent_dim, dim=-1)
        stds_q = torch.exp(0.5 * log_sigmas_q)

        KL = 0.5 * (-log_sigmas_q + torch.exp(log_sigmas_q) + mus_q ** 2 - 1)
        KL = KL.sum(dim=-1).mean()

        # Sample a latent code while training, use the mean during evaluation.
        if self.training:
            # randn_like takes device and dtype from the activations, so the
            # module does not rely on a global `device` variable.
            eps = torch.randn_like(stds_q)
            sampled_z = mus_q + eps * stds_q
        else:
            sampled_z = mus_q

        # Decoder layers
        net_data = F.relu(self.layer_3(sampled_z))
        net_data = F.dropout(net_data, training=self.training)

        # Classification layer
        y_score = self.cls_layer(net_data)
        return y_score, KL

In [None]:
# Model initialization
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# When this cell is re-run, drop the previous model before building a new one.
# Only the "no model yet" case is ignored; any other error should surface.
try:
    del model
except NameError:
    pass
model = Autoencoder().to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-4)

In [None]:
# Number of non-zero entries in the training and validation matrices.
print(X_train.count_nonzero(), X_valid.count_nonzero())

def get_predictions(A, batch_size=64):
    """Predict the full matrix using the autoencoder.

    The network input is always the global ``X_train``; ``A`` is only used to
    determine which positions are reported back in ``valid_indices``.

    Args:
        A: Matrix exposing ``nonzero()`` (e.g. ``X_valid`` or ``X_test``).
        batch_size: Number of rows of ``X_train`` fed to the model at once.

    Returns:
        Tuple ``(A_pred, valid_indices)``: ``A_pred`` is a dense NumPy array
        with the shape of ``X_train`` holding the model output for every row;
        ``valid_indices`` is the list of ``(row, col)`` positions at which
        ``A`` is non-zero.

    Side effects:
        Switches the global ``model`` to evaluation mode.
    """
    model.eval()
    n_rows, n_cols = X_train.shape
    A_pred = np.zeros((n_rows, n_cols))
    rows, cols = A.nonzero()
    valid_indices = list(set(zip(rows, cols)))

    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            batch = X_train[start:start + batch_size].todense()
            batch = torch.tensor(batch, dtype=torch.float, device=device)
            y_preds, _ = model(batch)
            # Copy the whole batch at once (the last batch may be shorter).
            A_pred[start:start + y_preds.shape[0]] = y_preds.cpu().numpy()

    return A_pred, valid_indices

def compute_loss():
    """Return the mean squared error of the model on the entries of X_valid.

    Predictions come from get_predictions(X_valid); only the positions that
    are non-zero in X_valid contribute to the average.
    """
    predicted, observed_positions = get_predictions(X_valid)
    squared_error = np.square(X_valid - predicted)
    per_entry = [squared_error[row, col] for (row, col) in observed_positions]
    return np.mean(per_entry)

# Baseline: validation error of the current model. When the cells are run in
# order, `model` has just been initialised and has not been trained yet.
compute_loss()

In [None]:
# Group the non-zero (row, col) positions of the training matrix by row, so
# the training loop can look up the entries of each sampled row directly.
train_indices_perline = [[] for _ in range(10000)]
for row, col in valid_indices:
    train_indices_perline[row].append((row, col))

# Sanity checks: one bucket per row, and no row without any entry.
assert len(train_indices_perline) == 10000
assert all(train_indices_perline)

In [None]:
ANNEAL_ALPHA = 0.0   # weight of the KL term in the objective
NOISE_SCALE = 0.5    # std-dev of the Gaussian noise added to the input
MAX_STEPS = 500000
BATCH_SIZE = 32      # rows sampled per optimisation step
LOG_EVERY = 1000     # steps between progress reports
LOSS_WINDOW = 1000   # number of recent steps in the running average

# Bounded window of recent training losses, newest first; entries beyond
# LOSS_WINDOW are discarded automatically.
loss_avg = deque(maxlen=LOSS_WINDOW)

for i in range(0, MAX_STEPS):
    # compute_loss() below leaves the model in eval mode, so switch back.
    model.train()
    # Randomly sample BATCH_SIZE distinct rows from the training matrix.
    random_lines = random.sample(range(len(train_indices_perline)), k=BATCH_SIZE)
    X = X_train[random_lines].todense()
    X = torch.tensor(X, dtype=torch.float, device=device)
    y_true = X

    opt.zero_grad()
    # Important: Add random noise to the input to prevent overfitting
    y_preds, KL = model(X + torch.randn_like(X) * NOISE_SCALE)

    # Only positions that are non-zero in the training matrix enter the loss.
    valid_mask = torch.zeros_like(y_preds, dtype=torch.bool)
    for batch_row, i_real in enumerate(random_lines):
        indices = [j for _, j in train_indices_perline[i_real]]
        valid_mask[batch_row, indices] = True

    loss = ((y_preds - y_true) ** 2)[valid_mask]
    # KL is a scalar and broadcasts over the per-entry errors, so the mean
    # taken below equals mean(loss) + ANNEAL_ALPHA * KL.
    neg_ELBO = loss + ANNEAL_ALPHA * KL

    # update avg loss
    loss_avg.appendleft(loss.mean().item())

    if i % LOG_EVERY == 0:
        print("It %d Avg loss: %.4f" % (i, np.mean(loss_avg)))
        if i:
            mean_loss = compute_loss()
            print("Mean loss: %.4f" % mean_loss)

    neg_ELBO.mean().backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    opt.step()


In [None]:
# Validation error of the model in its current state (after the training
# loop when the cells are run in order).
mean_loss = compute_loss()
print(mean_loss)

In [None]:
# The predicted matrix can also be used as initialization for other models:
# pickle the dense prediction array to disk. get_predictions returns a
# (predictions, indices) tuple, of which only the predictions are stored.
import pickle
with open("array-for-svd.pkl", "wb") as wp:
    pickle.dump(get_predictions(X_train)[0], wp)

In [None]:
# Submission

In [None]:
import pandas as pd

def export_and_save(target, preds, path="preds.csv"):
    """Write the predictions at the non-zero positions of ``target`` to a CSV file.

    Args:
        target: Matrix exposing ``nonzero()``; its non-zero positions select
            the entries to export.
        preds: Dense 2-D array of predictions indexed by ``[row, col]``.
        path: Destination of the CSV file. Defaults to ``"preds.csv"``.

    The file has the columns ``Id`` (``r<row>_c<col>``, 1-based) and
    ``Prediction`` (the prediction limited to the range [1.0, 5.0]). The
    first rows are printed so they can be checked by eye.
    """
    target_rows, target_cols = target.nonzero()
    ids = [f"r{row+1}_c{col+1}" for row, col in zip(target_rows, target_cols)]
    # Gather all requested entries with a single fancy-indexing operation.
    scores = np.asarray(preds)[target_rows, target_cols]
    # Clip scores out of valid range. np.where is used instead of np.clip so
    # that a NaN prediction is still mapped to 5.0, as the element-wise
    # comparisons did before.
    scores = np.where(scores <= 5.0, scores, 5.0)
    scores = np.where(scores >= 1.0, scores, 1.0)
    df = pd.DataFrame({"Id": ids, "Prediction": scores})
    print("---Please check---")
    print(df.head())
    df.to_csv(path, index=False)

# Export the model's predictions for every non-zero position of X_test.
# get_predictions returns (predictions, indices); only the predictions are used.
export_and_save(X_test, get_predictions(X_test)[0])

In [None]:
# Read the pickled prediction array back and print its shape and one entry
# as a quick check that the export worked.
import pickle

with open("array-for-svd.pkl", "rb") as fp:
    arr = pickle.load(fp)
print(arr.shape, arr[0, 7])
del arr  # the array was only needed for this check