In [15]:
import sys
import random
sys.path.append('.')
import dataset

import numpy as np
from tqdm.auto import tqdm, trange

import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed every RNG this notebook draws from. Seeding NumPy alone is not enough:
# minibatch rows are picked with `random.sample`, and the input noise, dropout
# and latent sampling all come from torch's generator.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [16]:
import os

# Only build the splits when the first split file is not on disk yet, so that
# re-running this cell does not redo the work.
first_split_present = os.path.exists("../../data/data-train.csv.0")
if not first_split_present:
    dataset.create_n_splits("../../data/data-train.csv")

HBox(children=(FloatProgress(value=0.0, max=1176952.0), HTML(value='')))




In [17]:
# Load the train / validation / test rating matrices from the project-local
# `dataset` module. Later cells call .nonzero(), .count_nonzero() and .todense()
# on them, so they are presumably scipy sparse matrices -- TODO confirm in dataset.py.
X_train, X_valid, X_test = dataset.load_datasets()

Caching dataset ../../data/data-train.csv.0 ...


HBox(children=(FloatProgress(value=0.0, max=231392.0), HTML(value='')))


Caching dataset ../../data/data-train.csv.1 ...


HBox(children=(FloatProgress(value=0.0, max=231392.0), HTML(value='')))


Caching dataset ../../data/data-train.csv.2 ...


HBox(children=(FloatProgress(value=0.0, max=231392.0), HTML(value='')))


Caching dataset ../../data/data-train.csv.3 ...


HBox(children=(FloatProgress(value=0.0, max=231391.0), HTML(value='')))


Caching dataset ../../data/sample-submission.csv ...


HBox(children=(FloatProgress(value=0.0, max=1176952.0), HTML(value='')))


Caching dataset ../../data/data-train.csv.4 ...


HBox(children=(FloatProgress(value=0.0, max=251385.0), HTML(value='')))




In [18]:
# To reproduce the final result, set train_on_full_dataset = True:
train_on_full_dataset = True

# Fold the validation ratings into the training matrix at most once per loaded
# X_train. Without the identity check, re-running this cell would add X_valid a
# second time and silently double those ratings. Reloading the datasets creates
# a new X_train object, so the merge is then applied again as expected.
if train_on_full_dataset and X_train is not globals().get("_X_train_merged"):
    X_train = X_train + X_valid
    _X_train_merged = X_train

In [19]:
# (row, col) positions of every stored rating in the *training* matrix.
# Despite the name, `valid_indices` here means "valid (observed) entries", not
# "validation entries"; the per-row cache used by the training loop is built from it.
# nonzero() is evaluated once and reused for both coordinates.
train_rows, train_cols = X_train.nonzero()
valid_indices = list(set(zip(train_rows, train_cols)))

In [20]:
class Autoencoder(nn.Module):
    """Variational autoencoder that reconstructs one row of ratings at a time.

    Encoder: two ReLU + dropout layers. A single linear head then emits the
    mean and the log-variance of a diagonal Gaussian over the latent code.
    Decoder: one ReLU + dropout layer followed by a linear output layer that
    produces one score per item.

    Args:
        item_dim: Width of the input and of the output (number of items).
        hidden_dim: Width of the hidden encoder/decoder layers.
        latent_dim: Size of the latent code.

    The defaults (1000, 100, 50) reproduce the original fixed architecture.
    """

    def __init__(self, item_dim=1000, hidden_dim=100, latent_dim=50):
        super().__init__()
        self.user_dim = 10000
        self.item_dim = item_dim
        self.latent_dim = latent_dim

        self.layer_1 = nn.Linear(item_dim, hidden_dim)
        self.layer_1a = nn.Linear(hidden_dim, hidden_dim)

        # One head for both halves: [means | log-variances]
        self.vae = nn.Linear(hidden_dim, 2 * latent_dim)

        self.layer_3 = nn.Linear(latent_dim, hidden_dim)
        self.cls_layer = nn.Linear(hidden_dim, item_dim)

    def forward(self, data):
        """Reconstruct `data` and compute the KL term of the latent posterior.

        Args:
            data: Float tensor whose last dimension has size `item_dim`.

        Returns:
            Tuple `(y_score, KL)`: `y_score` has the same shape as `data`;
            `KL` is a scalar, the KL divergence to a standard normal summed
            over latent dimensions and averaged over the leading dimensions.
        """
        net_data = data

        # Encoder layers
        net_data = F.relu(self.layer_1(net_data))
        net_data = F.dropout(net_data, training=self.training)

        net_data = F.relu(self.layer_1a(net_data))
        net_data = F.dropout(net_data, training=self.training)

        # VAE bottleneck: split the head output into means and log-variances
        vae = self.vae(net_data)
        mus_q, log_sigmas_q = torch.split(vae, self.latent_dim, dim=-1)
        stds_q = torch.exp(0.5 * log_sigmas_q)

        # Closed-form KL( N(mu, sigma^2) || N(0, 1) ) per latent dimension
        KL = 0.5 * (-log_sigmas_q + torch.exp(log_sigmas_q) + mus_q ** 2 - 1)
        KL = torch.sum(KL, dim=-1)
        KL = torch.mean(KL)

        # Sample random value if training,
        # use the mean during evaluation
        if not self.training:
            sampled_z = mus_q
        else:
            # randn_like draws on the tensor's own device and dtype, so the
            # module no longer depends on a global `device` variable.
            eps = torch.randn_like(stds_q)
            sampled_z = mus_q + eps * stds_q

        # Decoder layers
        net_data = F.relu(self.layer_3(sampled_z))
        net_data = F.dropout(net_data, training=self.training)

        # Classification layer
        y_score = self.cls_layer(net_data)
        return y_score, KL

In [21]:
# Model initialization
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Drop the model left over from a previous run of this cell before building a
# fresh one. Only the "no previous model" case is expected here, so catch just
# NameError instead of swallowing every exception.
try:
    del model
except NameError:
    pass

model = Autoencoder().to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-4)

In [22]:
# Sanity check: number of stored ratings in the training and validation matrices
print(X_train.count_nonzero(), X_valid.count_nonzero())

def get_predictions(A, batch_size=64):
    """Predict the full matrix using the autoencoder.

    The model input is always the global `X_train`. `A` is only used to
    collect the positions the caller wants to look at afterwards.

    Args:
        A: Matrix exposing `.nonzero()`; its nonzero positions are returned
            as `valid_indices`.
        batch_size: Number of rows sent through the model per forward pass.

    Returns:
        Tuple `(A_pred, valid_indices)`: `A_pred` is a (10000, 1000) float
        array of predicted scores, `valid_indices` a list of unique
        (row, col) pairs taken from `A.nonzero()`.
    """
    model.eval()
    n_rows, n_cols = 10000, 1000
    A_pred = np.zeros((n_rows, n_cols))
    rows, cols = A.nonzero()
    valid_indices = list(set(zip(rows, cols)))

    # Pure inference: skip autograd bookkeeping and copy each batch in one go
    with torch.no_grad():
        for start in range(0, n_rows, batch_size):
            X = X_train[start:start + batch_size].todense()
            X_ = torch.tensor(X, dtype=torch.float, device=device)
            y_preds, _ = model(X_)
            A_pred[start:start + y_preds.shape[0]] = y_preds.cpu().numpy()

    return A_pred, valid_indices

def compute_loss():
    """Compute the mean squared error between the ranking matrix and predictions.

    Only the nonzero positions of the global `X_valid` contribute.

    Returns:
        The mean squared error as a float; NaN when `X_valid` has no entries.
    """
    A_pred, valid_indices = get_predictions(X_valid)
    if not valid_indices:
        # Nothing to average over (the original np.mean([]) also gave NaN)
        return float("nan")

    # Gather all evaluated positions with one fancy-index lookup instead of a
    # Python-level loop; the element order of `valid_indices` is kept.
    rows, cols = np.asarray(valid_indices).T
    squared_error = np.asarray(np.square(X_valid - A_pred))
    return np.mean(squared_error[rows, cols])

# Baseline: loss of the freshly initialised model, before any training step
compute_loss()

1176952 251385


16.10537972623973

In [23]:
# Cache positions of nonempty indices for training

# Bucket every observed (row, col) pair under its row so the training loop can
# look up the rated columns of a sampled row directly.
train_indices_perline = [[] for _ in range(10000)]
for row, col in valid_indices:
    train_indices_perline[row].append((row, col))

# Every row is expected to hold at least one rating
assert all(train_indices_perline)
assert len(train_indices_perline) == 10000

In [24]:
ANNEAL_ALPHA = 0.0   # weight of the KL term in the objective
NOISE_SCALE = 0.5    # std-dev of the Gaussian noise added to the input
MAX_STEPS = 500000

# Most recent per-step training losses, newest first (at most 1000 kept)
loss_avg = []

for i in range(MAX_STEPS):
    model.train()

    # Draw a minibatch of 32 distinct rows of the training matrix
    random_lines = random.sample(range(0, 10000), k=32)
    X = torch.tensor(X_train[random_lines].todense(), dtype=torch.float, device=device)
    y_true = X

    opt.zero_grad()
    # Important: Add random noise to the input to prevent overfitting
    noisy_input = X + torch.randn(X.shape, device=device) * NOISE_SCALE
    y_preds, KL = model(noisy_input)

    # Mark, for every sampled row, the columns that actually hold a rating
    valid_mask = torch.zeros_like(y_preds, device=device)
    for batch_row, matrix_row in enumerate(random_lines):
        rated_cols = [col for _, col in train_indices_perline[matrix_row]]
        valid_mask[batch_row, rated_cols] = 1.0

    # Squared error restricted to the observed entries
    loss = ((y_preds - y_true) ** 2)[valid_mask != 0.]
    neg_ELBO = loss + ANNEAL_ALPHA * KL

    # update avg loss
    loss_avg = [loss.mean().item()] + loss_avg[:999]

    # Every 1000 steps: report the running training loss and, except at
    # step 0, the loss from compute_loss()
    if i % 1000 == 0:
        print("It %d Avg loss: %.4f" % (i, np.mean(loss_avg)))
        if i:
            mean_loss = compute_loss()
            print("Mean loss: %.4f" % mean_loss)

    neg_ELBO.mean().backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    opt.step()


It 0 Avg loss: 15.6815
It 1000 Avg loss: 4.0484
Mean loss: 2.1270
It 2000 Avg loss: 1.5287
Mean loss: 1.2590
It 3000 Avg loss: 1.2829
Mean loss: 1.0591
It 4000 Avg loss: 1.2354
Mean loss: 1.0506
It 5000 Avg loss: 1.2185
Mean loss: 1.0704
It 6000 Avg loss: 1.2034
Mean loss: 1.0316
It 7000 Avg loss: 1.1931
Mean loss: 1.0621
It 8000 Avg loss: 1.1882
Mean loss: 1.0460
It 9000 Avg loss: 1.1835
Mean loss: 1.0601
It 10000 Avg loss: 1.1718
Mean loss: 1.0352
It 11000 Avg loss: 1.1768
Mean loss: 1.0434
It 12000 Avg loss: 1.1701
Mean loss: 1.0462
It 13000 Avg loss: 1.1630
Mean loss: 1.0385
It 14000 Avg loss: 1.1625
Mean loss: 1.0576
It 15000 Avg loss: 1.1625
Mean loss: 1.0330
It 16000 Avg loss: 1.1559
Mean loss: 1.0418
It 17000 Avg loss: 1.1543
Mean loss: 1.0525
It 18000 Avg loss: 1.1557
Mean loss: 1.0499
It 19000 Avg loss: 1.1472
Mean loss: 1.0339
It 20000 Avg loss: 1.1449
Mean loss: 1.0194
It 21000 Avg loss: 1.1370
Mean loss: 1.0298
It 22000 Avg loss: 1.1397
Mean loss: 1.0309
It 23000 Avg loss:

It 185000 Avg loss: 0.9856
Mean loss: 0.9481
It 186000 Avg loss: 0.9869
Mean loss: 0.9513
It 187000 Avg loss: 0.9862
Mean loss: 0.9480
It 188000 Avg loss: 0.9888
Mean loss: 0.9476
It 189000 Avg loss: 0.9834
Mean loss: 0.9486
It 190000 Avg loss: 0.9837
Mean loss: 0.9464
It 191000 Avg loss: 0.9844
Mean loss: 0.9460
It 192000 Avg loss: 0.9811
Mean loss: 0.9466
It 193000 Avg loss: 0.9825
Mean loss: 0.9494
It 194000 Avg loss: 0.9831
Mean loss: 0.9476
It 195000 Avg loss: 0.9807
Mean loss: 0.9484
It 196000 Avg loss: 0.9835
Mean loss: 0.9501
It 197000 Avg loss: 0.9814
Mean loss: 0.9474
It 198000 Avg loss: 0.9807
Mean loss: 0.9459
It 199000 Avg loss: 0.9826
Mean loss: 0.9452
It 200000 Avg loss: 0.9830
Mean loss: 0.9457
It 201000 Avg loss: 0.9815
Mean loss: 0.9447
It 202000 Avg loss: 0.9821
Mean loss: 0.9468
It 203000 Avg loss: 0.9822
Mean loss: 0.9438
It 204000 Avg loss: 0.9809
Mean loss: 0.9470
It 205000 Avg loss: 0.9817
Mean loss: 0.9443
It 206000 Avg loss: 0.9794
Mean loss: 0.9431
It 207000 

Mean loss: 0.9262
It 368000 Avg loss: 0.9606
Mean loss: 0.9263
It 369000 Avg loss: 0.9603
Mean loss: 0.9266
It 370000 Avg loss: 0.9633
Mean loss: 0.9263
It 371000 Avg loss: 0.9631
Mean loss: 0.9249
It 372000 Avg loss: 0.9625
Mean loss: 0.9292
It 373000 Avg loss: 0.9608
Mean loss: 0.9275
It 374000 Avg loss: 0.9623
Mean loss: 0.9243
It 375000 Avg loss: 0.9594
Mean loss: 0.9262
It 376000 Avg loss: 0.9626
Mean loss: 0.9248
It 377000 Avg loss: 0.9615
Mean loss: 0.9251
It 378000 Avg loss: 0.9587
Mean loss: 0.9269
It 379000 Avg loss: 0.9606
Mean loss: 0.9257
It 380000 Avg loss: 0.9594
Mean loss: 0.9238
It 381000 Avg loss: 0.9610
Mean loss: 0.9264
It 382000 Avg loss: 0.9611
Mean loss: 0.9259
It 383000 Avg loss: 0.9618
Mean loss: 0.9253
It 384000 Avg loss: 0.9614
Mean loss: 0.9241
It 385000 Avg loss: 0.9612
Mean loss: 0.9251
It 386000 Avg loss: 0.9610
Mean loss: 0.9218
It 387000 Avg loss: 0.9601
Mean loss: 0.9233
It 388000 Avg loss: 0.9612
Mean loss: 0.9249
It 389000 Avg loss: 0.9621
Mean loss:

In [25]:
# Final loss on the entries of X_valid.
# NOTE: when train_on_full_dataset is True, X_valid has been merged into
# X_train, so this number is measured on data the model was trained on and is
# not a held-out score.
mean_loss = compute_loss()
print(mean_loss)

0.913636902448684


In [26]:
# The predicted matrix can also be used as initialization for other models ...
import pickle

# Predict before opening the file: mode "wb" truncates immediately, so a
# failure during prediction would otherwise wipe an existing pickle.
full_prediction = get_predictions(X_train)[0]
with open("array-for-svd.pkl", "wb") as wp:
    pickle.dump(full_prediction, wp)
del full_prediction  # do not keep the dense matrix alive in the kernel

In [None]:
# Submission

In [27]:
import pandas as pd

def export_and_save(target, preds, out_path="../../predictions/vae-1.csv"):
    """Write the predictions at the nonzero positions of `target` to a CSV file.

    Args:
        target: Matrix exposing `.nonzero()`; each nonzero position becomes
            one output row with the 1-based id `r<row>_c<col>`.
        preds: 2-D array of predicted scores, indexed like `target`.
        out_path: Destination CSV file. The default is the path that used to
            be hard-coded; its parent directory is created when missing.

    Scores are clipped to the range [1.0, 5.0]. NaN scores are passed through
    unchanged rather than being turned into 5.0. A preview of the resulting
    table is printed for a visual check.
    """
    import os  # local import: keeps the function usable on its own

    target_rows, target_cols = target.nonzero()
    ids = [f"r{row+1}_c{col+1}" for row, col in zip(target_rows, target_cols)]
    # Clip scores out of valid range
    scores = np.clip(np.asarray(preds)[target_rows, target_cols], 1.0, 5.0)
    df = pd.DataFrame({"Id": ids, "Prediction": scores})
    print("---Please check---")
    print(df.head())

    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    df.to_csv(out_path, index=False)

# Predict the full matrix and export only the positions that are nonzero in X_test
export_and_save(X_test, get_predictions(X_test)[0])

---Please check---
        Id  Prediction
0    r1_c4    3.542510
1    r1_c8    3.368879
2   r1_c21    2.769573
3  r1_c102    3.920032
4  r1_c127    3.026338


In [28]:
# Check whether we exported the array correctly ...
import pickle

# Read the matrix back, then show its shape and one sample entry
with open("array-for-svd.pkl", "rb") as fp:
    reloaded = pickle.load(fp)
print(reloaded.shape, reloaded[0,7])
del reloaded

(10000, 1000) 3.3688793182373047
