In [1]:
import numpy as np 
import torch 
import pandas as pd 
from src.models.CSE.model import Model
from src.models.autoencoder import AutoEncoder

In [2]:
def get_model(args, model_name: str):
    """Instantiate the model selected by ``model_name``.

    Args:
        args: Mapping of model arguments. For ``'Contrastive_Pretraining'`` only
            ``args['embedding_dim']`` is read; for ``'AutoEncoder'`` the whole
            mapping is forwarded as keyword arguments.
        model_name: Either ``'Contrastive_Pretraining'`` or ``'AutoEncoder'``.

    Returns:
        A freshly constructed ``Model`` or ``AutoEncoder`` instance.

    Raises:
        Exception: If ``model_name`` is not one of the two supported names.
    """
    wants_contrastive = model_name == 'Contrastive_Pretraining'
    wants_autoencoder = model_name == 'AutoEncoder'

    # Reject unknown names up front so the constructors below are the only exits.
    if not (wants_contrastive or wants_autoencoder):
        raise Exception("Choose valid model in config.py")

    if wants_contrastive:
        return Model(args['embedding_dim'])
    return AutoEncoder(**args)

In [29]:
def inference_for_submission(model, data):
    """Run ``model`` on ``data`` in evaluation mode and return a NumPy array.

    The model is switched to eval mode (and left in that mode), the forward
    pass runs without gradient tracking, and the output is moved to the CPU
    before being converted.

    Args:
        model: A callable torch module.
        data: Input passed unchanged to ``model``.

    Returns:
        ``numpy.ndarray`` holding the model output.
    """
    model.eval()
    with torch.no_grad():
        return model(data).cpu().numpy()

In [22]:
def read_data(train_path, impute_value=np.nan, number_of_users=1000, number_of_movies=10000):
    """Load a ratings CSV into a dense 2-D array indexed ``[R - 1, C - 1]``.

    The CSV must have an ``Id`` column of the form ``r<R>_c<C>`` (1-based) and a
    ``Prediction`` column. Entries that do not appear in the CSV keep
    ``impute_value``.

    Naming note: inside this function the ``r`` index is called the movie and
    the ``c`` index the user. The matrix is built with shape
    ``(number_of_users, number_of_movies)`` and transposed before returning, so
    the result has shape ``(number_of_movies, number_of_users)``.

    Ids are assumed to be unique; NumPy does not guarantee which value is kept
    when the same cell is assigned more than once in a single indexed write.

    Args:
        train_path: Path to the CSV file.
        impute_value: Fill value for entries missing from the CSV.
        number_of_users: Upper bound (exclusive) for the 0-based ``c`` index.
        number_of_movies: Upper bound (exclusive) for the 0-based ``r`` index.

    Returns:
        ``numpy.ndarray`` of shape ``(number_of_movies, number_of_users)``.
    """
    data_pd = pd.read_csv(train_path)
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps both index arrays 1-D even for a single-row CSV
    # (np.squeeze would collapse them to 0-d and break iteration/indexing).
    movies, users = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    # Create data matrix and fill every known entry in one indexed assignment.
    data = np.full((number_of_users, number_of_movies), impute_value)
    data[users, movies] = predictions
    return data.T

In [23]:
# Build an AutoEncoder through get_model (the dict below is forwarded to the
# constructor as keyword arguments), then call load_from_checkpoint on the result.
# NOTE(review): load_from_checkpoint looks like PyTorch Lightning's classmethod; if so,
# the returned model is rebuilt from the checkpoint and the instance constructed from
# this dict is only a stepping stone — confirm against the AutoEncoder class.
# NOTE(review): the checkpoint path is absolute and specific to one machine.
model = get_model({
        'lr': 1e-6, 
        'batch_size': 32,
        'epochs': 500,
        'hidden_dims': [512, 256, 128],
        'file_path': 'data/raw/data_train.csv',
    }, 'AutoEncoder').load_from_checkpoint("/home/ubuntu/projects/CILProject22/reports/logs/20220715-191150_AutoEncoder/epoch=499-step=125000.ckpt")
# Training ratings as a float tensor; entries absent from the CSV are NaN
# (read_data's default impute_value).
data = torch.Tensor(read_data('data/raw/data_train.csv'))

In [30]:
# Replace the NaN placeholders with 0 before feeding the matrix to the model.
# Note: this rebinds `data`, so the NaN pattern of unknown entries can no longer
# be recovered from this variable afterwards.
data = torch.nan_to_num(data, nan=0)
# Forward pass over the whole matrix; `pred` is a NumPy array.
pred = inference_for_submission(model, data)
print(pred.shape)


(10000, 1000)


In [31]:
def clip_data(data, clip_high=5, clip_low=1):
    """Return a copy of ``data`` with values limited to ``[clip_low, clip_high]``.

    The input is left untouched: clipping happens on a copy, so a caller's
    prediction matrix is not silently overwritten. NaN entries are unaffected
    because both comparisons are False for NaN.

    Args:
        data: ``numpy.ndarray``, ``torch.Tensor`` or array-like of numbers.
        clip_high: Values above this are set to it.
        clip_low: Values below this are set to it.

    Returns:
        The clipped copy (a tensor for tensor input, otherwise an ndarray).
    """
    if isinstance(data, torch.Tensor):
        clipped = data.clone()
    else:
        clipped = np.array(data, copy=True)
    # Same order as before: upper bound first, then lower bound.
    clipped[clipped > clip_high] = clip_high
    clipped[clipped < clip_low] = clip_low
    return clipped

def write_submission(data, submission_file_path, save_path):
    """Write clipped predictions for the Ids listed in a sample submission file.

    Only the ``Id`` column (``r<R>_c<C>``, 1-based) of the sample file is used.
    For each Id the value ``data[R - 1, C - 1]`` is written, after ``data`` has
    been passed through ``clip_data`` with its default bounds.

    Args:
        data: 2-D prediction matrix indexed ``[R - 1, C - 1]``.
        submission_file_path: CSV file whose ``Id`` column lists the entries to predict.
        save_path: Destination CSV; it is overwritten and gets the header
            ``Id,Prediction``.
    """
    # clip data first
    data = clip_data(data)
    # write submission
    data_pd = pd.read_csv(submission_file_path)
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps the index arrays 1-D even when the file has a single row
    # (np.squeeze would yield 0-d arrays that cannot be iterated).
    test_users, test_movies = ids[:, 0], ids[:, 1]
    with open(save_path, 'w') as f:
        f.write('Id,Prediction\n')
        f.writelines(
            "r{}_c{},{}\n".format(user + 1, movie + 1, data[user, movie])
            for user, movie in zip(test_users, test_movies)
        )

In [36]:
# Write predictions for every Id in the sample submission to submission_test.csv.
# NOTE(review): both paths are absolute and specific to one machine.
write_submission(pred, '/home/ubuntu/projects/CILProject22/data/submission/sampleSubmission.csv', '/home/ubuntu/projects/CILProject22/data/submission/submission_test.csv')

In [81]:
import torch 
import numpy as np 
import math

In [340]:
def compute_bias(predictions, data, correction='l1'):
    """Add a constant per-user offset to the predictions.

    The offset is estimated from the residuals ``data - predictions`` over the
    known (non-NaN) entries of each row (user):

    * ``'l1'``: the mean signed residual. (This is the constant shift that
      minimises the squared error on the known entries.)
    * ``'mse'``: ``sign(mean residual) * sqrt(mean squared residual)``.

    Rows without any known entry get an offset of 0 instead of NaN, so their
    predictions are returned unchanged.

    Args:
        predictions: 2-D array-like of predictions, one row per user.
        data: 2-D array-like of ground truth with NaN marking unknown entries.
        correction: ``'l1'`` or ``'mse'``.

    Returns:
        ``torch.Tensor`` with the corrected predictions.

    Raises:
        ValueError: If ``correction`` is neither ``'l1'`` nor ``'mse'``.
    """
    # Validate before doing any work.
    if correction not in ('l1', 'mse'):
        raise ValueError("Unknown correction method")

    predictions = torch.Tensor(predictions)
    data = torch.Tensor(data)

    known = ~torch.isnan(data)
    diff = data - predictions
    # Zero the residual at unknown entries so it does not contribute to the sums.
    residual = torch.where(known, diff, torch.zeros_like(diff))
    # Clamp to 1 so rows without known entries divide 0 by 1 (offset 0) rather than 0 by 0.
    n_known = known.sum(dim=1).clamp(min=1).to(residual.dtype)

    mean_residual = residual.sum(dim=1) / n_known
    if correction == 'l1':
        user_bias = mean_residual
    else:
        rmse = torch.sqrt((residual ** 2).sum(dim=1) / n_known)
        user_bias = torch.sign(mean_residual) * rmse
    return predictions + user_bias.unsqueeze(1)

In [341]:
# Toy example: two users, three items; NaN marks unknown ratings.
# Note: this rebinds `data`, which earlier cells used for the ratings tensor.
preds = torch.Tensor([
    [2, 2, 2], 
    [2, 2, 2]
    ])
data = torch.Tensor([
    [1, np.nan, 99], 
    [np.nan, 4, 100]
    ])

# Print the MSE over the known entries before any bias correction.
# NOTE(review): mse_where_not_nan is not defined in any cell visible here, so this
# cell depends on kernel state from elsewhere and fails on a fresh kernel.
print(f"mse loss where not nan is {mse_where_not_nan(preds, data)}")

mse loss where not nan is 4754.5


In [6]:
import numpy as np 
import torch 
import pandas as pd 

In [7]:
def read_data(train_path, impute_value=np.nan, number_of_users=10000, number_of_movies=1000):
    """Load a ratings CSV into a dense ``(number_of_users, number_of_movies)`` array.

    The CSV must have an ``Id`` column of the form ``r<R>_c<C>`` (1-based) and a
    ``Prediction`` column. ``R - 1`` is used as the user (row) index and
    ``C - 1`` as the movie (column) index. Entries that do not appear in the
    CSV keep ``impute_value``.

    Ids are assumed to be unique; NumPy does not guarantee which value is kept
    when the same cell is assigned more than once in a single indexed write.

    Args:
        train_path: Path to the CSV file.
        impute_value: Fill value for entries missing from the CSV.
        number_of_users: Number of rows of the returned matrix.
        number_of_movies: Number of columns of the returned matrix.

    Returns:
        ``numpy.ndarray`` of shape ``(number_of_users, number_of_movies)``.
    """
    data_pd = pd.read_csv(train_path)
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps both index arrays 1-D even for a single-row CSV
    # (np.squeeze would collapse them to 0-d and break iteration/indexing).
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    # Create data matrix and fill every known entry in one indexed assignment.
    data = np.full((number_of_users, number_of_movies), impute_value)
    data[users, movies] = predictions
    return data

In [8]:
# Load the full training ratings matrix as a float tensor (NaN = no rating in the CSV);
# inference is run on the whole matrix.
data_matrix = torch.Tensor(read_data('../data/raw/data_train.csv'))

In [11]:
# Display the ratings matrix; NaN marks entries that have no rating in the CSV.
data_matrix

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., 5., 3., 3.],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, 3.]])

In [10]:
# Preview the zero-imputed matrix; the result is only displayed, `data_matrix` itself is unchanged.
torch.nan_to_num(data_matrix, nan=0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 5., 3., 3.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 3.]])

In [14]:
# Boolean mask of the known entries: True wherever `data_matrix` is not NaN.
~torch.isnan(data_matrix)

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False],
        ...,
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False,  True]])

In [10]:
import numpy as np 

# Load a comma-separated numeric matrix as floats (a validation mask, judging by the file name).
# NOTE(review): absolute, machine-specific path; this also rebinds `data`, which
# earlier cells used for the ratings tensor.
data = np.loadtxt('/home/ubuntu/projects/CILProject22/val_mask_0.csv', delimiter=',', dtype=float)

In [11]:
# Shape of the matrix loaded from the CSV.
data.shape

(10000, 1000)

In [2]:
from train_nvidia import get_dataloaders

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Call the train_nvidia helper for split number 1 with batch size 32.
# Presumably returns the train/validation loaders plus the full data matrix — verify
# against train_nvidia.get_dataloaders.
# Note: this rebinds `data_matrix`, which an earlier cell assigned from read_data.
train_loader, valid_loader, data_matrix = get_dataloaders('./data/raw', split_number=1, batch_size=32)


In [26]:
import torch 

def remove_user_bias(gt, pred):
    """Shift each user's predictions by that user's mean residual on known entries.

    An entry of ``gt`` counts as known when it is neither 0 nor NaN (0 is the
    placeholder for a missing rating). For every row (user) the mean of
    ``gt - pred`` over the known entries is added to the whole prediction row.
    Rows without any known entry get a bias of 0, so their predictions are
    returned unchanged instead of becoming NaN.

    Args:
        gt: 2-D array-like of ground-truth ratings, one row per user.
        pred: 2-D array-like of predictions with the same shape as ``gt``.

    Returns:
        Tuple ``(pred_corrected, user_bias)``: the corrected prediction tensor
        and a 1-D tensor holding the offset applied to each row.
    """
    gt = torch.Tensor(gt)
    pred = torch.Tensor(pred)

    known = (gt != 0) & ~torch.isnan(gt)
    diff = gt - pred
    # Zero the residual at unknown entries so it does not contribute to the row sums.
    residual = torch.where(known, diff, torch.zeros_like(diff))
    # Clamp to 1 so rows without known entries divide 0 by 1 (bias 0) rather than 0 by 0.
    n_known = known.sum(dim=1).clamp(min=1).to(residual.dtype)
    user_bias = residual.sum(dim=1) / n_known

    # Apply the per-user offset to every entry of that user's row.
    pred_corrected = pred + user_bias.unsqueeze(1)
    return pred_corrected, user_bias

In [27]:
# Sanity check on a 3x3 toy example: zeros in `gt` mark unknown entries, so each
# user has exactly two known ratings, both equal to 2.
# Expected: every corrected prediction is 2 and the per-user biases are [1, -1, -2].
# Note: this rebinds `pred`, which earlier cells used for the model's prediction matrix.
gt = torch.Tensor([
    [0, 2, 2],
    [2, 0, 2],
    [2, 2, 0]
    ])
pred = torch.Tensor([
    [1, 1, 1],
    [3, 3, 3],
    [4, 4, 4]
    ])
corrected, bias = remove_user_bias(gt, pred)
print(f"original matrix \n{pred}, \ncorrected matrix is \n{corrected}, \nuser bias \n{bias}")

original matrix 
tensor([[1., 1., 1.],
        [3., 3., 3.],
        [4., 4., 4.]]), 
corrected matrix is 
tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]]), 
user bias 
tensor([ 1., -1., -2.])
