In [1]:
import numpy as np 
import torch 
import pandas as pd 
from src.models.CSE.model import Model
from src.models.autoencoder import AutoEncoder

In [2]:
def get_model(args, model_name: str):
    """Instantiate the model selected by ``model_name``.

    Args:
        args: Mapping of constructor arguments. For 'Contrastive_Pretraining'
            only ``args['embedding_dim']`` is read; for 'AutoEncoder' the whole
            mapping is unpacked as keyword arguments.
        model_name: Either 'Contrastive_Pretraining' or 'AutoEncoder'.

    Returns:
        A newly constructed ``Model`` or ``AutoEncoder`` instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    if model_name == 'Contrastive_Pretraining':
        return Model(args['embedding_dim'])
    if model_name == 'AutoEncoder':
        return AutoEncoder(**args)
    # ValueError is a subclass of Exception, so broad ``except Exception``
    # handlers keep working; the offending name is included to ease debugging.
    raise ValueError(
        "Choose valid model in config.py (got {!r})".format(model_name)
    )

In [29]:
def inference_for_submission(model, data):
    """Run a gradient-free forward pass and return the result as a NumPy array.

    The model is switched to evaluation mode (and left in that mode) before
    ``data`` is passed through it. The output is moved to the CPU before the
    conversion to NumPy.
    """
    model.eval()
    with torch.no_grad():
        output = model(data)
    return output.cpu().numpy()

In [22]:
def read_data(train_path, impute_value=np.nan, number_of_users=1000, number_of_movies=10000):
    """Load a ratings CSV into a dense 2-D array.

    The CSV must provide an ``Id`` column whose entries look like
    ``r<r>_c<c>`` (both numbers 1-based) and a ``Prediction`` column holding
    the value for that cell.

    Args:
        train_path: Path of the CSV file to read.
        impute_value: Fill value for cells that have no entry in the file.
        number_of_users: Size of the axis addressed by the ``c`` part of the
            id; becomes the second dimension of the returned array.
        number_of_movies: Size of the axis addressed by the ``r`` part of the
            id; becomes the first dimension of the returned array.

    Returns:
        Array of shape ``(number_of_movies, number_of_users)`` where
        ``result[r - 1, c - 1]`` is the value stored for id ``r<r>_c<c>``.
    """
    data_pd = pd.read_csv(train_path)
    # Raw string: ``\d`` is a regex class, not a Python string escape.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps both index arrays 1-D even for a single-row file.
    r_idx, c_idx = ids[:, 0], ids[:, 1]
    # Create data matrix
    data = np.full((number_of_users, number_of_movies), impute_value)
    # One vectorised assignment instead of a Python loop over every rating.
    # Assumes each id occurs at most once; NumPy does not guarantee which
    # value is kept when an index is repeated.
    data[c_idx, r_idx] = data_pd.Prediction.values
    return data.T

In [23]:
# Keyword arguments forwarded to the AutoEncoder constructor by get_model.
autoencoder_args = {
    'lr': 1e-6,
    'batch_size': 32,
    'epochs': 500,
    'hidden_dims': [512, 256, 128],
    'file_path': 'data/raw/data_train.csv',
}
# NOTE(review): absolute, machine-specific path — the cell only runs where this
# checkpoint exists.
checkpoint_path = "/home/ubuntu/projects/CILProject22/reports/logs/20220715-191150_AutoEncoder/epoch=499-step=125000.ckpt"

# NOTE(review): load_from_checkpoint is called on a freshly built instance. If
# this is PyTorch Lightning's classmethod, the instance is only used to reach
# the class, and newer Lightning releases may reject instance calls — confirm.
model = get_model(autoencoder_args, 'AutoEncoder').load_from_checkpoint(checkpoint_path)

# Dense matrix built from the training CSV; cells without an entry are NaN
# (read_data's default impute_value).
data = torch.Tensor(read_data('data/raw/data_train.csv'))

In [30]:
# Replace the NaN placeholders with 0 before the forward pass.
# NOTE(review): presumably matches how missing entries were encoded when the
# checkpoint was trained — confirm.
data = data.nan_to_num(nan=0)
pred = inference_for_submission(model, data)
print(pred.shape)


(10000, 1000)


In [31]:
def clip_data(data, clip_high=5, clip_low=1):
    """Return a copy of ``data`` with values limited to [clip_low, clip_high].

    Args:
        data: Array of values to clip. It is not modified.
        clip_high: Values above this are replaced by ``clip_high``.
        clip_low: Values below this are replaced by ``clip_low``.

    Returns:
        A new array with the same shape as ``data``. NaN entries stay NaN.
    """
    # np.minimum / np.maximum allocate their result, so the caller's array
    # (e.g. the raw predictions) is left untouched. The upper bound is applied
    # first and the lower bound second.
    return np.maximum(np.minimum(data, clip_high), clip_low)

def write_submission(data, submission_file_path, save_path):
    """Write predictions for the ids of a sample submission to a CSV file.

    Args:
        data: 2-D array of predictions, read as ``data[r - 1, c - 1]`` for an
            id ``r<r>_c<c>``. Values are passed through ``clip_data`` before
            they are written.
        submission_file_path: CSV file whose ``Id`` column lists the cells to
            predict, formatted as ``r<r>_c<c>`` with 1-based numbers.
        save_path: Destination file; it is opened in write mode, so existing
            content is replaced.
    """
    # clip data first
    data = clip_data(data)
    # write submission
    data_pd = pd.read_csv(submission_file_path)
    # Raw string: ``\d`` is a regex class, not a Python string escape.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing keeps both index arrays 1-D even for a single-row file.
    test_users, test_movies = ids[:, 0], ids[:, 1]
    with open(save_path, 'w') as f:
        f.write('Id,Prediction\n')
        f.writelines(
            "r{}_c{},{}\n".format(user + 1, movie + 1, data[user, movie])
            for user, movie in zip(test_users, test_movies)
        )

In [36]:
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
write_submission(
    pred,
    submission_file_path='/home/ubuntu/projects/CILProject22/data/submission/sampleSubmission.csv',
    save_path='/home/ubuntu/projects/CILProject22/data/submission/submission_test.csv',
)