In [None]:
from helpers import load_data, split_data
import numpy as np
import surprise as spr
from surprise import Dataset
from surprise import Reader
import os
import pandas as pd
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
import time
import re

In [None]:
# Load the raw training ratings; the file has an "Id" and a "Prediction" column.
data = pd.read_csv("../data/data_train.csv")
# Extract row and column numbers from the Id. An Id is two tokens joined by an
# underscore, each made of one prefix character followed by the number, so the
# first character of every token is dropped before converting to int.
id_parts = data["Id"].str.split("_", expand=True)
data["Row"] = id_parts[0].str[1:].astype(int)
data["Col"] = id_parts[1].str[1:].astype(int)


In [None]:
# Preview a handful of ids only: rendering the complete id list would flood the
# cell output and bloat the saved notebook.
data["Id"].head(10).to_list()

In [None]:
# Ratings are on a 1-5 scale. Build a Surprise dataset straight from the data
# frame; this expects `data` to expose "Row", "Col" and "Prediction" columns
# (user, item, rating - in that order).
reader = Reader(rating_scale=(1, 5))
Dataset.load_from_df(
    data.loc[:, ["Row", "Col", "Prediction"]],
    reader=reader,
)

In [None]:
# Convert the training CSV into a headerless "row,col,rating" text file.
# Every input record is "<Id>,<rating>", where the Id is two tokens joined by an
# underscore; the single prefix character of each token is dropped.
with open("../data/data_train.csv") as train_file, \
        open("../data/data_3cols.csv", "wt") as out_file:
    # Skip the header row and stream the rest instead of loading every line.
    next(train_file, None)
    for line in train_file:
        record = line.strip()
        if not record:
            # Tolerate blank lines (for instance a trailing empty line at EOF).
            continue
        raw_id, rating = record.split(",")
        row_token, col_token = raw_id.split("_")
        out_file.write("{},{},{}\n".format(row_token[1:], col_token[1:], rating))

In [None]:
# Collect the ids to predict from the sample submission file:
#   raw_ids - the ids exactly as written in the file,
#   all_uid - first token of each id without its one-character prefix,
#   all_iid - second token of each id without its one-character prefix.
raw_ids, all_uid, all_iid = [], [], []
with open("../data/sampleSubmission.csv") as submission_file:
    # Skip the header row and stream the rest instead of loading every line.
    next(submission_file, None)
    for line in submission_file:
        if not line.strip():
            # Tolerate blank lines (for instance a trailing empty line at EOF).
            continue
        # The second field (placeholder prediction) is not needed here.
        raw_id, _placeholder = line.split(",")
        row_token, col_token = raw_id.split("_")
        all_uid.append(row_token[1:])
        all_iid.append(col_token[1:])
        raw_ids.append(raw_id)

In [None]:
# Read the comma-separated "user item rating" file into a Surprise dataset and
# turn all of it into a training set (no hold-out split is made here).
file_path = "../data/data_3cols.csv"
reader = Reader(sep=",", line_format="user item rating")
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

In [None]:
# Resume from previously pickled predictions when available; otherwise start
# from an empty table indexed by the ids to predict.
try:
    df = pd.read_pickle("cache/cached_predictions.pkl")
except FileNotFoundError:
    print("No cached predictions found")
    df = pd.DataFrame(raw_ids, columns=["Id"]).set_index("Id")

In [None]:
# Estimator read by predictor(). A bare `global` statement at module level does
# nothing, so the name is initialised explicitly; the fitting loop rebinds it to
# each fitted algorithm before predictions are generated.
algo_in_use = None

In [None]:
def get_ids(rid):
    """Split a raw id into its two tokens and strip each one's first character.

    The id must contain exactly one underscore; otherwise unpacking raises
    ``ValueError``. Returns a ``(first, second)`` tuple of strings.
    """
    user_part, item_part = (token[1:] for token in rid.split("_"))
    return user_part, item_part
def predictor(ids_chunk):
    """Estimate a rating for every raw id in ``ids_chunk``.

    Each id is split with ``get_ids`` and scored by the module-level
    ``algo_in_use`` estimator. Progress messages are printed before and after
    the chunk is processed.

    Returns a list of ``(raw_id, estimate)`` tuples in input order.
    """
    print("Working on a chunk")
    res_chunk = [
        (rid, algo_in_use.predict(*get_ids(rid)).est)
        for rid in ids_chunk
    ]
    print("Finished chunk")
    return res_chunk
def parallelize_predictions(ids, algo, n_cores=16):
    """Predict ratings for ``ids`` using a pool of worker processes.

    Parameters
    ----------
    ids : sequence of str
        Raw ids to predict; they are split into roughly equal chunks, one per
        worker.
    algo : fitted estimator or None
        Estimator the workers should use. ``predictor`` reads it from the
        module-level ``algo_in_use`` name, so it is bound there before the pool
        is created. Passing ``None`` leaves the current ``algo_in_use`` as is.
    n_cores : int, default 16
        Upper bound on the number of worker processes.

    Returns
    -------
    list of (str, float)
        ``(raw_id, estimate)`` pairs in the same order as ``ids``.
    """
    global algo_in_use
    if algo is not None:
        # NOTE(review): relies on worker processes inheriting module globals at
        # creation time (true for the "fork" start method) - confirm on
        # platforms that use "spawn".
        algo_in_use = algo

    if len(ids) == 0:
        return []

    # Never start more workers than there are ids: surplus workers would only
    # receive empty chunks.
    n_workers = max(1, min(n_cores, len(ids)))
    id_chunks = np.array_split(ids, n_workers)

    # The context manager releases the pool even if a worker raises; map() has
    # already collected every result by the time the block exits.
    with Pool(n_workers) as pool:
        chunk_results = pool.map(predictor, id_chunks)

    # Flatten in plain Python: stacking (id, estimate) pairs into a numpy array
    # would coerce the estimates to strings and fail on ragged/empty chunks.
    return [
        (str(rid), float(est))
        for chunk in chunk_results
        for rid, est in chunk
    ]

In [None]:
# Candidate Surprise algorithms, keyed by the label that later names their
# column in the predictions table. Every estimator uses its default settings.
all_algos = {
    label: algo_class()
    for label, algo_class in (
        ("SVD", spr.SVD),
        ("Baseline", spr.BaselineOnly),
        ("NMF", spr.NMF),
        ("Slope One", spr.SlopeOne),
        ("KNN Basic", spr.KNNBasic),
        ("KNN Means", spr.KNNWithMeans),
        ("KNN Baseline", spr.KNNBaseline),
        ("KNN Zscore", spr.KNNWithZScore),
        ("SVD ++", spr.SVDpp),
    )
}

In [None]:
print("Starting loop")
# DataFrame.to_pickle does not create missing folders: make sure the cache
# directory exists up front so a finished fit is never lost at save time.
os.makedirs("cache", exist_ok=True)
for name, algo in all_algos.items():
    print("##### {} ####".format(name))
    if name in df.columns:
        # A column with this label is already in the table (e.g. from the cache).
        print("Already computed {}, skipping".format(name))
        continue
    # NOTE(review): the purpose of the one-second pauses around fit() is not
    # evident from this notebook; they are kept unchanged.
    time.sleep(1)
    algo.fit(trainset)
    time.sleep(1)
    # predictor() looks the estimator up through this module-level name.
    algo_in_use = algo
    print("Generating predictions...")
    predictions = parallelize_predictions(raw_ids, algo, 80)
    print("Done. Merging with previous results")
    pred_df = pd.DataFrame(predictions, columns=["Id", name]).set_index("Id")
    df = pd.merge(df, pred_df, left_index=True, right_index=True)
    # Checkpoint after every algorithm so an interrupted run can resume.
    df.to_pickle("cache/cached_predictions.pkl")

In [None]:
# Display the accumulated predictions table.
df