In [16]:
from helpers import load_data, split_data
import numpy as np
import surprise as spr
from surprise import Dataset
from surprise import Reader
import os
import pandas as pd
from multiprocessing import Pool
from multiprocessing.dummy import Pool as ThreadPool
import time
import re

In [23]:
data= pd.read_csv("../data/data_train.csv")
#extracting row and column numbers


In [26]:
data.Id.to_list()

['r44_c1',
 'r61_c1',
 'r67_c1',
 'r72_c1',
 'r86_c1',
 'r90_c1',
 'r108_c1',
 'r114_c1',
 'r120_c1',
 'r135_c1',
 'r152_c1',
 'r165_c1',
 'r182_c1',
 'r310_c1',
 'r318_c1',
 'r333_c1',
 'r355_c1',
 'r390_c1',
 'r401_c1',
 'r410_c1',
 'r418_c1',
 'r457_c1',
 'r470_c1',
 'r497_c1',
 'r516_c1',
 'r566_c1',
 'r595_c1',
 'r670_c1',
 'r673_c1',
 'r708_c1',
 'r720_c1',
 'r743_c1',
 'r777_c1',
 'r861_c1',
 'r872_c1',
 'r908_c1',
 'r930_c1',
 'r966_c1',
 'r967_c1',
 'r981_c1',
 'r1034_c1',
 'r1075_c1',
 'r1107_c1',
 'r1220_c1',
 'r1230_c1',
 'r1254_c1',
 'r1338_c1',
 'r1407_c1',
 'r1435_c1',
 'r1489_c1',
 'r1495_c1',
 'r1527_c1',
 'r1547_c1',
 'r1549_c1',
 'r1570_c1',
 'r1583_c1',
 'r1622_c1',
 'r1667_c1',
 'r1775_c1',
 'r1789_c1',
 'r1802_c1',
 'r1813_c1',
 'r1830_c1',
 'r1850_c1',
 'r1878_c1',
 'r1921_c1',
 'r1970_c1',
 'r2038_c1',
 'r2051_c1',
 'r2081_c1',
 'r2090_c1',
 'r2097_c1',
 'r2107_c1',
 'r2176_c1',
 'r2195_c1',
 'r2225_c1',
 'r2244_c1',
 'r2284_c1',
 'r2327_c1',
 'r2347_c1',
 'r235

In [22]:
reader = Reader(rating_scale=(1,5))
Dataset.load_from_df(data[["Row", "Col", "Prediction"]], reader=reader)

<surprise.dataset.DatasetAutoFolds at 0x7f3aa26d9e48>

In [2]:
with open("../data/data_train.csv") as f1:
    with open("../data/data_3cols.csv", "wt") as f2:
        for l in f1.readlines()[1:]:
            id, rating = l.split(",")
            row, col = id.split("_")
            row = row[1:]
            col = col[1:]
            f2.write("{},{},{}".format(row, col, rating))

In [3]:
raw_ids, all_uid, all_iid = [], [], []
with open("../data/sampleSubmission.csv") as f1:
    for l in f1.readlines()[1:]:
        id, _ = l.split(",")
        row, col = id.split("_")
        all_uid.append(row[1:])
        all_iid.append(col[1:])
        raw_ids.append(id)

In [4]:
file_path = '../data/data_3cols.csv'
reader = Reader(line_format='user item rating', sep=",")
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

In [5]:
try:
    df = pd.read_pickle("cache/cached_predictions.pkl")
except FileNotFoundError:
    print("No cached predictions found")
    df = pd.DataFrame(raw_ids, columns=["Id"])
    df.set_index("Id", inplace=True)

In [6]:
global algo_in_use

In [7]:
def get_ids(rid):
    u, i = rid.split("_")
    return u[1:], i[1:]
def predictor(ids_chunk):
    print("Working on a chunk")
    res_chunk = []
    for i in ids_chunk:
        uid, iid = get_ids(i)
        p = algo_in_use.predict(uid, iid)
        res_chunk.append((i, p.est))
    print("Finished chunk")
    return res_chunk
def parallelize_predictions(ids, algo, n_cores=16):
    splitted_ids = np.array_split(ids, n_cores)
    pool = Pool(n_cores)
    res = np.concatenate(pool.map(predictor, splitted_ids))
    res = [(r[0], float(r[1])) for r in res]
    pool.close()
    pool.join()
    return res

In [8]:
all_algos = {"SVD": spr.SVD(), "Baseline": spr.BaselineOnly(), "NMF": spr.NMF(), 
             "Slope One": spr.SlopeOne(), "KNN Basic": spr.KNNBasic(), 
             "KNN Means": spr.KNNWithMeans(), "KNN Baseline": spr.KNNBaseline(), 
             "KNN Zscore":spr.KNNWithZScore(), "SVD ++": spr.SVDpp()}

In [9]:
print("Starting loop")
for name in all_algos:
    print("##### {} ####".format(name))
    if name in df.columns:
        print("Already computed {}, skipping".format(name))
        continue
    algo = all_algos[name]
    time.sleep(1)
    algo.fit(trainset)
    time.sleep(1)
    algo_in_use = algo
    print("Generating predictions...")
    predictions = parallelize_predictions(raw_ids, algo, 80)
    print("Done. Merging with previous results")
    pred_df = pd.DataFrame(predictions, columns=["Id", name])
    pred_df.set_index("Id", inplace=True)
    df = pd.merge(df, pred_df, left_index=True, right_index=True)
    df.to_pickle("cache/cached_predictions.pkl")

Starting loop
##### SVD ####
Already computed SVD, skipping
##### Baseline ####
Already computed Baseline, skipping
##### NMF ####
Already computed NMF, skipping
##### Slope One ####
Already computed Slope One, skipping
##### KNN Basic ####
Already computed KNN Basic, skipping
##### KNN Means ####
Computing the msd similarity matrix...
Done computing similarity matrix.
Generating predictions...
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Working on a chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Finished chunk
Done. Merging with previous results
##### KNN Baseline ###

In [10]:
df

Unnamed: 0_level_0,SVD,Baseline,NMF,Slope One,KNN Basic,KNN Means,KNN Baseline,KNN Zscore,SVD ++
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
r37_c1,3.221081,3.302156,3.432209,3.245664,3.481942,3.253983,3.354065,3.146721,3.067358
r73_c1,3.112221,3.031776,2.872050,2.953050,3.289886,3.045668,3.076222,3.028238,3.232603
r156_c1,4.060513,3.714090,3.767539,3.735032,3.690695,3.775587,3.807600,3.787213,4.178538
r160_c1,2.930088,3.294987,3.323812,3.300362,3.453819,3.133343,3.317898,2.945526,3.953791
r248_c1,3.014638,3.333168,3.243256,3.317072,3.353541,3.315370,3.255762,3.360111,3.914291
...,...,...,...,...,...,...,...,...,...
r9974_c1000,3.408777,3.675840,3.532176,3.676561,3.542406,3.451379,3.592030,3.413895,3.804892
r9977_c1000,4.004382,3.549342,3.594914,3.508128,3.727528,3.640440,3.602829,3.563078,3.769324
r9978_c1000,2.747601,3.149921,3.303986,3.097642,3.884125,3.196653,3.365188,3.151636,2.892084
r9982_c1000,3.506213,3.207369,3.184635,3.165743,3.927775,3.336339,3.485673,3.328428,3.130404
