# Imports

In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from scipy.sparse import hstack, diags
from scipy import sparse
from collections import defaultdict

# Processing data

In [None]:
from google.colab import files
uploaded = files.upload()

In [17]:
row_lim = 1000000
df = pd.read_csv('data.csv', header=0, names=['User_id', 'Rating', 'Date', 'Movie_id'], nrows=row_lim)
df = df.drop('Date', axis=1)
encoder = OneHotEncoder(categories='auto')
users = encoder.fit_transform(np.asarray(df['User_id']).reshape(-1,1))
movies = encoder.fit_transform(np.asarray(df['Movie_id']).reshape(-1,1))
rating = defaultdict(list)
for i, row in df.iterrows():
    rating[row['User_id']] = (row['Movie_id'], row['Rating'])
    if (100 * i) % row_lim == 0:
        p = int(100 * i / row_lim)
others = np.zeros(movies.shape)
for i, row in df.iterrows():
    grade = rating[row['User_id']]
    others[i][grade[0] - 1] = grade[1]/10
    if (100 * i) % row_lim == 0:
        p = int(100 * i / row_lim)
X = hstack([np.ones(row_lim).reshape(-1,1), users, movies, others]).tocsr()
ratings = np.asarray(df['Rating']).reshape(-1,1)
X, ratings = shuffle(X, ratings)
y = ratings

# Prediction

In [18]:
def predict(X, w, V):
    a = np.sum(np.square(X.dot(V)), axis=1).reshape(-1,1)
    b = np.sum(X.power(2).dot(np.square(V)), axis=1).reshape(-1,1)
    return X.dot(w) + 0.5 * (a - b)

def grad_desc(X, y, eta=0.01, n_steps=1000):
    weight_dist = np.inf
    w = np.zeros((X.shape[1], 1))
    V = np.zeros((X.shape[1], 2))
    for i in range(n_steps):
        w += (2 * eta / len(y)) * X.T.dot(y - X.dot(w))
        precomp = X.dot(V)
        y_pred = predict(X, w, V)
        for i in range(V.shape[1]):
            d1 = diags(np.array(precomp)[:, i])
            a = d1.dot(X)
            d2 = diags(V[:, i])
            b = X.power(2).dot(d2)
            V[:, i] += (2 * eta / X.shape[1]) * (a - b).T.dot(y - y_pred).reshape((-1,))
        y_pred = predict(X, w, V)
    return w, V

def RMSE(y, y_pred):
    return np.sqrt(1 / len(y) * sum((y - y_pred)**2))

kf = KFold(n_splits=5, shuffle=True)
df_res = pd.DataFrame(columns=['name'])
names = ['RMSE train', 'RMSE test']
df_res['name'] = names
i = 0
for train_i, test_i in kf.split(X):
    i += 1
    X_train, X_test = X[train_i], X[test_i]
    y_train, y_test = y[train_i], y[test_i] 
    results = grad_desc(X_train, y_train, eta=0.2, n_steps=20)
    w, V = results
    pred_train = predict(X_train, w, V)
    pred_test = predict(X_test, w, V)
    RMSE_train = RMSE(y_train, pred_train)
    RMSE_test = RMSE(y_test, pred_test)
    stats = np.concatenate((RMSE_train, RMSE_test))
    df_res['T' + str(i)] = stats    

df_res['mean'] = df_res.mean(axis=1)
df_res['std'] = df_res.std(axis=1)
df_res.set_index('name')

Unnamed: 0_level_0,T1,T2,T3,T4,T5,mean,std
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RMSE train,1.043392,1.043928,1.043066,1.043187,1.043659,1.043447,0.000314
RMSE test,1.043598,1.041657,1.045019,1.044565,1.042501,1.043468,0.001252
