In [1]:
import numpy as np
import pandas as pd

from scipy.sparse import hstack, diags

from collections import defaultdict

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

from pathlib import Path

In [2]:
config = defaultdict(list)
config['rows'] = 500000
config['theta'] = 0.1
config['steps'] = 20
config['file'] = 'L:/projects/combined_data_1.txt'

In [3]:
filePath = Path('data.csv')
if not filePath.exists():
    data = open('data.csv',     mode = 'w')
    file = open(config['file'], mode = 'r')
    rows = list()
    for line in file:
        del rows[:]
        line = line.strip()
        if line.endswith(':'):
            film_id = line.replace(':', '')
        else:
            rows = [el for el in line.split(',')]
            rows.append(film_id)
            data.write(','.join(rows) + '\n')
    file.close()
    data.close()

In [4]:
df = pd.read_csv('data.csv', header=0, names=['user_id', 'rating', 'date', 'film_id'], nrows=config['rows'])
df = df.drop('date', axis=1)
encoder = OneHotEncoder()
users = encoder.fit_transform(np.asarray(df['user_id']).reshape(-1, 1))
films = encoder.fit_transform(np.asarray(df['film_id']).reshape(-1, 1))
rating = defaultdict(list)
graded = np.zeros(films.shape)
folds = KFold(shuffle=True)
dfTable = pd.DataFrame(columns=['Type'])
dfTable['Type'] = ['RMSE']

In [5]:
for i, row in df.iterrows():
    rating[row['user_id']] = (row['film_id'], row['rating'])
    graded[i][row['film_id'] - 1] = row['rating']/10

x = hstack([np.ones(config['rows']).reshape(-1,1), users, films, graded]).tocsr()
y = np.asarray(df['rating']).reshape(-1,1)
x, y = shuffle(x, y)

In [6]:
def predict(x, w, v):
    return x.dot(w) + 0.5 * (
        np.sum(np.square(x.dot(v)), axis=1).reshape(-1,1) -
        np.sum(x.power(2).dot(np.square(v)), axis=1).reshape(-1,1)
    )

In [7]:
def gd(x, y, theta, steps):
    w = np.zeros((x.shape[1], 1))
    v = np.zeros((x.shape[1], 2))
    
    for i in range(steps):
        w += (2 * theta / len(y)) * x.T.dot(y - x.dot(w))
    
        xv = x.dot(v)
        pred = predict(x, w, v)
    
        for i in range(v.shape[1]):
            v[:, i] += (2 * theta / x.shape[1]) * (
                diags(np.array(xv)[:, i]).dot(x) -
                x.power(2).dot(diags(v[:, i]))
            ).T.dot(y - pred).reshape((-1,))
        
        pred = predict(x, w, v)
        
    return w, v

In [8]:
for ind, [ train, test ] in enumerate(folds.split(x)):
    w, v = gd(x[train], y[train], config['theta'], config['steps'])
    dfTable[ind + 1] = mean_squared_error(y[test], predict(x[test], w, v), squared = False)
dfTable.set_index('Type')

Unnamed: 0_level_0,1,2,3,4,5
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RMSE,1.054042,1.056073,1.050988,1.056912,1.056926
