In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pylab as plt
import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('../kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

../kaggle/input\movielens-fds\movielens-fds.zip
../kaggle/input\movielens-fds\sample_submission.csv
../kaggle/input\movielens-fds\test.csv
../kaggle/input\movielens-fds\training.csv


In [6]:
def evaluate(predict_f,data_test):
    """ RMSE-based predictive performance evaluation with pandas. """
    ids_to_estimate = zip(data_test.user_id, data_test.movie_id)
    estimated = np.array([predict_f(u,i) for (u,i) in ids_to_estimate ])
    real = data_test.rating.values
    return compute_rmse(estimated, real)


def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))


## Divide the data in two sets: training and test
def assign_to_set(df):
    sampled_ids = np.random.choice(df.index,
                                   size=np.int64(np.ceil(df.index.size * 0.05)),
                                   replace=False)
    df['for_testing'] = False
    df.loc[sampled_ids, 'for_testing'] = True
    return df

In [8]:
df = pd.read_csv('../kaggle/input/movielens-fds/training.csv',index_col=0)

In [9]:
df.shape

(945180, 5)

In [10]:
df_test = pd.read_csv('../kaggle/input/movielens-fds/test.csv',index_col=0)

In [11]:
df = df.sample(100235)

In [None]:
df_test = df_test.sample(972)

In [12]:
df_test.shape

(848600, 4)

In [13]:
from sklearn.preprocessing import MultiLabelBinarizer

def genre_categorical(df):
    

    df['Genres_list'] = df['genre'].str.split('|')

    ## assign a new series to the genres_list column that contains a list of categories for each movie
    list2series = pd.Series(df.Genres_list)

    mlb = MultiLabelBinarizer()

    ## use mlb to create a new dataframe of the genres from the list for each row from the original data

    one_hot_genres = pd.DataFrame(mlb.fit_transform(list2series),columns=mlb.classes_,index=list2series.index)
    
    df = df.drop(["Genres_list","title","genre"],axis=1)
    
    return pd.merge(df,one_hot_genres, left_index=True, right_index=True)

In [14]:
df = genre_categorical(df)
df_test = genre_categorical(df_test)

In [15]:
df_test.head()

Unnamed: 0,user_id,movie_id,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1762,307,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1762,67534,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1762,2317,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1762,94011,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1762,164725,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
grouped    = df.groupby('user_id', group_keys=False).apply(assign_to_set)
df_train = df[grouped.for_testing == False]
df_val   = df[grouped.for_testing == True]
df.head()

Unnamed: 0,user_id,movie_id,rating,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
148657,4912,832,5.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
280599,1818,2916,4.0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
256760,7968,2541,2.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
720976,9576,37386,1.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
353722,9357,5481,3.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df.shape

(100235, 23)

In [24]:
class FM():
    def __init__(self,k = 40,learning_rate = 10,solver='stochastic',iterations = 100,regularizer = 0.00035,verbose = True):
        self.learning_rate = learning_rate
        self.solver=solver
        self.k = k
        self.iterations = iterations
        self.indices = {}
        self.regularizer = regularizer
        self.verbose = verbose
        
        
    def set_indices(self,X):
        i = 0
        cols = {}
        for col in X.columns:
            cols[col] = i
            i += 1

        indices = {}
        nz = 0
        for col in X.columns:
            if X[col].dtype == 'object':
                indices[cols[col]] = {}
                colset = set(X[col])
                for a in colset:
                    indices[cols[col]][a] = nz
                    nz += 1
            else:
                indices[cols[col]] = nz
                nz += 1
        self.indices = indices
        self.nz = nz
        
    
    def getIndexVal(self, col, val):
        if isinstance(self.indices[col], int) or isinstance(self.indices[col], float):
            return self.indices[col], val
        else:
            try:
                return self.indices[col][val], 1.
            except:
                #print(col, val)
                return 0., 0.
            
    def getIndexValArray(self, arr):
        out = []
        for i in range(len(arr)):
            out.append(self.getIndexVal(i, arr[i]))
        return out
    
    def initialize_params(self, X, y):
        self.set_indices(X)
        self.w0 = np.mean(y)
        self.w1 = np.zeros(self.nz)
        self.v = np.random.normal(scale=0.1,size=(self.nz, self.k))
        
    def sgd(self, X, y, verbose = True):

        learning_rate = self.learning_rate
        regularizer = self.regularizer
        w0 = self.w0
        w1 = self.w1
        v = self.v
        nz = self.nz

        m = X.shape[0]
        n = X.shape[1]
        
        #SGD
        for epoch in range(self.iterations):
            start_time = time.perf_counter()

            preds = []
            for i in range(m):

                _x = X[i, :]
                _ivals = self.getIndexValArray(_x)
                ind = {}
                sum1 = 0
                sum2 = 0
                sum3 = np.zeros(self.k)

                for col in range(n):
                    index, val = _ivals[col]
                    ind[index] = val
                    sum1 += w1[index] * val

                for f in range(self.k):
                    s1 = 0.0
                    s2 = 0.0
                    for col in range(n):
                        index, val = _ivals[col]
                        temp = v[index, f] * val
                        s1 += temp
                        s2 += temp*temp
                    sum3[f] = s1
                    sum2 += s1*s1 - s2

                y_hat = w0 + sum1 + 0.5*sum2
                y_hat = max(1., y_hat)
                y_hat = min(5., y_hat)
                res = (y_hat - y[i])
                if self.verbose:
                    preds.append(abs(res)**2)

                b = learning_rate*regularizer
                # update rule for w0
                w0 = w0 - learning_rate * res - learning_rate * w0 * regularizer

                for col in range(n):
                    #index = col
                    #if col in ind:
                    #    val = ind[col]
                    index, val = _ivals[col]
                    temp = learning_rate * val * res
                    w1[index] -= (temp + b*w1[index])
                    for f in range(self.k):
                        v[index, f] -= (temp * (sum3[f] - v[index, f] * val) + b*v[index, f])
                    #else:
                    #    w1[index] -= b*w1[index]
                    #    for f in range(self.k):
                    #        v[index, f] -= b*v[index, f]

            print("epoch {} time {} mse {}".format(epoch, time.perf_counter()-start_time, np.mean(preds)))
        self.w0 = w0
        self.w1 = w1
        self.v = v
    
    
    def fit(self,X,y):
        self.initialize_params(X, y)
        X = np.array(X)
        y = np.array(y)
        self.sgd(X, y)
        return
    
    def predict(self, X, verbose=True):

        X = np.array(X)
        
        w0 = self.w0
        w1 = self.w1
        v = self.v
        nz = self.nz

        
        m = X.shape[0]
        n = X.shape[1]

        preds = []
        for i in range(m):


            _x = X[i, :]
            _ivals = self.getIndexValArray(_x)
            ind = {}
            sum1 = 0
            sum2 = 0
            sum3 = np.zeros(self.k)

            for col in range(n):

                index, val = _ivals[col]
                ind[index] = val

                index = int(index)

                sum1 += w1[index] * val


            for f in range(self.k):
                s1 = 0
                s2 = 0
                for col in range(n):
                    index, val = _ivals[col]
                    index = int(index)
                    temp = v[index, f] * val
                    s1 += temp
                    s2 += temp*temp
                sum3[f] = s1
                s1 = s1*s1
                sum2 += s1 - s2

            y_hat = w0 + sum1 + 0.5*sum2
            y_hat = max(1., y_hat)
            y_hat = min(5., y_hat)
            preds.append(y_hat)

        return preds

In [None]:
df_train = df.sample(frac=0.8,random_state=200)

In [None]:
df_val=df.drop(df_train.index)

In [19]:
val = df_val.copy()
train = df_train.copy()

test = df_test.copy()

In [20]:
y_train = train['rating']
train.drop(['rating'], axis=1, inplace=True)

train['user_id'] = train['user_id'].astype('str')
train['movie_id'] = train['movie_id'].astype('str')

y_val = val["rating"]
val.drop(['rating'], axis=1, inplace=True)
val['user_id'] = val['user_id'].astype('str')
val['movie_id'] = val['movie_id'].astype('str')

test['user_id'] = test['user_id'].astype('str')
test['movie_id'] = test['movie_id'].astype('str')

In [21]:
print("Train size:",train.shape,
     "\ny_train size:",val.shape,
     "\ntest size:",test.shape)

Train size: (89249, 22) 
y_train size: (10986, 22) 
test size: (848600, 22)


In [25]:
f = FM(k=10, iterations = 60, learning_rate = 0.01, regularizer=0.03)

In [26]:
f.fit(X=train, y=y_train)

epoch 0 time 66.14670619999742 mse 1.038081298853243
epoch 1 time 65.90673369999786 mse 0.9135895171385224
epoch 2 time 65.82545670000036 mse 0.8605316949941623
epoch 3 time 65.96206449999954 mse 0.8261342859648161
epoch 4 time 66.35091670000111 mse 0.7999291877257593
epoch 5 time 66.68685040000128 mse 0.7780658660143074
epoch 6 time 66.26934290000281 mse 0.7587121477313422
epoch 7 time 66.62225419999959 mse 0.7408600628518739
epoch 8 time 66.13566210000135 mse 0.7239083339808544
epoch 9 time 66.27244479999717 mse 0.7074822430556812
epoch 10 time 66.3075523999978 mse 0.6913463356710418
epoch 11 time 66.59072450000167 mse 0.6753570992269241
epoch 12 time 66.60134239999752 mse 0.6594503413946365
epoch 13 time 65.71159140000236 mse 0.6436042554329985
epoch 14 time 66.63661649999995 mse 0.6278103096220201
epoch 15 time 66.56197020000036 mse 0.6120840360520245
epoch 16 time 67.52666389999649 mse 0.5964594664132343
epoch 17 time 66.76082479999968 mse 0.58098492586508
epoch 18 time 65.9778857

In [None]:
0.75
0.89

In [None]:
1094+1092+1089+1093+1090

In [27]:
y_pred = f.predict(val)

In [28]:
from sklearn.metrics import mean_squared_error

In [None]:
np.sqrt(2.7410814585956933)

In [29]:
print("MSE: {}".format(mean_squared_error(y_val, y_pred)))

MSE: 1.2562889669833912


In [None]:
print("MSE: {}".format(mean_squared_error(y_val, y_pred)))

In [None]:
0.96
1.3
1.01

In [30]:
testPred=f.predict(test)

In [None]:
from scipy import sparse


class RecSys_vanilla_mf_biases():
    """ Collaborative filtering using a custom sim(u,u'). """

    def __init__(self,df_train,df_val, num_components=10):
        """ Constructor """
        self.df_train = df_train
        self.df_val = df_val
        self.num_components=num_components
        self.train = pd.pivot_table(self.df_train[['user_id','movie_id','rating']],columns='movie_id',index='user_id',values='rating')
        
        # We create a dictionary where we will store the user_id and movie_id which correspond 
        # to each index in the Rating matrix
        
        user_index = np.arange(len(self.train.index))
        self.users = dict(zip(user_index,self.train.index ))
        self.users_id2index = dict(zip(self.train.index,user_index)) 
        
        movie_index = np.arange(len(self.train.columns))
        self.movies = dict(zip(movie_index,self.train.columns )) 
        self.movies_id2index= dict(zip(self.train.columns, movie_index))
        self.movies_index2id= dict(zip(movie_index,self.train.columns))
        self.movie_id2title = dict(df.groupby(by=['movie_id','title']).count().index)
    
    def __sdg__(self):
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            user_id = self.users[u]
            item_id = self.movies[i]
            
            prediction = self.predict(user_id, item_id)
            error = (self.ratings[u,i] - prediction) # error
            #Update latent factors
            self.user_vecs[u, :] += self.learning_rate * \
                                    (error * self.item_vecs[i, :] - self.lmbda * self.user_vecs[u,:])
            self.item_vecs[i, :] += self.learning_rate * \
                                    (error * self.user_vecs[u, :] - self.lmbda * self.item_vecs[i,:])
            
            self.bias_item[i] += self.learning_rate * (error - self.lmbda * self.bias_item[i]) 
            self.bias_user[u] += self.learning_rate * (error - self.lmbda * self.bias_user[u]) 
                
                
    def fit(self,n_epochs = 10,learning_rate =0.001,lmbda=0.1,verbose =True):
        """ We decompose the R matrix into to submatrices using the training data """
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.lmbda = lmbda
        
        self.ratings = np.float32(self.train.fillna(0).values)
        self.mean_rating = self.ratings[self.ratings>0].mean() 
        self.n_users, self.n_items = self.train.shape
        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)
        
        self.train_rmse =[]
        self.test_rmse = []
        iter_diff = 0
        
        # initialize latent vectors
        self.user_vecs = np.random.normal(scale=1./self.num_components,\
                                          size=(self.n_users, self.num_components))
        self.item_vecs = np.random.normal(scale=1./self.num_components,
                                          size=(self.n_items, self.num_components))
        self.bias_item = np.random.normal(scale=1/self.n_items,size=(self.n_items))
        self.bias_user = np.random.normal(scale=1/self.n_users,size=(self.n_users))
        

        for epoch in range(n_epochs):
            print('Epoch: {}'.format(epoch))
            
            self.training_indices = np.arange(self.n_samples)
            
            #shuffle training samples
            np.random.shuffle(self.training_indices)
            self.__sdg__()
            
            self.train_rmse.append(evaluate(self.predict,self.df_train))
            self.test_rmse.append(evaluate(self.predict,self.df_val))
            
            
            print('\tTrain rmse: %s' % self.train_rmse[-1])
            print('\tTest rmse: %s' % self.test_rmse[-1])
            
        
        if(self.verbose):
            self.__plot_learning_curves__()
    
    def __plot_learning_curves__(self):
        plt.plot(self.train_rmse,'--o',label="train_error")
        plt.plot(self.test_rmse,'--o',label="test_error")
        plt.legend()
        plt.show()
        
    def predict(self, user_id, movie_id):
        """ Single user and item prediction."""
        if(user_id in self.users_id2index):
            user_index = self.users_id2index[user_id]
        else:
            return 3 #cold start user
        if movie_id in self.movies_id2index:
            item_index = self.movies_id2index[movie_id]
            prediction =  self.mean_rating + self.user_vecs[user_index, :].dot(self.item_vecs[item_index, :].T) + self.bias_item[item_index] + self.bias_user[user_index]
        else:
            prediction = self.mean_rating # this is a new movie

        return prediction
    

In [None]:
reco = RecSys_vanilla_mf_biases(training,val,num_components=5)
reco.fit(n_epochs = 5,learning_rate=0.01,lmbda=0.5)
print('RMSE for Collaborative Recomender: %s' % evaluate(reco.predict,df_val))

In [None]:
#estimate all pairs of user_id/movie_id from the test set
ids_to_estimate = zip(df_test.user_id, df_test.movie_id)
estimated = np.array([reco.predict(u,i) for (u,i) in ids_to_estimate ])

# generate the submission file
df_test['estimated'] = estimated
# df_test.sort_values(by='estimated',ascending=False)[['user_id','movie_id']].to_csv('baseline_submision.csv',index=None)

In [None]:
pd.read_csv("/kaggle/output/training.csvbaseline_submission.csv")

In [None]:
tt = testPred.copy()

In [31]:
df_test["estimated"] = testPred

In [None]:
df_test.head()

In [32]:
df_test.sort_values(by='estimated',ascending=False)[['user_id','movie_id']].to_csv('Origi.csv',index=None)

In [33]:
def dcg_at_k(y_true, y_score, k = 10):
    """
    Discounted cumulative gain (DCG) at rank k
    
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels)
    
    y_score : array-like, shape = [n_samples]
        Predicted scores
    
    k : int
        Rank

    Returns
    -------
    dcg : float
    """
    order = np.argsort(y_score)[::-1]
    print(order)
    y_true = np.take(y_true, order[:k])
    print(y_true)
    gains = 2 ** y_true - 1
    print(gains)
    discounts = np.log2(np.arange(2, gains.size + 2))
    print(discounts)
    dcg = np.sum(gains / discounts)
    
    return dcg

In [34]:
dcg_at_k(y_val,y_pred,5)

[ 4418  1793 10673 ...   650  3120  9974]
187529    5.0
379630    3.5
255484    4.0
422085    2.0
37021     4.0
Name: rating, dtype: float64
187529    31.000000
379630    10.313708
255484    15.000000
422085     3.000000
37021     15.000000
Name: rating, dtype: float64
[1.         1.5849625  2.         2.32192809 2.5849625 ]


52.10204734441061

In [35]:
np.sqrt(0.22760168620479948)

0.47707618490635173

In [36]:
ndcgTrain = dcg_at_k(y_val,y_pred,10)
possible = dcg_at_k(y_val,y_val,10)
print('NDCG of Validation:',ndcgTrain/possible)

[ 4418  1793 10673 ...   650  3120  9974]
187529    5.0
379630    3.5
255484    4.0
422085    2.0
37021     4.0
201589    4.5
409227    4.0
776953    5.0
818550    4.5
14433     5.0
Name: rating, dtype: float64
187529    31.000000
379630    10.313708
255484    15.000000
422085     3.000000
37021     15.000000
201589    21.627417
409227    15.000000
776953    31.000000
818550    21.627417
14433     31.000000
Name: rating, dtype: float64
[1.         1.5849625  2.         2.32192809 2.5849625  2.80735492
 3.         3.169925   3.32192809 3.45943162]
126327        0
741775     8493
803596     8515
87068      3469
290860     3477
          ...  
614894    10560
560611    10313
513558     9388
49717      1139
148657     5169
Name: rating, Length: 10986, dtype: int64
148657    5.0
358020    5.0
518762    5.0
757545    5.0
133488    5.0
188295    5.0
645030    5.0
169487    5.0
531096    5.0
920924    5.0
Name: rating, dtype: float64
148657    31.0
358020    31.0
518762    31.0
757545    31.0
