In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breakthrough-tech-ai-studio-challenge/sample_submission.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/movies_keywords.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/movies_metadata.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/train.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/test.csv


## Train Dataset

In [2]:
train = pd.read_csv("/kaggle/input/breakthrough-tech-ai-studio-challenge/train.csv")
train.head()

Unnamed: 0,userId_movieId,rating
0,10_1358,0.4
1,237_1544,0.7
2,54_373,1.0
3,11_2053,0.8
4,183_2524,0.6


In [3]:
train.shape

(70002, 2)

In [4]:
#split "userId_movieId" column to two 'userId', 'movieId'
train[['userId', 'movieId']] = train["userId_movieId"].apply(lambda x: pd.Series(str(x).split("_")))
train = train.drop('userId_movieId', axis=1)
train.head()

Unnamed: 0,rating,userId,movieId
0,0.4,10,1358
1,0.7,237,1544
2,1.0,54,373
3,0.8,11,2053
4,0.6,183,2524


In [5]:
train.insert(loc=3, column='rating_', value=train['rating'])
train = train.drop(['rating'], axis=1)
train.rename(columns={"rating_": "rating"}, inplace=True)

In [6]:
train.head()

Unnamed: 0,userId,movieId,rating
0,10,1358,0.4
1,237,1544,0.7
2,54,373,1.0
3,11,2053,0.8
4,183,2524,0.6


In [7]:
#Check for the dtypes and null values in our columns 
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70002 entries, 0 to 70001
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   70002 non-null  object 
 1   movieId  70002 non-null  object 
 2   rating   70002 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.6+ MB


In [8]:
#change the datatype of the id columns from object to integers
train = train.astype({"userId":"int","movieId":"int"})

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70002 entries, 0 to 70001
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   70002 non-null  int64  
 1   movieId  70002 non-null  int64  
 2   rating   70002 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.6 MB


## Create a User- Items Interactions Matrix - Collaborative Filtering method

Model based collaborative approaches only rely on user-item interactions information and assume a latent model supposed to explain these interactions. Matrix factorization algorithms consists in decomposing the huge and sparse user-item interaction matrix into a product of two smaller and dense matrices: a user-factor matrix (containing users representations) that multiplies a factor-item matrix (containing items representations).

### Singular Value Decomposition (SVD)

In [10]:
from math import sqrt

In [11]:
class SVDRecommender:

    """
    Singular Value Decomposition is an important technique used in recommendation systems.
    Using SVD, the complete utility matrix is decomposed into user and item features.
    Thus the dimensionality of the matrix is reduced and we get the most important features
    neglecting the weaker ones.
    The utility matrix is initially sparse having a lot of missing values. The missing values
    are filled in using the mean for that item.
    no_of_features: the number of the biggest features that are to be taken for each user and
                    item. default value is 15.
    method: 1. default: The mean for the item is deducted from the user-item pair value in
                        the utility matrix. SVDRecommender is computed. With the computed values, the
                        mean for the item is added back to get the final result.
    formatizer: a dict having the keys 'user', 'item' and 'value' each having an integer value
                that denotes the column numbers of the corresponding things in the array
                provided in the fit and predict method. The 'value' will be used only in the
                fit method.
    Attributes:
                instantiation outside init:
                no_of_users=int()
                no_of_items=int()
                user_index=list()
                item_index=list()
                Usk=None
                skV=None
    """

    def __init__(self,
                 no_of_features=15,
                 method='default',
                 ):
        self.parameters={"no_of_features", "method"}
        self.method = method
        self.no_of_features = no_of_features


    def get_params(self, deep=False):
        out=dict()
        for param in self.parameters:
            out[param]=getattr(self, param)

        return out


    def set_params(self, **params):

        for a in params:
            if a in self.parameters:
                setattr(self,a,params[a])
            else:
                raise AttributeError("No such attribute exists to be set")


    def create_utility_matrix(self, data, formatizer = {'user':0, 'movie': 1, 'value': 2}):
        """
        :param dataset_array:   Array-like, 2D, nx3
        :param indices:         pass the formatizer
        :return:                the utility matrix. 2D, n x m, n=users, m=items
        """
        movieField = formatizer['movie']
        userField = formatizer['user']
        valueField = formatizer['value']

        userList = data.loc[:,userField].tolist()
        movieList = data.loc[:,movieField].tolist()
        valueList = data.loc[:,valueField].tolist()

        users = list(set(data.loc[:,userField]))
        movies = list(set(data.loc[:,movieField]))

        users_index = {users[i]: i for i in range(len(users))}

        pd_dict = {movie: [np.nan for i in range(len(users))] for movie in movies}

        for i in range(0,len(data)):
            movie = movieList[i]
            user = userList[i]
            value = valueList[i]

            pd_dict[movie][users_index[user]] = value

        X = pd.DataFrame(pd_dict)
        X.index = users

        users = list(X.index)
        movies = list(X.columns)

        return np.array(X), users, movies


    def fit(self, user_item_matrix, userList, movieList):

        """
        :param X: nx3 array-like. Each row has three elements in the order given by the
                  formatizer. The userId, itemId and the value/rating.
               formatizer: to change the default format
        :return: Does not return anything. Just fits the data and forms U, s, V by SVDRecommender
        """

        self.users = list(userList)
        self.movies = list(movieList)

        self.user_index = {self.users[i]: i for i in range(len(self.users))}
        self.movie_index = {self.movies[i]: i for i in range(len(self.movies))}


        mask=np.isnan(user_item_matrix)
        masked_arr=np.ma.masked_array(user_item_matrix, mask)

        self.predMask = ~mask

        self.movie_means=np.mean(masked_arr, axis=0)
        self.user_means=np.mean(masked_arr, axis=1)
        self.movie_means_tiled = np.tile(self.movie_means, (user_item_matrix.shape[0],1))

        # utility matrix or ratings matrix that can be fed to svd
        self.utilMat = masked_arr.filled(self.movie_means)

        # for the default method
        if self.method=='default':
            self.utilMat = self.utilMat - self.movie_means_tiled


        # Singular Value Decomposition starts
        # k denotes the number of features of each user and item
        # the top matrices are cropped to take the greatest k rows or
        # columns. U, V, s are already sorted descending.

        k = self.no_of_features
        U, s, V = np.linalg.svd(self.utilMat, full_matrices=False)

        U = U[:,0:k]
        V = V[0:k,:]
        s_root = np.diag([sqrt(s[i]) for i in range(0,k)])

        self.Usk=np.dot(U,s_root)
        self.skV=np.dot(s_root,V)
        self.UsV = np.dot(self.Usk, self.skV)

        self.UsV = self.UsV + self.movie_means_tiled




    def predict(self, X, formatizer = {'user': 0, 'movie': 1}):
        """
        :param X: the test set. 2D, array-like consisting of two eleents in each row
                  corresponding to the userId and itemId
               formatizer: to change the default format
        :return: 1D, a list giving the value/rating corresponding to each user-item
                 pair in each row of X.
        """

        users = X.loc[:,formatizer['user']].tolist()
        movies = X.loc[:,formatizer['movie']].tolist()

        if self.method == 'default':

            values = []
            for i in range(len(users)):
                user = users[i]
                movie = movies[i]

                # user and item in the test set may not always occur in the train set. In these cases
                # we can not find those values from the utility matrix.
                # That is why a check is necessary.
                # 1. both user and item in train
                # 2. only user in train
                # 3. only item in train
                # 4. none in train

                if user in self.user_index:
                    if movie in self.movie_index:
                        values.append( self.UsV[self.user_index[user], self.movie_index[item]] )
                    else:
                        values.append( self.user_means[ self.user_index[user] ] )

                elif movie in self.movie_index and user not in self.user_index:
                    values.append( self.movie_means[self.movie_index[movie] ])

                else:
                    values.append(np.mean(self.movie_means)*0.6 + np.mean(self.user_means)*0.4)

        return values


    def topN_similar(self, x, column='movie', N=10, weight=True):

        """
        Gives out the most similar contents compared to the input content given. For an user input gives out similar
        users. For an item input, gives out the most similar items.
        :param x: the identifier string for the user or item.
        :param column: either 'user' or 'item'
        :param N: The number of best matching similar content to output
        :param weight: True or False. True means the feature differences are weighted. Puts more penalty on the differences
        between bigger features.
        :return: A list of tuples.
        """
        out=list()

        if column=='user':
            if x not in self.user_index:
                raise Exception("Invalid user")
            else:
                for user in self.user_index:
                    if user != x:
                        temp = dissimilarity(self.Usk[self.user_index[user],:], self.Usk[self.user_index[x],:], weighted=weight)
                        out.append((user, temp))
        if column=='movie':
            if x not in self.movie_index:
                raise Exception("Invalid movie")
            else:
                for movie in self.item_index:
                    if movie != x:
                        temp = dissimilarity(self.skV[:, self.movie_index[item]], self.skV[:, self.movie_index[x]], weighted=weight)
                        out.append((movie, temp))

        out = special_sort(out, order='ascending')
        out = out[:N]
        return out


    def recommend(self, users_list, N=10, values = False):

        # utilMat element not zero means that element has already been
        # discovered by the user and can not be recommended
        predMat = np.ma.masked_where(self.predMask, self.UsV).filled(fill_value=-999)
        out = []

        if values == True:
            for user in users_list:
                try:
                    j = self.user_index[user]
                except:
                    raise Exception("Invalid user:", user)
                max_indices = predMat[j,:].argsort()[-N:][::-1]
                out.append( [(self.movies[index],predMat[j,index]) for index in max_indices ] )

        else:
            for user in users_list:
                try:
                    j = self.user_index[user]
                except:
                    raise Exception("Invalid user:", user)
                max_indices = predMat[j,:].argsort()[-N:][::-1]
                out.append( [self.items[index] for index in max_indices ] )


        return out

### Test

In [12]:
test = pd.read_csv("/kaggle/input/breakthrough-tech-ai-studio-challenge/test.csv")
test.head()

Unnamed: 0,userId_movieId
0,469_2124
1,439_3753
2,522_1682
3,429_1217
4,71_1210


In [13]:
test[['userId', 'movieId']] = test["userId_movieId"].apply(lambda x: pd.Series(str(x).split("_")))
test = test.drop('userId_movieId', axis=1)
test.head()

Unnamed: 0,userId,movieId
0,469,2124
1,439,3753
2,522,1682
3,429,1217
4,71,1210


In [14]:
svd = SVDRecommender(no_of_features=18)

# Creates the user-item matrix, the userIds on the rows and the itemIds on the columns.
user_item_matrix, users, movies = svd.create_utility_matrix(train, formatizer={'user':'userId', 'movie':'movieId', 'value':'rating'})

# fits the svd model to the matrix data.
svd.fit(user_item_matrix, users, movies)

# predict the ratings from test set
preds = svd.predict(test, formatizer = {'user':'userId', 'movie': 'movieId'})

In [15]:
# preds

In [16]:
results = test.copy()
results

Unnamed: 0,userId,movieId
0,469,2124
1,439,3753
2,522,1682
3,429,1217
4,71,1210
...,...,...
29997,305,2599
29998,22,2109
29999,534,2947
30000,558,4085


In [17]:
results['userId_movieId'] = results.userId.astype(str).str.cat(results.movieId.astype(str), sep='_')
results = results.drop(['userId', 'movieId'], axis=1)
results["rating"] = preds
results

Unnamed: 0,userId_movieId,rating
0,469_2124,0.688909
1,439_3753,0.688909
2,522_1682,0.688909
3,429_1217,0.688909
4,71_1210,0.688909
...,...,...
29997,305_2599,0.688909
29998,22_2109,0.688909
29999,534_2947,0.688909
30000,558_4085,0.688909


In [18]:
results.to_csv('submission.csv',index=False)

In [19]:
# from sklearn.decomposition import TruncatedSVD
# svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
# result = svd.fit_transform(user_item_matrix_np)
# result.shape