<a href="https://colab.research.google.com/github/CristinaRacovita/licenta/blob/master/RecommenderSystem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error

Read datasets (train & test) from https://grouplens.org/datasets/movielens/100k/ 

In [None]:
# Load the training split (tab-separated; column names are supplied here) and preview it.
train_data = pd.read_csv(
    '/content/u1.base',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [None]:
# Load the held-out test split (same tab-separated layout as the training file) and preview it.
test_data = pd.read_csv(
    '/content/u1.test',
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)
test_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198


Let's get to know our data better:


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the training split.
train_data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,80000.0,80000.0,80000.0,80000.0
mean,525.1657,425.7052,3.52835,883564500.0
std,255.94956,331.383936,1.118565,5318611.0
min,1.0,1.0,1.0,874724700.0
25%,334.0,175.0,3.0,879455600.0
50%,535.0,321.0,4.0,882844100.0
75%,744.0,631.0,4.0,888267600.0
max,943.0,1682.0,5.0,893286600.0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of the test split.
test_data.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,20000.0,20000.0,20000.0,20000.0
mean,211.76095,424.82985,3.5359,883386400.0
std,121.6469,328.452907,1.15368,5441471.0
min,1.0,1.0,1.0,874724700.0
25%,99.0,174.0,3.0,879373000.0
50%,222.0,322.0,4.0,882474700.0
75%,308.0,633.0,4.0,888204500.0
max,462.0,1591.0,5.0,893277700.0


In [None]:
# Column dtypes, non-null counts and memory usage of the training split.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80000 entries, 0 to 79999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   userId     80000 non-null  int64
 1   movieId    80000 non-null  int64
 2   rating     80000 non-null  int64
 3   timestamp  80000 non-null  int64
dtypes: int64(4)
memory usage: 2.4 MB


In [None]:
# Column dtypes, non-null counts and memory usage of the test split.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   userId     20000 non-null  int64
 1   movieId    20000 non-null  int64
 2   rating     20000 non-null  int64
 3   timestamp  20000 non-null  int64
dtypes: int64(4)
memory usage: 625.1 KB


In [None]:
# Size of the training split and the range of its ratings.
print("no of ratings:", len(train_data))
print("no of unique users:", len(train_data.userId.unique()))
print("no of unique movies:", len(train_data.movieId.unique()))
print("maximum rating:", train_data.rating.max())
print("minimum rating:", train_data.rating.min())

no of ratings: 80000
no of unique users: 943
no of unique movies: 1650
maximum rating: 5
minimum rating: 1


In [None]:
# Size of the test split and the range of its ratings.
print("no of ratings:", len(test_data))
print("no of unique users:", len(test_data.userId.unique()))
print("no of unique movies:", len(test_data.movieId.unique()))
print("maximum rating:", test_data.rating.max())
print("minimum rating:", test_data.rating.min())

no of ratings: 20000
no of unique users: 459
no of unique movies: 1410
maximum rating: 5
minimum rating: 1


Using matrix factorization, we can find latent features that determine how a user rates a movie. We decompose the rating matrix into two smaller matrices (user-feature and movie-feature) in such a way that their product approximates the original matrix.

Obs:

    np.random.normal = draws random samples from a normal (Gaussian) distribution (scale = standard deviation)
    np.newaxis = used to add one more dimension to an existing array


In [None]:
class MF():
    """Matrix factorization with user/item biases, trained by stochastic gradient descent.

    A rating for user row ``i`` and item column ``j`` is predicted as
    ``b + b_u[i] + b_i[j] + P[i] . Q[j]``, where ``b`` is the mean of the non-zero
    entries of ``R``. Entries of ``R`` equal to 0 are treated as "not rated".
    """

    def __init__(self, R, K, alpha, beta, iterations):
        """Store the rating matrix and the hyper-parameters.

        Args:
            R: 2-D NumPy array of ratings (rows = users, columns = items); 0 means no rating.
            K: number of latent features.
            alpha: learning rate of the SGD updates.
            beta: regularization strength applied to both the biases and the latent factors.
            iterations: maximum number of passes over the known ratings.
        """
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations

    def train(self):
        """Initialise the parameters and run SGD.

        Training stops early, without a message, at the first pass whose NRMSE is not
        lower than that of the previous pass.

        Returns:
            List of ``(iteration_index, nrmse)`` tuples, one per pass that improved the error.
        """
        # User-feature and item-feature matrices start as small Gaussian noise.
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))

        # Bias terms: per-user, per-item, and the global mean of the known ratings.
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])

        # Training samples: one (user_row, item_col, rating) tuple per known rating, row-major order.
        self.samples = [
            (i, j, self.R[i, j])
            for i, j in zip(*np.nonzero(self.R > 0))
        ]

        training_process = []
        previous_nrmse = np.inf
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            nrmse = self.nrmse()
            # Stop as soon as the training error no longer decreases.
            if not previous_nrmse - nrmse > 0:
                break
            training_process.append((i, nrmse))
            if (i+1) % 10 == 0:
                print("Iteration: %d ; error = %.4f; ant = %.4f" % (i+1, nrmse, previous_nrmse))
            previous_nrmse = nrmse

        return training_process

    def nrmse(self):
        """Return the RMSE over the known (non-zero) ratings, divided by 5.

        The divisor 5 is hard-coded; presumably it is the maximum rating of the data set.
        Returns 0 when ``R`` has no non-zero entry.
        """
        xs, ys = self.R.nonzero()
        if len(xs) == 0:
            return 0.0
        predicted = self.full_matrix()
        residuals = self.R[xs, ys] - predicted[xs, ys]
        return np.sqrt(np.mean(residuals ** 2))/5

    def sgd(self):
        """Run one SGD pass over ``self.samples``, updating biases and latent factors in place."""
        for i, j, r in self.samples:
            prediction = self.get_prediction(i, j)
            e = (r - prediction)

            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])

            # Keep the pre-update user row: the item update must use the same P[i] that
            # produced the error `e`, not the row that was just modified.
            p_i = self.P[i, :].copy()
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * p_i)
            self.Q[j, :] += self.alpha * (e * p_i - self.beta * self.Q[j, :])

    def get_prediction(self, i, j):
        """Return the predicted rating for user row ``i`` and item column ``j``."""
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :])
        return prediction

    def full_matrix(self):
        """Return the (num_users, num_items) matrix of predicted ratings for every pair."""
        # Uses this instance's parameters (not a global), with explicit broadcasting axes:
        # b_u varies along rows, b_i along columns.
        return self.b + self.b_u[:, np.newaxis] + self.b_i[np.newaxis, :] + self.P.dot(self.Q.T)

In [None]:
# Dense user x movie rating matrix built from the training split; missing pairs become 0.
# alpha=[0.00001, 0.0001, 0.001, 0.005, 0.1]  (list of learning-rate values kept as a note; unused)
R = (
    train_data
    .pivot(index='userId', columns='movieId', values='rating')
    .fillna(0)
    .to_numpy()
)
R

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [None]:
# Seed NumPy's global RNG so that the random initialisation and the per-pass shuffling
# inside MF.train() give the same result on every Restart & Run All.
np.random.seed(42)

mf = MF(R, K=20, alpha=0.001, beta=0.01, iterations=100)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()

Iteration: 10 ; error = 0.1922; ant = 0.1930
Iteration: 20 ; error = 0.1873; ant = 0.1876
Iteration: 30 ; error = 0.1851; ant = 0.1853
Iteration: 40 ; error = 0.1838; ant = 0.1839
Iteration: 50 ; error = 0.1827; ant = 0.1828
Iteration: 60 ; error = 0.1817; ant = 0.1818
Iteration: 70 ; error = 0.1806; ant = 0.1807
Iteration: 80 ; error = 0.1792; ant = 0.1793
Iteration: 90 ; error = 0.1774; ant = 0.1776
Iteration: 100 ; error = 0.1750; ant = 0.1753

P x Q:
[[3.91774997 3.2660794  3.11792289 ... 3.39359946 3.47742997 3.44731133]
 [4.0116654  3.42899284 3.20657023 ... 3.47717984 3.57819004 3.56315967]
 [3.55400966 2.98262672 2.68473175 ... 3.00566322 3.11452076 3.10752798]
 ...
 [4.22446423 3.61285858 3.40475282 ... 3.64377841 3.79126769 3.74512867]
 [4.39710472 3.7276517  3.50395199 ... 3.79642912 3.89519473 3.89227208]
 [3.68038751 3.20494192 3.00224648 ... 3.20599247 3.37423028 3.33046238]]



In [None]:
"""

err test > err train => overfitting

"""

# mf.get_rating(0,2)

'\n\nerr test < err train => overfitting\n\n'

Evaluation:

In [None]:
# Map raw ids to matrix positions. R was built with DataFrame.pivot on train_data, whose
# row/column labels are the sorted ids present in the training split, so position k
# corresponds to the k-th smallest id -- NOT to the raw (1-based) id itself.
user_ids = pd.Index(np.sort(train_data.userId.unique()))
movie_ids = pd.Index(np.sort(train_data.movieId.unique()))

user_pos = user_ids.get_indexer(test_data.userId)
movie_pos = movie_ids.get_indexer(test_data.movieId)

# get_indexer returns -1 for ids that never occur in the training split; the model has
# no parameters for those users/movies, so such test rows are left out of the score.
known = (user_pos >= 0) & (movie_pos >= 0)

# Compute the prediction matrix once instead of once per test row.
predicted_matrix = mf.full_matrix()
predicted = predicted_matrix[user_pos[known], movie_pos[known]]
real = test_data.rating.to_numpy()[known]

# Note: this is the MSE on the raw rating scale, whereas MF.nrmse() reports RMSE / 5.
error = mean_squared_error(real, predicted)
error

1.5010634998263463

Read info about movies.

In [None]:
# Movie metadata: pipe-separated, latin-1 encoded; the trailing columns are one flag per genre.
movies_data = pd.read_csv(
    '/content/u.item',
    encoding='latin-1',
    sep='|',
    names=[
        'movieId', 'movieName', 'releaseDate', 'nothing', 'imdbLink',
        'unkownGenre', 'actionGenre', 'adventureGenre', 'animationGenre',
        'childrensGenre', 'comedyGenre', 'crimeGenre', 'documentaryGenre',
        'dramaGenre', 'fantasyGenre', 'filmNoirGenre', 'horrorGenre',
        'musicalGenre', 'mysteryGenre', 'romanceGenre', 'sciFiGenre',
        'thrillerGenre', 'warGenre', 'westernGenre',
    ],
)
movies_data.head()

Unnamed: 0,movieId,movieName,releaseDate,nothing,imdbLink,unkownGenre,actionGenre,adventureGenre,animationGenre,childrensGenre,comedyGenre,crimeGenre,documentaryGenre,dramaGenre,fantasyGenre,filmNoirGenre,horrorGenre,musicalGenre,mysteryGenre,romanceGenre,sciFiGenre,thrillerGenre,warGenre,westernGenre
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [None]:
# Genre flag columns, in the order their labels must appear in the output string.
genre_columns = [
    'unkownGenre', 'actionGenre', 'adventureGenre', 'animationGenre',
    'childrensGenre', 'comedyGenre', 'crimeGenre', 'documentaryGenre',
    'dramaGenre', 'fantasyGenre', 'filmNoirGenre', 'horrorGenre',
    'musicalGenre', 'mysteryGenre', 'romanceGenre', 'sciFiGenre',
    'thrillerGenre', 'warGenre', 'westernGenre',
]

# The label of a genre is its column name without the 'Genre' suffix, followed by one
# space (e.g. 'sciFiGenre' -> 'sciFi '), exactly as the previous if-chain produced.
genre_labels = [column[:-len('Genre')] + ' ' for column in genre_columns]

# Boolean (movies x genres) table: True where the flag column equals 1.
genre_flags = movies_data[genre_columns].to_numpy() == 1

# One string per movie: the labels of all set flags, concatenated in column order.
movies_genres = [
    ''.join(label for label, is_set in zip(genre_labels, row_flags) if is_set)
    for row_flags in genre_flags
]

In [None]:
# Attach the combined genre string and drop the per-genre flag columns plus the empty
# 'nothing' column. errors='ignore' keeps the cell re-runnable: dropping columns that are
# already gone is a no-op instead of a KeyError.
movies_data = (
    movies_data
    .assign(genres=movies_genres)
    .drop(
        columns=[
            'unkownGenre', 'actionGenre', 'adventureGenre', 'animationGenre',
            'childrensGenre', 'comedyGenre', 'crimeGenre', 'documentaryGenre',
            'dramaGenre', 'fantasyGenre', 'filmNoirGenre', 'horrorGenre',
            'musicalGenre', 'mysteryGenre', 'romanceGenre', 'sciFiGenre',
            'thrillerGenre', 'warGenre', 'westernGenre', 'nothing',
        ],
        errors='ignore',
    )
)

movies_data.head()

Unnamed: 0,movieId,movieName,releaseDate,imdbLink,genres
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,animation childrens comedy
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,action adventure thriller
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,thriller
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,action comedy drama
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),crime drama thriller


In [None]:
def get_recommanfation(userId, top_n=19):
    """Return the highest predicted ratings for one user, best first.

    Args:
        userId: a user id as it appears in ``train_data.userId`` (not a matrix row index).
        top_n: how many ratings to return (default 19, the count used previously).

    Returns:
        List of up to ``top_n`` predicted ratings in descending order. Equal values are
        kept as separate entries.

    Raises:
        KeyError: if ``userId`` does not occur in the training data.
    """
    # R was built with DataFrame.pivot on train_data, whose row labels are the sorted
    # user ids, so row k of the prediction matrix belongs to the k-th smallest userId.
    known_user_ids = np.sort(train_data.userId.unique())
    row_matches = np.flatnonzero(known_user_ids == userId)
    if row_matches.size == 0:
        raise KeyError("userId %r does not appear in the training data" % (userId,))

    all_ratings = mf.full_matrix()[row_matches[0]]
    # np.delete returns a new array and never modifies its input, so repeatedly taking
    # max() of the same array yields the same value; sort once and slice instead.
    best_recommandations = np.sort(all_ratings)[::-1][:top_n]
    return best_recommandations.tolist()

get_recommanfation(2)

[4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595,
 4.3141752011909595]

In [None]:
# Display the training ratings table again (the notebook renders a truncated view).
train_data

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275
