# Recommender System

Source: machinelearningcoban.com

In [1]:
import pandas as pd 
# Load the user table; the file is pipe-separated and the column names
# are supplied explicitly here.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'u.user',
    names=u_cols,
    sep='|',
    encoding='latin-1',
)

n_users = len(users)
print('Number of users:', n_users)
# The last expression of the cell is displayed: the first few user rows.
users.head()

Number of users: 943


Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [2]:
# Load the two rating files (tab-separated; column names supplied explicitly).
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base, ratings_test = (
    pd.read_csv(path, sep='\t', names=r_cols, encoding='latin-1')
    for path in ('ua.base', 'ua.test')
)

# Array form of each table: one row per rating, columns in r_cols order.
rate_train, rate_test = ratings_base.values, ratings_test.values

print('Number of training ratings:', len(rate_train))
print('Number of test ratings:', len(rate_test))


Number of training ratings: 90570
Number of test ratings: 9430


In [3]:
# Load the item (movie) table; the file is pipe-separated and the column
# names are supplied explicitly: 5 descriptive columns followed by 19
# genre columns.
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
] + [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

items = pd.read_csv('u.item', names=i_cols, sep='|', encoding='latin-1')

n_items = len(items)
print('Number of items:', n_items)

Number of items: 1682


In [4]:
# Display the first rows of the item table as the cell output.
items.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# The trailing N_GENRES columns of the item table are the genre columns
# ('unknown' through 'Western' in i_cols); they are used as the raw
# per-item feature counts.
N_GENRES = 19
X0 = items.values
X_train_counts = X0[:, -N_GENRES:]
# Show the genre row of the first item.
print(X_train_counts[0])

[0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
# Turn the genre counts into TF-IDF features with L2 normalisation and
# smoothed idf, then convert the transformer's output to a dense array.
transformer = TfidfTransformer(norm='l2', smooth_idf=True)
tfidf = transformer.fit_transform(
    X_train_counts.tolist()
).toarray()

In [7]:
# Print the first row of the TF-IDF feature matrix.
print(tfidf[0])

[0.         0.         0.         0.74066017 0.57387209 0.34941857
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.        ]


In [8]:
import numpy as np
def get_items_rated_by_user(rate_matrix, user_id):
    """
    Look up everything one user has rated.

    Each row of rate_matrix holds: user_id, item_id, rating, time_stamp;
    only the first three columns are used. The ids stored in rate_matrix
    start from 1, whereas the user_id argument and the returned item ids
    start from 0.

    return (item_ids, scores) for the user with index user_id
    """
    # stored user ids are 1-based, so the 0-based index is shifted by one
    rated_by_user = rate_matrix[:, 0] == user_id + 1
    user_rows = rate_matrix[rated_by_user]
    # shift the item ids down so they can be used directly as array indices
    return (user_rows[:, 1] - 1, user_rows[:, 2])

In [10]:
from sklearn.linear_model import Ridge
from sklearn import linear_model

# Fit one Ridge regression per user on the TF-IDF features of the items
# that user rated in the training set. Column n of W holds user n's
# coefficients and b[0, n] holds the matching intercept.
d = tfidf.shape[1] # data dimension
W = np.zeros((d, n_users))
b = np.zeros((1, n_users))

for n in range(n_users):
    ids, scores = get_items_rated_by_user(rate_train, n)
    if len(ids) == 0:
        # Ridge cannot be fitted on zero samples; a user without training
        # ratings keeps the all-zero weights and intercept set above.
        continue

    clf = Ridge(alpha=0.01, fit_intercept=True)
    Xhat = tfidf[ids, :]
    if n < 1:
        # sanity check: show the first user's feature rows and ratings
        print(Xhat, '\n', scores)

    clf.fit(Xhat, scores)
    W[:, n] = clf.coef_
    b[0, n] = clf.intercept_


[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.53676706 0.65097024 ... 0.53676706 0.         0.        ]
 [0.         0.         0.         ... 1.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.56572542 0.         0.        ]
 [0.         0.40021819 0.48536908 ... 0.         0.5731727  0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]] 
 [5 3 4 3 3 5 4 1 5 3 2 5 5 5 5 5 3 4 5 1 4 4 3 4 3 2 4 1 3 3 5 2 1 2 2 3 4
 3 2 5 4 5 5 4 4 5 3 5 4 4 3 3 5 4 5 4 5 5 3 2 5 4 4 3 4 3 3 3 4 3 1 4 4 4
 1 4 4 5 5 3 4 3 5 5 4 5 4 5 3 5 2 4 5 3 4 3 5 2 2 1 1 2 4 4 5 5 1 5 1 5 5
 5 3 3 5 1 4 3 4 5 3 2 5 4 5 3 1 4 4 4 4 3 5 1 3 1 3 2 1 4 2 4 3 2 2 5 4 5
 3 5 4 4 3 3 4 4 4 3 5 5 2 5 5 5 5 5 5 5 5 5 5 3 3 5 4 5 4 4 4 4 3 5 5 4 4
 4 5 5 5 5 4 3 3 4 5 3 4 5 5 4 4 3 4 2 4 3 5 3 3 1 3 5 4 5 5 2 3 4 5 4 4 1
 3 2 4 5 4 2 4 4 3 4 5 1 2 2 5 1 4 4 4 4 2 5 1 2 4 4 5 1 1 1 3 1 2 1 4 5 5
 5 2 3]


In [None]:
# Show the second test rating (0-based item index, score) of the user with
# 0-based index 10. The data is fetched explicitly under separate names so
# the cell does not depend on whatever `ids`/`scores` were left behind by
# the most recently executed cell.
peek_ids, peek_scores = get_items_rated_by_user(rate_test, 10)
print(peek_ids[1], peek_scores[1])

109 3


In [None]:
# Predicted rating for every (item, user) pair: rows follow the rows of
# tfidf (items), columns follow the columns of W (users). b has a single
# row, so it is broadcast over all item rows.
Yhat = np.dot(tfidf, W) + b

In [None]:
# Compare true and predicted test ratings for one user (0-based index n).
n = 10
np.set_printoptions(precision=2) # 2 digits after .
ids, scores = get_items_rated_by_user(rate_test, n)
# Yhat is indexed [item, user], so this user's predictions are Yhat[ids, n].
print('Rated movies ids :', ids )
print('True ratings     :', scores)
print('Predicted ratings:', Yhat[ids, n])

Rated movies ids : [ 37 109 110 226 424 557 722 724 731 739]
True ratings     : [3 3 4 3 4 3 5 3 3 4]
Predicted ratings: [3.18 3.13 3.42 3.09 3.35 5.2  4.01 3.35 3.42 3.72]
