In [1]:
import pandas as pd
import numpy as np

In [2]:
#passing column names for each csv while loading dataset as column names are not given in csv

#loading users data
user_cols = ['user_id','age','gender','occupation','zip code']
users = pd.read_csv('ml-100k/u.user',sep='|',names=user_cols , encoding='latin-1')
print(users.shape)
users.head()

(943, 5)


Unnamed: 0,user_id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
#loading movie(item) rating from user
ratings_cols = ['user id','item id','rating','timestamp']
ratings = pd.read_csv('ml-100k/u.data',sep='\t',names=ratings_cols, encoding='latin-1')
print(ratings.shape)
ratings.head()

(100000, 4)


Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
#loading items
item_cols = ['movie id', 'movie title', 'release date', 'video release date',
              'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
              'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
              'Thriller', 'War', 'Western']
items = pd.read_csv('ml-100k/u.item',sep='|',names=item_cols, encoding='latin-1')
print(items.shape)
items.head()

(1682, 24)


Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
#loading train and test data of rating which already divided by GroupLens

rating_train_data = pd.read_csv('ml-100k/ua.base',sep='\t',names=ratings_cols, encoding='latin-1')

#test data has 10 rating from each user
rating_test_data = pd.read_csv('ml-100k/ua.test',sep='\t',names=ratings_cols, encoding='latin-1')

rating_train_data.shape, rating_test_data.shape

((90570, 4), (9430, 4))

In [6]:
# applying algorithms to dataset


# 1. collaborative filtering model
#storing no. of unique users who has given ratings and no. of unique items
n_users_unique = ratings['user id'].unique()
n_users = n_users_unique.shape[0]

n_items_unique = ratings['item id'].unique()
n_items = n_items_unique.shape[0]

n_users, n_items

(943, 1682)

In [29]:
#creating user-item matrix to calculate similarity between them
data_matrix = np.zeros((n_users, n_items))

# itertuples is a method to traverse dataframe rows using index
for rating_row in ratings.itertuples() :
#                user id           item id           rating 
    data_matrix[rating_row[1]-1, rating_row[2]-1] = rating_row[3]
data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [25]:
# calculating cosine similarity
from sklearn.metrics.pairwise import pairwise_distances

# user-user similarity
user_similarity = pairwise_distances(data_matrix, metric='cosine')
print(user_similarity.shape)
print('-----------------------------------------------------------------------')
# item-item similarity
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')
print(item_similarity.shape)

(943, 943)
-----------------------------------------------------------------------
(1682, 1682)


In [41]:
# making prediction algorithm
def predict(ratings, similarity, type='user') :
    pred = None
#   type == user i.e. we are taking user-user similarity into consideration
    if type == 'user' :
#       axis = 1 i.e. mean on columns per row
        mean_user_rating = ratings.mean(axis=1)
#       we use np.newaxis so that mean_user_rating has same format as ratings
#       np.newaxis is use to add extra dimension
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item' :
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [45]:
# making prediction
user_prediction = predict(data_matrix, user_similarity)

# creating matrix  of recommendation threshold = 2.3
item_recommended = np.zeros((n_users,n_items))
for i in user_prediction :
    for j in i :
        if j > 2.3 :
            item_recommended = 1

item_recommended

1

In [44]:
item_prediction = predict(data_matrix, item_similarity,type='item')

item_prediction.shape
# for i in item_prediction :
#     print(i.max())

(943, 1682)