In [1]:
# The data is using movie lens here, from 
## F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. 
## ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19 pages. 
## DOI=http://dx.doi.org/10.1145/2827872

import pandas as pd
import numpy as np

In [2]:
root_folder = 'ml-100k/'
rating_data = root_folder + 'u.data'
user_data = root_folder + 'u.user'
item_data = root_folder + 'u.item'

rating_train = root_folder + 'ua.base'
rating_test = root_folder + 'ua.test'

In [13]:
# The original data do not have column names in the csv file, you need to check their ReadMe, and add column names
## the data here can contain duplicated userid or itemid
rating_data_cols = ['userid', 'itemid', 'rating', 'timestamp']
rating_data_df = pd.read_csv(rating_data, sep='\t', names = rating_data_cols, encoding='latin-1')
print rating_data_df.shape
rating_data_df.head()

(100000, 4)


Unnamed: 0,userid,itemid,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [14]:
# user profile, but we don't really need this piece of data here for collaborative filtering
user_data_cols = ['userid', 'age', 'gender', 'occupation', 'zip_code']
user_data_df = pd.read_csv(user_data, sep='|', names = user_data_cols, encoding='latin-1')
print user_data_df.shape
user_data_df.head()

(943, 5)


Unnamed: 0,userid,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [15]:
# movie data, but we don't really need this piece of data here for collaborative filtering
item_data_cols = ['movieid', 'movie_title ', 'release_date', 'video_release_date', 
                  'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children', 
                  'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film_Noir', 'Horror',
                  'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
item_data_df = pd.read_csv(item_data, sep='|', names = item_data_cols, encoding='latin-1')
print item_data_df.shape
item_data_df.head()

(1682, 24)


Unnamed: 0,movieid,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# Method 1 - DIY collaborative filtering

user_ct = rating_data_df['userid'].unique().shape[0]
item_ct = rating_data_df['itemid'].unique().shape[0]
print(user_ct, item_ct)

(943, 1682)


In [9]:
for line in rating_data_df.itertuples():
    print line
    print line[1]
    break

Pandas(Index=0, userid=196, itemid=242, rating=3, timestamp=881250949)
196


In [10]:
data_matrix = np.zeros((user_ct, item_ct))

for r in rating_data_df.itertuples():
    data_matrix[r[1]-1, r[2]-1] = r[3]
    
data_matrix

array([[ 5.,  3.,  4., ...,  0.,  0.,  0.],
       [ 4.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 5.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  5.,  0., ...,  0.,  0.,  0.]])

In [11]:
# calculate similarity
from sklearn.metrics.pairwise import pairwise_distances 

user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

user_similarity

array([[ 0.        ,  0.83306902,  0.95254046, ...,  0.85138306,
         0.82049212,  0.60182526],
       [ 0.83306902,  0.        ,  0.88940868, ...,  0.83851522,
         0.82773219,  0.89420212],
       [ 0.95254046,  0.88940868,  0.        , ...,  0.89875744,
         0.86658385,  0.97344413],
       ..., 
       [ 0.85138306,  0.83851522,  0.89875744, ...,  0.        ,
         0.8983582 ,  0.90488042],
       [ 0.82049212,  0.82773219,  0.86658385, ...,  0.8983582 ,
         0.        ,  0.81753534],
       [ 0.60182526,  0.89420212,  0.97344413, ...,  0.90488042,
         0.81753534,  0.        ]])

In [12]:
print(data_matrix.shape)
print(user_similarity.shape, item_similarity.shape)

(943, 1682)
((943, 943), (1682, 1682))


In [14]:
# user-user recommendation: predict the score that each item can be recommended to each user
rating_mean = data_matrix.mean(axis=1)
ratings_diff = data_matrix - rating_mean[:, np.newaxis]
user_recommendation = rating_mean[:, np.newaxis] \
            + user_similarity.dot(ratings_diff)/np.array([np.abs(user_similarity).sum(axis=1)]).T
print(user_recommendation.shape)
user_recommendation

(943, 1682)


array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ..., 
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [15]:
# item-item prediction: predict the score that each item can be recommended to each user
item_recommendation = data_matrix.dot(item_similarity)/np.array([np.abs(item_similarity).sum(axis=1)])
print(item_recommendation.shape)
item_recommendation

(943, 1682)


array([[ 0.44627765,  0.475473  ,  0.50593755, ...,  0.58815455,
         0.5731069 ,  0.56669645],
       [ 0.10854432,  0.13295661,  0.12558851, ...,  0.13445801,
         0.13657587,  0.13711081],
       [ 0.08568497,  0.09169006,  0.08764343, ...,  0.08465892,
         0.08976784,  0.09084451],
       ..., 
       [ 0.03230047,  0.0450241 ,  0.04292449, ...,  0.05302764,
         0.0519099 ,  0.05228033],
       [ 0.15777917,  0.17409459,  0.18900003, ...,  0.19979296,
         0.19739388,  0.20003117],
       [ 0.24767207,  0.24489212,  0.28263031, ...,  0.34410424,
         0.33051406,  0.33102478]])

In [16]:
# Method 2 - using turicreate collaborative filtering
import turicreate

ua_cols = ['userid', 'movieid', 'rating', 'unix_timestamp']
ratings_train = pd.read_csv(rating_train, sep='\t', names=ua_cols, encoding='latin-1')
ratings_test = pd.read_csv(rating_test, sep='\t', names=ua_cols, encoding='latin-1')
print ratings_train.shape, ratings_test.shape

(90570, 4) (9430, 4)


In [18]:
train_data = turicreate.SFrame(ratings_train)
test_data = turicreate.SFrame(ratings_test)
train_data.head()

userid,movieid,rating,unix_timestamp
1,1,5,874965758
1,2,3,876893171
1,3,4,878542960
1,4,3,876893119
1,5,3,889751712
1,6,5,887431973
1,7,4,875071561
1,8,1,875072484
1,9,5,878543541
1,10,3,875693118


In [20]:
## Turicreate - recommend most popular items (in fact this will recommend the same thing to everyone)
popularity_model = turicreate.popularity_recommender\
                   .create(train_data, user_id='userid', item_id='movieid', target='rating')

In [24]:
# see, everyone gets the same recommendation
popularity_recomm = popularity_model.recommend(users=[4,10,7,9],k=3)
popularity_recomm.print_rows(num_rows=12)

+--------+---------+-------+------+
| userid | movieid | score | rank |
+--------+---------+-------+------+
|   4    |   1189  |  5.0  |  1   |
|   4    |   1122  |  5.0  |  2   |
|   4    |   814   |  5.0  |  3   |
|   10   |   1189  |  5.0  |  1   |
|   10   |   1122  |  5.0  |  2   |
|   10   |   814   |  5.0  |  3   |
|   7    |   1189  |  5.0  |  1   |
|   7    |   1122  |  5.0  |  2   |
|   7    |   814   |  5.0  |  3   |
|   9    |   1189  |  5.0  |  1   |
|   9    |   1122  |  5.0  |  2   |
|   9    |   814   |  5.0  |  3   |
+--------+---------+-------+------+
[12 rows x 4 columns]



In [26]:
## Turicreate - collaborative filtering
#Training the model
item_sim_model = turicreate.item_similarity_recommender\
            .create(train_data, user_id='userid', item_id='movieid', target='rating', similarity_type='cosine')

In [27]:
# Making recommendations
item_sim_recomm = item_sim_model.recommend(users=[4,10,7,9],k=3)
item_sim_recomm.print_rows(num_rows=12)

+--------+---------+----------------+------+
| userid | movieid |     score      | rank |
+--------+---------+----------------+------+
|   4    |    50   | 1.13114770821  |  1   |
|   4    |   288   | 1.04871511459  |  2   |
|   4    |    56   | 0.996869802475 |  3   |
|   10   |   204   | 1.30251427115  |  1   |
|   10   |   423   | 1.22470301081  |  2   |
|   10   |   172   | 1.20107427445  |  3   |
|   7    |    88   | 0.48335224936  |  1   |
|   7    |    95   | 0.470542855845 |  2   |
|   7    |   209   | 0.422433135012 |  3   |
|   9    |   172   | 1.51975569129  |  1   |
|   9    |   204   | 1.46488795678  |  2   |
|   9    |   174   |  1.4442871958  |  3   |
+--------+---------+----------------+------+
[12 rows x 4 columns]

