In [None]:
# importing libraries
# !pip install turicreate
import os

import numpy as np
import pandas as pd
# turicreate is used by the recommender cells further down, so it must be
# imported here for the notebook to survive Restart Kernel -> Run All.
import turicreate
from sklearn.metrics.pairwise import pairwise_distances

In [None]:
# Extract the MovieLens 100k archive. `-n` never overwrites existing files, so
# re-running this cell does not stop at unzip's interactive "replace?" prompt.
!unzip -n "/content/ml-100k.zip"

Archive:  /content/ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [None]:

# The MovieLens files carry no header row, so the column names (listed in the
# dataset's README) are passed explicitly to pandas when reading each file.

# users file
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

# ratings file
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

# items file: descriptive columns first, then one column per genre
i_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

# Inspect each table in turn: heading, shape, then the first rows.
#   users   -> 943 users with 5 features (user_id, age, sex, occupation, zip_code)
#   ratings -> 100k ratings for different user and movie combinations
#   items   -> one row per movie
for heading, frame in (
    ("\nUser Data :", users),
    ("\nRatings Data :", ratings),
    ("\nItem Data :", items),
):
    print(heading)
    print("shape : ", frame.shape)
    print(frame.head())


User Data :
shape :  (943, 5)
   user_id  age sex  occupation zip_code
0        1   24   M  technician    85711
1        2   53   F       other    94043
2        3   23   M      writer    32067
3        4   24   M  technician    43537
4        5   33   F       other    15213

Ratings Data :
shape :  (100000, 4)
   user_id  movie_id  rating  unix_timestamp
0      196       242       3       881250949
1      186       302       3       891717742
2       22       377       1       878887116
3      244        51       2       880606923
4      166       346       1       886397596

Item Data :
shape :  (1682, 24)
   movie id        movie title release date  ...  Thriller War  Western
0         1   Toy Story (1995)  01-Jan-1995  ...         0   0        0
1         2   GoldenEye (1995)  01-Jan-1995  ...         1   0        0
2         3  Four Rooms (1995)  01-Jan-1995  ...         1   0        0
3         4  Get Shorty (1995)  01-Jan-1995  ...         0   0        0
4         5     Copycat

In [None]:
# Read the "ua" base/test ratings files, which use the same four columns as u.data.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_train, ratings_test = (
    pd.read_csv(split_path, sep='\t', names=r_cols, encoding='latin-1')
    for split_path in ('ml-100k/ua.base', 'ml-100k/ua.test')
)
# Show both shapes as the cell output.
ratings_train.shape, ratings_test.shape

((90570, 4), (9430, 4))

In [None]:
# We will recommend movies based on user-user similarity and item-item similarity.
# First count the distinct users and movies; these counts size the user-item matrix.
n_users = ratings.user_id.nunique()
n_items = ratings.movie_id.nunique()

# Dense user-item matrix: row = user, column = movie, value = rating.
# Ids are used as 1-based positions (id - 1), and pairs without a rating stay 0.
# The whole table is written in one vectorised assignment instead of a Python
# loop over rows.
data_matrix = np.zeros((n_users, n_items))
data_matrix[
    ratings.user_id.to_numpy() - 1,
    ratings.movie_id.to_numpy() - 1,
] = ratings.rating.to_numpy()

# pairwise_distances(metric='cosine') returns the cosine *distance*, i.e.
# 1 - cosine similarity. Using the distance directly as a similarity weight
# would give the most weight to the least similar users/items, so convert it
# back to a similarity (1.0 = same direction, 0.0 = nothing in common).
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')
#This gives us the item-item and user-user similarity in an array form. 


In [None]:
def predict(ratings, similarity, type='user'):
    """Predict a full user-item rating matrix from a similarity matrix.

    Parameters
    ----------
    ratings : array-like, shape (n_users, n_items)
        User-item rating matrix.
    similarity : array-like
        Square similarity matrix: (n_users, n_users) for ``type='user'``,
        (n_items, n_items) for ``type='item'``. The item branch normalises by
        row sums, which matches the column sums only for a symmetric matrix.
    type : {'user', 'item'}, default 'user'
        ``'user'``: each user's mean rating plus the similarity-weighted average
        of the other users' deviations from their own means.
        ``'item'``: similarity-weighted average of the user's own ratings.

    Returns
    -------
    numpy.ndarray, shape (n_users, n_items)
        Predicted ratings.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.
    """
    ratings = np.asarray(ratings, dtype=float)
    similarity = np.asarray(similarity, dtype=float)

    # Total absolute weight per row of the similarity matrix. A row with no
    # weight at all would cause a 0/0 division, so its divisor is set to 1;
    # the weighted sum for such a row is zero for a symmetric similarity matrix.
    weight_totals = np.abs(similarity).sum(axis=1)
    weight_totals = np.where(weight_totals == 0, 1.0, weight_totals)

    if type == 'user':
        # keepdims keeps the means as a column so they broadcast against ratings.
        mean_user_rating = ratings.mean(axis=1, keepdims=True)
        ratings_diff = ratings - mean_user_rating
        return mean_user_rating + similarity.dot(ratings_diff) / weight_totals[:, np.newaxis]
    if type == 'item':
        return ratings.dot(similarity) / weight_totals[np.newaxis, :]
    raise ValueError(f"type must be 'user' or 'item', got {type!r}")

In [None]:
# Predict the full rating matrix twice: from user-user and from item-item similarity.
user_prediction, item_prediction = (
    predict(data_matrix, similarity_matrix, type=mode)
    for similarity_matrix, mode in (
        (user_similarity, 'user'),
        (item_similarity, 'item'),
    )
)

In [None]:

# Convert the pandas train/test ratings into turicreate SFrames.
train_data, test_data = map(turicreate.SFrame, (ratings_train, ratings_test))

In [None]:
# Baseline model: recommend movies based on the most popular choices, i.e. every
# user receives the same recommendation(s). Built with turicreate's
# popularity_recommender on the training ratings.
popularity_model = turicreate.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='movie_id',
    target='rating',
)

In [None]:
# Ask the popularity model for the top 20 items for user 1.
popularity_recomm = popularity_model.recommend(
    users=[1],
    k=20,
)
# Print up to 25 rows, enough to show every recommendation returned.
popularity_recomm.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   1500   |        5.0         |  1   |
|    1    |   1467   |        5.0         |  2   |
|    1    |   1122   |        5.0         |  3   |
|    1    |   1656   |        5.0         |  4   |
|    1    |   1599   |        5.0         |  5   |
|    1    |   1201   |        5.0         |  6   |
|    1    |   1536   |        5.0         |  7   |
|    1    |   814    |        5.0         |  8   |
|    1    |   1189   |        5.0         |  9   |
|    1    |   1293   |        5.0         |  10  |
|    1    |   1449   | 4.714285714285714  |  11  |
|    1    |   1642   |        4.5         |  12  |
|    1    |   1463   |        4.5         |  13  |
|    1    |   1398   |        4.5         |  14  |
|    1    |   1594   |        4.5         |  15  |
|    1    |   408    | 4.480769230769231  |  16  |
|    1    |   318    | 4.475836

In [None]:
# After the popularity baseline, build a collaborative filtering model:
# an item-similarity recommender using cosine similarity, trained on the
# training ratings.
item_sim_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='movie_id',
    target='rating',
    similarity_type='cosine',
)

In [None]:
# Top 9 item-similarity recommendations for user 4.
item_sim_recomm = item_sim_model.recommend(
    users=[4],
    k=9,
)
# Print up to 25 rows, enough to show every recommendation returned.
item_sim_recomm.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    4    |    50    | 1.1311477082116264 |  1   |
|    4    |   288    | 1.0487151145935059 |  2   |
|    4    |   181    | 0.9505999386310577 |  3   |
|    4    |   302    | 0.9139021464756557 |  4   |
|    4    |   121    | 0.8993381602423531 |  5   |
|    4    |   100    | 0.8955540614468711 |  6   |
|    4    |    56    | 0.8828046492167881 |  7   |
|    4    |   333    | 0.8780901687485831 |  8   |
|    4    |   313    | 0.8764417426926749 |  9   |
+---------+----------+--------------------+------+
[9 rows x 4 columns]



In [None]:
# NOTE: this rebinds `users`, which earlier held the MovieLens user table, to the
# contents of '/content/users (1).csv'.
users = pd.read_csv(filepath_or_buffer='/content/users (1).csv')

In [None]:
# Collect every value of the `_id` column into a plain list.
# Building the list directly (instead of appending to a pre-existing `Users`)
# means the cell works on a fresh kernel and gives the same list when re-run.
Users = users['_id'].tolist()


In [None]:
# Keep only the ratings whose user id is at most 20 and movie id at most 100.
MAX_USER_ID = 20
MAX_MOVIE_ID = 100
ratings_subset = ratings_train[
    (ratings_train.user_id <= MAX_USER_ID) & (ratings_train.movie_id <= MAX_MOVIE_ID)
]

# Positional id mappings: the k-th entry of `data` replaces movie id k and the
# k-th entry of `Users` replaces user id k (both 1-based). Ids without an entry
# are left unchanged.
# NOTE(review): `data` is not defined in any visible cell -- presumably the list
# of external product ids; confirm and define it before this cell.
movie_id_map = dict(enumerate(data, start=1))
user_id_map = dict(enumerate(Users, start=1))

# Replace through the mappings and assign the result to new columns. Calling
# `frame[col].replace(..., inplace=True)` on a filtered frame is chained in-place
# assignment: it emits warnings and, with pandas Copy-on-Write, leaves the frame
# unchanged.
ratings_train = ratings_subset.assign(
    movie_id=ratings_subset['movie_id'].replace(movie_id_map),
    user_id=ratings_subset['user_id'].replace(user_id_map),
).rename(columns={'movie_id': 'product_id'})

In [None]:
# Give the id columns the names used by the recommender cells further down
# (user_id='user', item_id='product'). The item column is named 'product_id' at
# this point; rename() silently ignores keys that match no column, so the key
# must be spelled exactly.
ratings_train = ratings_train.rename(columns={'user_id': 'user', 'product_id': 'product'})

In [None]:
# Preview the remapped training ratings.
# NOTE(review): the original call was `ratings.drop()`, which always raises because
# no labels are given; a preview of `ratings_train` is assumed to be the intent,
# based on the columns in the recorded output -- confirm.
ratings_train.head()

Unnamed: 0,user,product_id,rating,unix_timestamp
0,619fba11b41bb8a047a1caab,61ae0f4165c46dc923ff2271,5,874965758
1,619fba11b41bb8a047a1caab,61ae0f4165c46dc923ff2272,3,876893171
2,619fba11b41bb8a047a1caab,61ae0f4165c46dc923ff2273,4,878542960
3,619fba11b41bb8a047a1caab,61ae0f4165c46dc923ff2274,3,876893119
4,619fba11b41bb8a047a1caab,61ae0f4165c46dc923ff2275,3,889751712
...,...,...,...,...
2816,61bf8e09e04aa7bf98fd69e5,61ae0f4165c46dc923ff22c2,4,879669697
2817,61bf8e09e04aa7bf98fd69e5,61ae0f4165c46dc923ff22c7,5,879669746
2818,61bf8e09e04aa7bf98fd69e5,61ae0f4165c46dc923ff22ce,2,879669954
2819,61bf8e09e04aa7bf98fd69e5,61b624c134fe96ee02d85a53,3,879669181


In [None]:
# Write the remapped ratings to CSV without the index column.
ratings_train.to_csv(path_or_buf='/content/Recommandation.csv', index=False)

In [None]:
# Reload the ratings from disk.
# NOTE(review): the original path was the truncated string "/conte", which cannot
# resolve to a file; it is assumed to be the CSV written by the previous cell --
# confirm the intended file.
ratings_train = pd.read_csv('/content/Recommandation.csv')

In [None]:
# Rebuild the popularity baseline (every user receives the same recommendation(s))
# on the reloaded ratings, whose id columns are named 'user' and 'product'.
train_data = turicreate.SFrame(ratings_train)
popularity_model = turicreate.popularity_recommender.create(
    train_data,
    user_id='user',
    item_id='product',
    target='rating',
)

In [None]:
# Top 4 popularity-based recommendations for each of three user ids.
popularity_recomm = popularity_model.recommend(
    users=[
        '61ba7e4de30e004b037482i9',
        '61ba7d6ee30e004b03748269',
        '61ba7d99e30e004b03748277',
    ],
    k=4,
)
# Print up to 200 rows, enough to show every recommendation returned.
popularity_recomm.print_rows(num_rows=200)

+--------------------------+--------------------------+-------+------+
|           user           |         product          | score | rank |
+--------------------------+--------------------------+-------+------+
| 61ba7e4de30e004b037482i9 | 61ae0f4165c46dc923ff22a7 |  5.0  |  1   |
| 61ba7e4de30e004b037482i9 | 61ae0f4165c46dc923ff229c |  5.0  |  2   |
| 61ba7e4de30e004b037482i9 | 61ae0f4165c46dc923ff2280 |  5.0  |  3   |
| 61ba7e4de30e004b037482i9 | 61ae0f4165c46dc923ff2276 |  5.0  |  4   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff22a7 |  5.0  |  1   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff229c |  5.0  |  2   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff2280 |  5.0  |  3   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff2276 |  5.0  |  4   |
| 61ba7d99e30e004b03748277 | 61ae0f4165c46dc923ff22a7 |  5.0  |  1   |
| 61ba7d99e30e004b03748277 | 61ae0f4165c46dc923ff229c |  5.0  |  2   |
| 61ba7d99e30e004b03748277 | 61ae0f4165c46dc923ff2280 |  5.0  |  3   |
| 61ba

In [None]:
# After the popularity baseline, train the collaborative filtering model on the
# remapped ratings: an item-similarity recommender using cosine similarity.
item_sim_model = turicreate.item_similarity_recommender.create(
    train_data,
    user_id='user',
    item_id='product',
    target='rating',
    similarity_type='cosine',
)

In [None]:
# Top 4 item-similarity recommendations for a single user id.
x = '61ba7d6ee30e004b03748269'
item_sim_recomm = item_sim_model.recommend(
    users=[x],
    k=4,
)
# Print up to 25 rows, enough to show every recommendation returned.
item_sim_recomm.print_rows(num_rows=25)

+--------------------------+--------------------------+--------------------+------+
|           user           |         product          |       score        | rank |
+--------------------------+--------------------------+--------------------+------+
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff22a8 | 2.8224687576293945 |  1   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff2297 | 2.7142856121063232 |  2   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff22b5 | 2.6945323944091797 |  3   |
| 61ba7d6ee30e004b03748269 | 61ae0f4165c46dc923ff228f | 2.654017686843872  |  4   |
+--------------------------+--------------------------+--------------------+------+
[4 rows x 4 columns]

