In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Project root, relative to this notebook; the input files are read from
# base_path + '//data//ratings.csv', '//data//users.csv' and '//data//movies.csv'.
base_path = ".."

In [3]:
# Load the three source tables. Despite the .csv extension the files are
# tab-separated and latin-1 encoded, hence the explicit sep/encoding arguments.

# Ratings: only the id and rating columns are kept (the timestamp is ignored).
ratings = pd.read_csv(
    base_path + '//data//ratings.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'movie_id', 'rating'],
)

# Users: id plus the demographic description columns.
users = pd.read_csv(
    base_path + '//data//users.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['user_id', 'gender', 'zipcode', 'age_desc', 'occ_desc'],
)

# Movies: id, title and genres.
movies = pd.read_csv(
    base_path + '//data//movies.csv',
    sep='\t',
    encoding='latin-1',
    usecols=['movie_id', 'title', 'genres'],
)

In [31]:
# Reshape the long ratings table into a wide grid: one row per movie_id,
# one column per user_id, cell = rating (NaN where no rating exists).
final_dataset = ratings.pivot(
    index='movie_id',
    columns='user_id',
    values='rating',
)
# Show the first ten movies as the cell output.
final_dataset.head(10)

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
6,,,,,2.0,,4.0,,,,...,,,,,,3.0,,,,
7,,,,,,,,,,4.0,...,,,,,3.0,,,,,
8,,,,,,,,,,,...,5.0,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Randomly sample 2% of the ratings dataset (frac=0.02). random_state pins the
# draw so that Restart & Run All reproduces the same sample.
small_data = ratings.sample(frac=0.02, random_state=42)
# Check the sample info. DataFrame.info() prints its report itself and returns
# None, so wrapping it in print() only appended a stray "None" to the output.
small_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20004 entries, 19248 to 267671
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   user_id   20004 non-null  int64
 1   movie_id  20004 non-null  int64
 2   rating    20004 non-null  int64
dtypes: int64(3)
memory usage: 625.1 KB
None


In [12]:
# Hold out 20% of the sampled ratings for testing. random_state fixes the
# shuffle so the same train/test partition is produced on every run.
train_data, test_data = train_test_split(small_data, test_size=0.2, random_state=42)

In [16]:
# Plain NumPy views of the two splits.
# NOTE(review): these stay in long format -- one row per rating, taken straight
# from the train/test frames -- they are not reshaped into a user x item grid.
train_data_matrix, test_data_matrix = (
    frame.values for frame in (train_data, test_data)
)

In [35]:


# "User" similarity matrix: 1 - correlation distance between every pair of
# rows of train_data.
# NOTE(review): each row of train_data is a single rating record drawn from
# the ratings table (user_id, movie_id, rating), so this compares rating
# records rather than per-user rating vectors -- confirm that is intended.
user_correlation = 1 - pairwise_distances(train_data, metric='correlation')
# Undefined (NaN) correlations are treated as "no similarity".
user_correlation = np.where(np.isnan(user_correlation), 0, user_correlation)
# Peek at the top-left 4x4 corner.
print(user_correlation[:4, :4])

[[ 1.         -0.03929737  0.38431898  0.68122303]
 [-0.03929737  1.          0.90738453  0.70474019]
 [ 0.38431898  0.90738453  1.          0.93765972]
 [ 0.68122303  0.70474019  0.93765972  1.        ]]


In [27]:
# "Item" similarity matrix: 1 - correlation distance between every pair of
# columns of train_data_matrix (rows of its transpose).
# NOTE(review): train_data_matrix is train_data.values, so its columns are the
# frame's own columns and the result has one row/column per frame column, not
# per movie -- confirm that is intended.
item_correlation = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')
# Undefined (NaN) correlations are treated as "no similarity".
item_correlation = np.where(np.isnan(item_correlation), 0, item_correlation)
# Peek at (up to) the top-left 4x4 corner.
print(item_correlation[:4, :4])

[[ 1.         -0.02583894  0.00993486]
 [-0.02583894  1.         -0.0696622 ]
 [ 0.00993486 -0.0696622   1.        ]]


In [39]:
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Parameters
    ----------
    ratings : numpy.ndarray, 2-D
        Rating matrix.
    similarity : numpy.ndarray, 2-D, square
        Similarity weights. With ``type='user'`` it is left-multiplied onto
        the mean-centred ratings, so it needs one row/column per row of
        ``ratings``. With ``type='item'`` ``ratings`` is right-multiplied by
        it, so it needs one row/column per column of ``ratings``.
    type : {'user', 'item'}, default 'user'
        ``'user'``: each row's mean rating plus the similarity-weighted
        average of the other rows' deviations from their own means.
        ``'item'``: similarity-weighted average of the raw ratings.

    Returns
    -------
    numpy.ndarray
        Predicted ratings, same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'`` (previously this
        surfaced as an UnboundLocalError at the ``return``).
    """
    if type not in ('user', 'item'):
        raise ValueError(f"type must be 'user' or 'item', got {type!r}")

    # Normaliser: total absolute similarity of each row of `similarity`.
    weight_totals = np.abs(similarity).sum(axis=1)
    # A row with no similarity to anything sums to 0 and would turn the
    # prediction into 0/0 = NaN. Dividing by 1 instead leaves the weighted
    # term as it is; that term is 0 for the user branch, and for the item
    # branch whenever `similarity` is symmetric.
    weight_totals = np.where(weight_totals == 0, 1, weight_totals)

    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        # Use np.newaxis so that mean_user_rating broadcasts across columns.
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weighted_diff = similarity.dot(ratings_diff) / weight_totals[:, np.newaxis]
        return mean_user_rating[:, np.newaxis] + weighted_diff

    # type == 'item': normaliser is applied per column of the product.
    return ratings.dot(similarity) / weight_totals[np.newaxis, :]