In [1]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')

# Loading and Exploring Data 

In [2]:
ratings= pd.read_csv('../data/ml-100k/u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movies= pd.read_csv('../data/ml-100k/u.item', sep='|', encoding='latin-1', names=['movie_id', 'title'], usecols=[0, 1])
movies.head()

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
print(ratings.shape) 
print(movies.shape)

(100000, 4)
(1682, 2)


# User Based Filtering 

## Build User Item Matrix 

In [5]:
# Using pivot to get users as rows so we can compare them,we need to replace Nans with zero[ no rating] becasue cosine_similarity can't do math with NaN
user_item_matrix=ratings.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
user_item_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Compute User Similarity ( has values between zero[not similar] and one[identical])
similar_users= cosine_similarity(user_item_matrix)

#Convert the similarity matrix into a DataFrame 
similar_users_df= pd.DataFrame(
            similar_users, 
    index= user_item_matrix.index,
    columns= user_item_matrix.index
)

similar_users_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


# Generate Recommendations 

In [9]:
def get_similar_users(user_id, similar_users_df): 
    top_ten_similar= similar_users_df.loc[user_id].sort_values(ascending= False)[1:11]
    return top_ten_similar

In [14]:
def recommend_movies(user_id, num_recommendations, users_df= similar_users_df, ratings_df= ratings, movies_df= movies):
    top_ten_similar= get_similar_users(user_id, users_df)

    # Filter ratings from top ten similar users 
    filtered_ratings= ratings_df[ratings_df['user_id'].isin(top_ten_similar.index)].copy() 

    #Add Similarity Scores 
    filtered_ratings['similarity']= filtered_ratings['user_id'].map(top_ten_similar)
    #Calculate Weighted Ratings [We multiply rating by similarity because we want to trust the opinions of people similar to us ]
    filtered_ratings['weighted_ratings']= filtered_ratings['rating'] * filtered_ratings['similarity']

    #Aggregate by Movie
    movie_groups= filtered_ratings.groupby('movie_id').agg(
        {
            'weighted_ratings':'sum',
            'similarity':'sum'
        }
    )
    
    #We divide the sum of weighted ratings by the sum of similarities to get a rating that accounts for how similar each person is to us
    movie_groups['predicted_ratings']= movie_groups['weighted_ratings'] / movie_groups['similarity']

    #Get User's Already-Watched Movies [finds all the movies that the user has already watched/rated, so we don't recommend them again]
    users_watched_movies= ratings_df[ratings_df['user_id']== user_id]['movie_id']

    # print(filtered_ratings)


   

In [15]:
recommend_movies(1, 5)

       user_id  movie_id  rating  timestamp  similarity  weighted_ratings
15         303       785       3  879485318    0.525718          1.577153
38         276       796       1  874791932    0.524523          0.524523
43         276       564       3  874791805    0.524523          1.573568
72          92      1049       1  890251826    0.540534          0.540534
83         276        54       3  874791025    0.524523          1.573568
...        ...       ...     ...        ...         ...               ...
99839      303       363       1  879485134    0.525718          0.525718
99843      916       148       2  880843892    0.569066          1.138131
99963      429       199       5  882386006    0.525950          2.629750
99980      864       685       4  888891900    0.547548          2.190193
99997      276      1090       1  874795795    0.524523          0.524523

[3547 rows x 6 columns]
