# Collaborative Filtering Movie Recommender System

## Import Relevant Libraries

In [3]:
import numpy as np 
import pandas as pd
# for calculating Pearson correlation
from scipy.stats import pearsonr

## Load Datasets

In [5]:
rating_df = pd.read_csv("..\\data\\ratings_data.csv")
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [6]:
# The "timestamp" is not required for calculating user similarities in this collaborative filtering model.
rating_df = rating_df.drop(["timestamp"] , axis =1 )
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [7]:
## Movies Data Preprocessing and Cleaning

In [8]:
movies_df = pd.read_csv("..\\data\\movies-data.csv")
movies_df["year"] = movies_df["title"].str.extract("(\(\d\d\d\d\))")
movies_df["year"] = movies_df["year"].str.extract("(\d\d\d\d)")

movies_df["title"] = movies_df["title"].str.replace( "(\(\d\d\d\d\))","" , regex = True)
movies_df["title"] = movies_df["title"].str.strip()

movies_df = movies_df.drop(["genres"] , axis = 1)

movies_df.head()

  movies_df["year"] = movies_df["title"].str.extract("(\(\d\d\d\d\))")
  movies_df["year"] = movies_df["year"].str.extract("(\d\d\d\d)")
  movies_df["title"] = movies_df["title"].str.replace( "(\(\d\d\d\d\))","" , regex = True)


Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


## Defining  User Input

In [10]:
user_input = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
input_df = pd.DataFrame(user_input)
input_df

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


## Add movieId to input user

In [12]:
input_id = movies_df[movies_df["title"].isin(input_df["title"].to_list())]
user_movies = pd.merge( input_id, input_df)
user_movies = user_movies.drop(["year"] ,axis =1 )
user_movies.head()

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


## Identifying Users with Overlapping Movie Ratings
find all users who have rated at least one movie in common with our target user

In [14]:
user_subsets = rating_df[rating_df["movieId"].isin(user_movies["movieId"].to_list())]
user_subsets.head()

Unnamed: 0,userId,movieId,rating
19,4,296,4.0
441,12,1968,3.0
479,13,2,2.0
531,13,1274,5.0
681,14,296,2.0


In [15]:
# Group the 'user_subsets' DataFrame by 'userId'.
user_subsets_group = user_subsets.groupby("userId")
# Example for user (13)
user_subsets_group.get_group(13)

Unnamed: 0,userId,movieId,rating
479,13,2,2.0
531,13,1274,5.0


## Filtering and Prioritizing Similar Users

In [17]:
user_subsets_group = sorted(user_subsets_group , key = lambda x: len(x[1]) , reverse = True)
# Top 100 Users
user_subsets_group = user_subsets_group[:100]
user_subsets_group[:3]

[(75,
        userId  movieId  rating
  7507      75        1     5.0
  7508      75        2     3.5
  7540      75      296     5.0
  7633      75     1274     4.5
  7673      75     1968     5.0),
 (106,
        userId  movieId  rating
  9083     106        1     2.5
  9084     106        2     3.0
  9115     106      296     3.5
  9198     106     1274     3.0
  9238     106     1968     3.5),
 (686,
         userId  movieId  rating
  61336     686        1     4.0
  61337     686        2     3.0
  61377     686      296     4.0
  61478     686     1274     4.0
  61569     686     1968     5.0)]

## Calculating User Similarities with Pearson Correlation

In [19]:
#This section calculates the Pearson correlation coefficient between the target user and each of the identified overlapping users.
similarity_dict = {}
user_movie_ids = user_movies["movieId"].to_list()

for name , group in user_subsets_group:
    merged = group[group["movieId"].isin(user_movie_ids)].merge(
    user_movies, on="movieId" , suffixes=('_other', '_input'))
    user_ratings = merged["rating_input"].tolist()
    other_user_ratings = merged["rating_other"].tolist()

    sim , _ = pearsonr(user_ratings, other_user_ratings)
    similarity_dict[name] = sim
    
#print 10 userId and similarity scores 
count = 0
for key, value in similarity_dict.items():
    if count < 10:
        print(f"{key}: {value}")
        count += 1
    else:
        break

75: 0.827278151694757
106: 0.5860090386731193
686: 0.8320502943378437
815: 0.5765566601970551
1040: 0.9434563530497266
1130: 0.28915746598312014
1502: 0.8770580193070294
1599: 0.4385290096535146
1625: 0.7161148740394331
1950: 0.17902871850985827


## Filtering and Ranking Most Similar Users

In [21]:
# convert dictionary to DataFrame
similarity_df = pd.DataFrame.from_dict(similarity_dict, orient='index')
similarity_df.columns = ["similarity score"]
similarity_df["userId"] = similarity_df.index
similarity_df.index = range(len(similarity_df))
similarity_df.head()

Unnamed: 0,similarity score,userId
0,0.827278,75
1,0.586009,106
2,0.83205,686
3,0.576557,815
4,0.943456,1040


In [22]:
similarity_df = similarity_df.sort_values(by ="similarity score" , ascending = False)
similarity_df.head(10)

Unnamed: 0,similarity score,userId
64,0.961678,12325
55,0.961538,10707
34,0.961538,6207
67,0.960769,13053
4,0.943456,1040
59,0.937614,11769
62,0.929294,12120
80,0.903584,15157
17,0.895144,3040
70,0.895144,13366


## Merging and Calculating Weighted Ratings

In [24]:
top_users_rating = similarity_df.merge(rating_df , left_on = "userId" , right_on = "userId" , how = "inner")
top_users_rating.head()

Unnamed: 0,similarity score,userId,movieId,rating
0,0.961678,12325,1,3.5
1,0.961678,12325,2,1.5
2,0.961678,12325,3,3.0
3,0.961678,12325,5,0.5
4,0.961678,12325,6,2.5


In [25]:
top_users_rating["weighted_rating"] = top_users_rating["similarity score"] * top_users_rating["rating"]
top_users_rating.head()

Unnamed: 0,similarity score,userId,movieId,rating,weighted_rating
0,0.961678,12325,1,3.5,3.365874
1,0.961678,12325,2,1.5,1.442517
2,0.961678,12325,3,3.0,2.885035
3,0.961678,12325,5,0.5,0.480839
4,0.961678,12325,6,2.5,2.404196


### Calculating Sum of Similarity Scores and Sum of Weighted Rating

In [27]:
temp_df = top_users_rating.groupby("movieId").sum()[["similarity score" ,"weighted_rating"]]
temp_df.columns = ["sum_similarity_scores" , "sum_weighted_rating"]
temp_df.head()

Unnamed: 0_level_0,sum_similarity_scores,sum_weighted_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,47.110148,172.862258
2,47.110148,122.131292
3,14.570089,39.216419
4,0.454111,1.399461
5,13.899388,32.820691


## Calculating Final Movie Prediction Scores

In [29]:
recomendation_table = pd.DataFrame()
recomendation_table["wighted average score"] = temp_df["sum_weighted_rating"] / temp_df["sum_similarity_scores"]
recomendation_table["movieId"] = temp_df.index
recomendation_table.head()

Unnamed: 0_level_0,wighted average score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.669321,1
2,2.592463,2
3,2.69157,3
4,3.081759,4
5,2.361305,5


In [30]:
recomendation_table = recomendation_table.sort_values(by = "wighted average score" , ascending = False)
recomendation_table.head()

Unnamed: 0_level_0,wighted average score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4111,inf,4111
103655,78.928759,103655
3048,26.626059,3048
75803,23.203845,75803
57464,21.656752,57464


## Displaying Top Recommended Movies

In [32]:
movies_df.loc[movies_df["movieId"].isin(recomendation_table.head(10)["movieId"].to_list())]

Unnamed: 0,movieId,title,year
1724,1797,Everest,1998
2962,3048,Under the Rainbow,1981
3427,3516,"Bell, Book and Candle",1958
4018,4111,Gardens of Stone,1987
12399,57464,He Was a Quiet Man,2007
12760,60046,"Children of Huang Shi, The",2008
14599,72919,Did You Hear About the Morgans?,2009
15014,75803,Our Family Wedding,2010
19844,97860,Killing Them Softly,2012
21405,103655,R.I.P.D.,2013
