In [1]:
# import stuff
import pandas as pd

# open movies and ratings csv
df = pd.read_csv("movies.csv")
df2 = pd.read_csv("ratings.csv")

In [2]:
# movies
df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [3]:
# ratings 
df2

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [5]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
# grouping by userid to see how many users there are
df2.groupby("userId").count()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,232,232,232
2,29,29,29
3,39,39,39
4,216,216,216
5,44,44,44
...,...,...,...
606,1115,1115,1115
607,187,187,187
608,831,831,831
609,37,37,37


In [7]:
# There are 610 unique users. I'll pick one random one to be the user input
import random

chosen_user = random.sample(range(1, 611), 1)

print(chosen_user)

[128]


In [8]:
# getting the user input based on the randomly selected user
user_input = df2[df2["userId"] == chosen_user[0]][["movieId", "rating"]]

user_input

Unnamed: 0,movieId,rating
19649,110,5.0
19650,260,5.0
19651,593,5.0
19652,608,5.0
19653,898,5.0
19654,904,5.0
19655,905,5.0
19656,908,5.0
19657,911,4.0
19658,912,5.0


In [9]:
# taking all rows from the movies dataframe where the movie id is one of the movie ids that the chosen user rated
input_title = df[df["movieId"].isin(user_input["movieId"].tolist())]

# merging the user input and input title
input_df = pd.merge(user_input, input_title)

input_df

Unnamed: 0,movieId,rating,title,genres
0,110,5.0,Braveheart (1995),Action|Drama|War
1,260,5.0,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
2,593,5.0,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
3,608,5.0,Fargo (1996),Comedy|Crime|Drama|Thriller
4,898,5.0,"Philadelphia Story, The (1940)",Comedy|Drama|Romance
5,904,5.0,Rear Window (1954),Mystery|Thriller
6,905,5.0,It Happened One Night (1934),Comedy|Romance
7,908,5.0,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller
8,911,4.0,Charade (1963),Comedy|Crime|Mystery|Romance|Thriller
9,912,5.0,Casablanca (1942),Drama|Romance


In [10]:
# creates the user subset by taking all rows from the ratings dataframe where the movie id is one of the movies that the chosen user rated
user_subset = df2[df2["movieId"].isin(input_df["movieId"].tolist())]

user_subset

Unnamed: 0,userId,movieId,rating,timestamp
7,1,110,4.0,964982176
15,1,260,5.0,964981680
34,1,593,4.0,964983793
36,1,608,5.0,964982931
45,1,923,5.0,964981529
...,...,...,...,...
99607,610,1196,5.0,1479544565
99609,610,1198,5.0,1479545833
99623,610,1228,5.0,1479542176
99640,610,1291,4.5,1493850234


In [11]:
# shows the number of movies being considered, and the number of ratings for the movies
user_subset.groupby("movieId").count()

Unnamed: 0_level_0,userId,rating,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
110,237,237,237
260,251,251,251
593,279,279,279
608,181,181,181
898,29,29,29
904,84,84,84
905,14,14,14
908,57,57,57
911,13,13,13
912,100,100,100


In [12]:
# create user subset group by grouping the user subset by their user id
user_subset_group = user_subset.groupby(['userId'])

# function to return the number of movies that x has in common with the user input (that both have rated)
def find_same_movies(x):
    return len(x[1])

# sort from most shared movies to least
user_subset_group = sorted(user_subset_group, key = find_same_movies, reverse = True)

# take the 100 users with the most shared movies to the input
user_subset_group = user_subset_group[0:100]

user_subset_group

[((128,),
         userId  movieId  rating  timestamp
  19649     128      110     5.0  899032847
  19650     128      260     5.0  899033269
  19651     128      593     5.0  899032763
  19652     128      608     5.0  899033437
  19653     128      898     5.0  899032763
  19654     128      904     5.0  899033229
  19655     128      905     5.0  899033229
  19656     128      908     5.0  899032847
  19657     128      911     4.0  899033506
  19658     128      912     5.0  899032763
  19659     128      923     4.0  899033269
  19660     128      926     5.0  899033362
  19661     128      935     5.0  899032806
  19662     128      940     5.0  899032847
  19663     128      942     4.0  899033506
  19664     128      943     4.0  899033565
  19665     128      945     5.0  899033565
  19666     128      965     4.0  899033362
  19667     128     1036     3.0  899033127
  19668     128     1066     5.0  899032763
  19669     128     1080     4.0  899033506
  19670     128     11

In [13]:
from math import sqrt

# dictionary for user id and pearson correlation coefficient
pearson_correlation_dict = {}

for name, group in user_subset_group:
    # sync movie id by sorting
    group = group.sort_values(by = "movieId")
    user_input = user_input.sort_values(by = "movieId")

    # n for mean calculation
    n = len(group)

    # creates a temporary dataframe of the input user's ratings on all the movies that are rated both by the user and the current selected user from the loop
    tmp_df = user_input[user_input["movieId"].isin(group["movieId"])]

    # create two temporary lists, one for the ratings of the input user and one for the ratings of the currently selected user
    tmp_rating_list = tmp_df['rating'].tolist()
    tmp_group_list = group['rating'].tolist()

    # the pearson correlation coefficient can be calculated using Sxy/sqrt(Sxx * Syy)
    # where Sxx is the sum of squares of all differences between each rating from the input user and the mean of all of that user's ratings of those movies
    # Syy is the sum of squares of all differences between each rating from the currently selected and the mean of all of that user's ratings of those movies
    # Sxy is the sum each difference from Sxx multiplied by each difference from Syy (Unlike in Sxx and Syy, these differences aren't squared)
    Sxx = 0
    rate_mean = sum(tmp_rating_list) / float(n)
    group_mean = sum(tmp_group_list) / float(n)
    for i in tmp_rating_list:
        Sxx += pow((i - rate_mean), 2)

    Syy = 0
    for i in tmp_group_list:
        Syy += pow((i - group_mean), 2)

    Sxy = 0
    for i, j in zip(tmp_rating_list, tmp_group_list):
        Sxy += ((i - rate_mean) * (j - group_mean))

    # If either Sxx or Syy is 0, then the pearson correlation coefficient. If this isn't done, the equation will be something divided by 0
    if Sxx != 0 and Syy != 0:
        correlation_coefficient = Sxy/sqrt(Sxx * Syy)
    else: 
        correlation_coefficient = 0

    # put the correlation coefficient of the current selected user into the dictionary
    pearson_correlation_dict[name] = correlation_coefficient

print(pearson_correlation_dict)

{(128,): 1.0, (474,): -0.02103273853916943, (590,): 0.2665270698149394, (603,): 0.019480930197663154, (57,): 0.32717050683174453, (387,): -0.024509803921568627, (414,): 0.21119430154775576, (469,): -0.10727549415913698, (599,): -0.04886208116844532, (156,): 0.19200108048362632, (177,): 0.2667460400610533, (91,): -0.037871060667797864, (221,): 0.012301463084634387, (274,): -0.22411369205400508, (288,): 0.0024404475527393006, (606,): -0.05681909897271365, (182,): 0.27241083226602286, (307,): 0.3002005795646487, (380,): 0.05754409750538379, (477,): -0.05252692037926169, (18,): 0.0869191869206394, (68,): 0.23359385790031276, (84,): 0.5689252138295514, (290,): 0.25677629550654774, (462,): 0.2751697068483099, (580,): -0.3571246369629027, (249,): 0.12557028534887762, (391,): 0.03573708449459316, (448,): 0.16235226246494966, (489,): 0.3310886388882815, (600,): 0.03464794643376185, (199,): -0.2606800811700246, (202,): 0.33932078148933503, (480,): 0.5273232467473198, (597,): 0.06108472217815256,

In [14]:
# create a dataframe based on the data from the dictionary, then name its columns, make the user id column, and set the index
pearson_df = pd.DataFrame.from_dict(pearson_correlation_dict, orient='index')
pearson_df.columns = ['similarityIndex']
pearson_df['userId'] = pearson_df.index
pearson_df.index = range(len(pearson_df))

pearson_df

Unnamed: 0,similarityIndex,userId
0,1.000000,"(128,)"
1,-0.021033,"(474,)"
2,0.266527,"(590,)"
3,0.019481,"(603,)"
4,0.327171,"(57,)"
...,...,...
95,0.000000,"(220,)"
96,0.443203,"(265,)"
97,0.449467,"(367,)"
98,0.223152,"(527,)"


In [15]:
# I'm taking 1 to 51 because 0 will have a similarity index of 1, which is the user that was randomly selected as the user input
top_users_df = pearson_df.sort_values(by = 'similarityIndex', ascending = False)[1:51]
top_users_df

Unnamed: 0,similarityIndex,userId
86,0.625308,"(517,)"
75,0.597614,"(560,)"
67,0.579178,"(160,)"
22,0.568925,"(84,)"
33,0.527323,"(480,)"
49,0.491774,"(4,)"
72,0.45257,"(385,)"
97,0.449467,"(367,)"
96,0.443203,"(265,)"
46,0.443014,"(483,)"


In [16]:
# the user id in the top users dataframe is a tuple and not an integer, so I'm using a lambda function to take the integer out
top_users_df['userId'] = top_users_df['userId'].apply(lambda x: x[0])

# merge the original ratings dataframe and the top users dataframe on their user id
top_users_rating_df = top_users_df.merge(df2, left_on = 'userId', right_on = 'userId', how = 'inner')

top_users_rating_df

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp
0,0.625308,517,1,4.0,1487954343
1,0.625308,517,2,3.0,1487954340
2,0.625308,517,10,0.5,1487957717
3,0.625308,517,17,0.5,1487953834
4,0.625308,517,34,5.0,1487954303
...,...,...,...,...,...
25110,0.158114,28,70336,1.5,1277002289
25111,0.158114,28,71106,2.5,1277001836
25112,0.158114,28,71135,3.5,1277001895
25113,0.158114,28,72378,2.0,1277002257


In [17]:
# create a column named weightedRating and calculate the values for the column
# the value of this column is the similarity index multiplied by the rating 
top_users_rating_df["weightedRating"] = top_users_rating_df["similarityIndex"] * top_users_rating_df["rating"]

top_users_rating_df

Unnamed: 0,similarityIndex,userId,movieId,rating,timestamp,weightedRating
0,0.625308,517,1,4.0,1487954343,2.501234
1,0.625308,517,2,3.0,1487954340,1.875925
2,0.625308,517,10,0.5,1487957717,0.312654
3,0.625308,517,17,0.5,1487953834,0.312654
4,0.625308,517,34,5.0,1487954303,3.126542
...,...,...,...,...,...,...
25110,0.158114,28,70336,1.5,1277002289,0.237171
25111,0.158114,28,71106,2.5,1277001836,0.395285
25112,0.158114,28,71135,3.5,1277001895,0.553399
25113,0.158114,28,72378,2.0,1277002257,0.316228


In [18]:
# sum up the similarity index and weighted rating for each movie
tmp_top_users_rating_df = top_users_rating_df.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tmp_top_users_rating_df.columns = ['sum_similarityIndex','sum_weightedRating']

tmp_top_users_rating_df

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,12.577633,47.312704
2,7.566962,23.980496
3,3.304471,9.191994
4,0.568925,1.706776
5,2.642495,6.663575
...,...,...
188189,0.275170,1.238264
188301,0.368856,1.106567
188797,0.323788,1.295152
189713,0.275170,0.687924


In [19]:
# create a new dataframe for the reccomendation
recommendation_df = pd.DataFrame()

# create a column for the weighted average recommendation score, which is the sum of weighted ratings divided by the sum of similarity indexes for each movie
recommendation_df['weighted average recommendation score'] = tmp_top_users_rating_df['sum_weightedRating']/tmp_top_users_rating_df['sum_similarityIndex']
recommendation_df['movieId'] = tmp_top_users_rating_df.index

recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.761654,1
2,3.169105,2
3,2.781684,3
4,3.000000,4
5,2.521698,5
...,...,...
188189,4.500000,188189
188301,3.000000,188301
188797,4.000000,188797
189713,2.500000,189713


In [20]:
# sort in descending order to get the movies with the highest weighted average recommendation scores first
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending = False)

recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
304,5.0,304
4142,5.0,4142
27803,5.0,27803
32582,5.0,32582
161582,5.0,161582
...,...,...
53453,0.5,53453
5784,0.5,5784
137517,0.5,137517
50798,0.5,50798


In [21]:
# since the dataframe will be extremely long if all those columns are shown, recommend only the movies with a weighted average recommendation score of 5.0
recommendation_df = recommendation_df[recommendation_df["weighted average recommendation score"] == 5.0]
recommendation_df

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
27803,5.0,27803
32582,5.0,32582
161582,5.0,161582
162414,5.0,162414
3851,5.0,3851
...,...,...
1949,5.0,1949
102217,5.0,102217
1354,5.0,1354
3677,5.0,3677


In [22]:
# take rows from the original movies dataframe where the movie id is one of the recommended ones
recommended_movie_df = df.loc[df['movieId'].isin(recommendation_df['movieId'])]

# remove movies that the user input hsa already rated
recommended_movie_df = recommended_movie_df.loc[~recommended_movie_df.movieId.isin(user_subset['movieId'])]

# final result
recommended_movie_df

Unnamed: 0,movieId,title,genres
198,232,Eat Drink Man Woman (Yin shi nan nu) (1994),Comedy|Drama|Romance
259,299,Priest (1994),Drama
281,322,Swimming with Sharks (1995),Comedy|Drama
388,446,Farewell My Concubine (Ba wang bie ji) (1993),Drama|Romance
717,936,Ninotchka (1939),Comedy|Romance
...,...,...,...
9357,161582,Hell or High Water (2016),Crime|Drama
9367,162414,Moonlight,Drama
9443,167064,I Am Not Your Negro (2017),Documentary
9466,168326,The Big Sick (2017),Comedy|Romance


In [23]:
# note: since the user input is randomly selected, each run through all the blocks will produce different results, but should work just fine since the selected user is removed when the similiarity index is calculated