In [81]:
import pandas as pd
from math import sqrt
import numpy as np

In [82]:
movies_df = pd.read_csv('/Users/andrewsebastian/Downloads/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('/Users/andrewsebastian/Downloads/ml-latest-small/ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [83]:
userInput = [{'title':'Imperium (2016)', 'rating':2.5},
            {'title':'Bottle Rocket (1996)', 'rating':5.0},
            {'title':'Scream 3 (2000)', 'rating':5.0},
            {'title':'Aladdin (1992)', 'rating':4.0},
            {'title':'State and Main (2000)', 'rating':3.0}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                   title  rating
0        Imperium (2016)     2.5
1   Bottle Rocket (1996)     5.0
2        Scream 3 (2000)     5.0
3         Aladdin (1992)     4.0
4  State and Main (2000)     3.0


In [84]:
inputMovies = pd.DataFrame(userInput)

# Filter movies by title in userInput
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]

# Drop unnecessary columns
inputId = inputId.drop(['genres'], axis=1)

# Merge dataframes
inputMovies = pd.merge(inputId, inputMovies, on='title')

# Select relevant columns
inputMovies = inputMovies[['movieId', 'title', 'rating']]

print(inputMovies)

   movieId                  title  rating
0      101   Bottle Rocket (1996)     5.0
1      588         Aladdin (1992)     4.0
2     3273        Scream 3 (2000)     5.0
3     4029  State and Main (2000)     3.0
4   162828        Imperium (2016)     2.5


In [85]:
userSubset = ratings_df [ratings_df ['movieId'].isin(inputMovies ['movieId'].tolist ())]
print(userSubset.groupby('movieId'). count ())

         userId  rating  timestamp
movieId                           
101          23      23         23
588         183     183        183
3273         29      29         29
4029         12      12         12
162828        1       1          1


In [86]:
# Groupby creates several sub dataframes where they all have the same value in the column specified as the paramet
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print(len(x[1]))
    return len(x[1])

# Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)
userSubsetGroup = userSubsetGroup[0: 100]
print(userSubsetGroup[0:5])

[(414,        userId  movieId  rating  timestamp
62341     414      101     4.0  961438199
62552     414      588     4.0  961438199
63378     414     3273     3.0  963236640
63585     414     4029     5.0  993482948), (474,        userId  movieId  rating   timestamp
73130     474      101     3.5  1081862489
73241     474      588     4.0   978576063
74035     474     3273     4.0   975173316
74201     474     4029     4.5  1100291592), (249,        userId  movieId  rating   timestamp
36384     249      101     3.0  1388146126
36419     249      588     4.0  1353800716
36593     249     3273     2.5  1346752311), (274,        userId  movieId  rating   timestamp
39258     274      101     3.5  1250200353
39338     274      588     4.0  1171934614
39761     274     3273     2.5  1171787271), (307,        userId  movieId  rating   timestamp
46755     307      101     4.5  1186087030
46841     307      588     4.0  1186084593
47175     307     3273     0.5  1186083439)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [87]:
# Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

# For every user group in our subset
for name, group in userSubsetGroup:
    # Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    
    # Get the N for the formula
    nRatings = len(group)
    
    # Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    
    # And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    
    # Put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    
    # Calculate the pearson correlation between two users, so-called, x and y manually
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList), 2) / float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList), 2) / float(nRatings)
    Sxy = sum(i * j for i, j in zip(tempRatingList, tempGroupList)) - (sum(tempRatingList) * sum(tempGroupList)) / float(nRatings)
    
    # If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy / sqrt(Sxx * Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [88]:
pearsonDF = pd. DataFrame. from_dict (pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF ['userId'] = pearsonDF. index
pearsonDF.index = range(len (pearsonDF))
print (pearsonDF.head ( ))

   similarityIndex  userId
0        -0.852803     414
1        -0.852803     474
2        -0.944911     249
3        -0.755929     274
4        -0.397360     307


In [89]:
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).head(50)
print(topUsers.head())

    similarityIndex  userId
25              1.0     600
24              1.0     597
9               1.0       4
21              1.0     480
11              1.0      20


In [90]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating. head (100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     600        1     2.5  1237764347
1               1.0     600        2     4.0  1237764627
2               1.0     600        4     1.5  1237760055
3               1.0     600        5     2.5  1237759452
4               1.0     600        7     3.5  1237851387
..              ...     ...      ...     ...         ...
95              1.0     600      539     2.0  1237742333
96              1.0     600      543     2.5  1237851366
97              1.0     600      551     4.0  1237763405
98              1.0     600      552     3.5  1237713516
99              1.0     600      558     2.0  1237760744

[100 rows x 5 columns]


In [91]:
#Multiplies the similarity by the user's ratings
topUsersRating ['weightedRating'] = topUsersRating ['similarityIndex']*topUsersRating ['rating']
print(topUsersRating.head ())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     600        1     2.5  1237764347             2.5
1              1.0     600        2     4.0  1237764627             4.0
2              1.0     600        4     1.5  1237760055             1.5
3              1.0     600        5     2.5  1237759452             2.5
4              1.0     600        7     3.5  1237851387             3.5


In [92]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex', 'weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex', 'sum _weightedRating']
print (tempTopUsersRating.head ())

         sum_similarityIndex  sum _weightedRating
movieId                                          
1                        4.0                 14.5
2                        5.0                 16.5
3                        2.0                  5.5
4                        1.0                  1.5
5                        2.0                  5.5


In [93]:
# Remove extra spaces in column names
tempTopUsersRating.columns = tempTopUsersRating.columns.str.replace(' ', '')

# Creates an empty dataframe
recommendation_df = pd.DataFrame()

# Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating'] / tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index

print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.625000        1
2                                     3.300000        2
3                                     2.750000        3
4                                     1.500000        4
5                                     2.750000        5
6                                     3.500000        6
7                                     2.250000        7
8                                     1.000000        8
10                                    3.666667       10
11                                    3.000000       11


In [94]:
# Drop rows with NaN values in the 'weighted average recommendation score' column
recommendation_df = recommendation_df.dropna(subset=['weighted average recommendation score'])

# Sort by 'weighted average recommendation score' in descending order
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)

# Print the DataFrame
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
1221                                       5.0     1221
132333                                     5.0   132333
322                                        5.0      322
3677                                       5.0     3677
933                                        5.0      933
...                                        ...      ...
4775                                       0.5     4775
7318                                       0.5     7318
125221                                     0.5   125221
125916                                     0.5   125916
3990                                       0.5     3990

[2951 rows x 2 columns]


In [97]:
# Extract the year from the 'title' column and create a new column 'year'
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)')

# Remove the year from the 'title' column
movies_df['title_without_year'] = movies_df['title'].str.replace(r'\(\d{4}\)', '').str.strip()

# Now, you can use the modified DataFrame for further processing
recommended_movie = movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]
recommended_movie = recommended_movie.loc[~recommended_movie['movieId'].isin(userSubset['movieId'])]

# Print the resulting DataFrame
print(recommended_movie[['movieId', 'title_without_year', 'year']])

      movieId           title_without_year  year
0           1                    Toy Story  1995
1           2                      Jumanji  1995
2           3             Grumpier Old Men  1995
3           4            Waiting to Exhale  1995
4           5  Father of the Bride Part II  1995
...       ...                          ...   ...
9411   165347  Jack Reacher: Never Go Back  2016
9467   168350                  100 Streets  2016
9471   168456               Mercury Plains  2016
9479   169180               American Fable  2017
9564   173751                   Tiger Raid  2016

[2947 rows x 3 columns]


  movies_df['title_without_year'] = movies_df['title'].str.replace(r'\(\d{4}\)', '').str.strip()
