In [42]:
# import libraries
import os

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
from copy import deepcopy

In [2]:
# read dataset
data_path = Path("data")
data_file_movies = os.path.join(data_path, 'movies.csv')

movies = pd.read_csv(data_file_movies)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# read dataset
data_path = Path("data")
data_file_ratings = os.path.join(data_path, 'ratings.csv')

ratings = pd.read_csv(data_file_ratings)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Number of users
print('The ratings dataset has', ratings['userId'].nunique(), 'unique users')

# Number of movies
print('The ratings dataset has', ratings['movieId'].nunique(), 'unique movies')

# Number of ratings
print('The ratings dataset has', ratings['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(ratings['rating'].unique()))

The ratings dataset has 610 unique users
The ratings dataset has 9724 unique movies
The ratings dataset has 10 unique ratings
The unique ratings are [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]


In [4]:
# Merge ratings and movies datasets
df = pd.merge(ratings, movies, on='movieId', how='inner')
# Take a look at the data
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
# Aggregate by movie
agg_ratings = df.groupby('title').agg(mean_rating = ('rating', 'mean'),
                                                number_of_ratings = ('rating', 'count')).reset_index()

agg_ratings

Unnamed: 0,title,mean_rating,number_of_ratings
0,'71 (2014),4.000000,1
1,'Hellboy': The Seeds of Creation (2004),4.000000,1
2,'Round Midnight (1986),3.500000,2
3,'Salem's Lot (2004),5.000000,1
4,'Til There Was You (1997),4.000000,2
...,...,...,...
9714,eXistenZ (1999),3.863636,22
9715,xXx (2002),2.770833,24
9716,xXx: State of the Union (2005),2.000000,5
9717,¡Three Amigos! (1986),3.134615,26


In [6]:
# Keep the movies with over 100 ratings
agg_ratings_GT100 = agg_ratings[agg_ratings['number_of_ratings']>100]
agg_ratings_GT100

Unnamed: 0,title,mean_rating,number_of_ratings
74,2001: A Space Odyssey (1968),3.894495,109
207,Ace Ventura: Pet Detective (1994),3.040373,161
298,Aladdin (1992),3.792350,183
327,Alien (1979),3.969178,146
333,Aliens (1986),3.964286,126
...,...,...,...
9119,"Usual Suspects, The (1995)",4.237745,204
9215,WALL·E (2008),4.057692,104
9298,Waterworld (1995),2.913043,115
9485,Willy Wonka & the Chocolate Factory (1971),3.873950,119


In [7]:
# Merge data
df_GT100 = pd.merge(df, agg_ratings_GT100[['title']], on='title', how='inner')
df_GT100.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19788 entries, 0 to 19787
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     19788 non-null  int64  
 1   movieId    19788 non-null  int64  
 2   rating     19788 non-null  float64
 3   timestamp  19788 non-null  int64  
 4   title      19788 non-null  object 
 5   genres     19788 non-null  object 
dtypes: float64(1), int64(3), object(2)
memory usage: 1.1+ MB


In [None]:
# Number of users
print('The ratings dataset has', df_GT100['userId'].nunique(), 'unique users')

# Number of movies
print('The ratings dataset has', df_GT100['movieId'].nunique(), 'unique movies')

# Number of ratings
print('The ratings dataset has', df_GT100['rating'].nunique(), 'unique ratings')

# List of unique ratings
print('The unique ratings are', sorted(df_GT100['rating'].unique()))

In [40]:
# Create user-item matrix
matrix_data = df_GT100.pivot_table(index='userId', columns='title', values='rating')
matrix_data.head()

title,2001: A Space Odyssey (1968),Ace Ventura: Pet Detective (1994),Aladdin (1992),Alien (1979),Aliens (1986),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),Apocalypse Now (1979),...,True Lies (1994),"Truman Show, The (1998)",Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Twister (1996),Up (2009),"Usual Suspects, The (1995)",WALL·E (2008),Waterworld (1995),Willy Wonka & the Chocolate Factory (1971),X-Men (2000)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,4.0,,,5.0,5.0,,4.0,...,,,,3.0,,5.0,,,5.0,5.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,4.0,,,,5.0,,,,...,,,2.0,,,,,,4.0,
5,,3.0,4.0,,,,,,,,...,2.0,,,,,4.0,,,,


In [41]:
def recommendataion_system_user_based(user_id, n):
    matrix_user = matrix_data.fillna(value=0)
    matrix_user = matrix_user.T.corr()

    user_similarity = matrix_user.iloc[:,user_id].drop(user_id).sort_values(ascending=False)[:10].index

    user_films_hasnt_watched = matrix_data.iloc[(user_id-1),:].dropna(axis=0).index

    matrix = matrix_data[matrix_data.index.isin(user_similarity)].drop(labels=user_films_hasnt_watched, axis=1).fillna(value=0)

    
    item_score = {}

    for i in matrix.columns:
        film_scores = matrix[i]

        scores = []
        average_scores = []
        for u in film_scores.index:
            if film_scores[u] != 0:
                scores1 = film_scores[u]
                scores.append(scores1)
        item_score[i] = np.average(scores)

    # Convert dictionary to pandas dataframe
    item_score = pd.DataFrame(item_score.items(), columns=['movie', 'movie_score'])

    # Sort the movies by score
    ranked_item_score = item_score.sort_values(by='movie_score', ascending=False)
    # Select top m movies
    m = 10
    return ranked_item_score.head(m)

recommendataion_system_user_based(1,10)

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,movie,movie_score
55,Ocean's Eleven (2001),5.0
46,"Lion King, The (1994)",5.0
7,Austin Powers: The Spy Who Shagged Me (1999),5.0
14,"Bourne Identity, The (2002)",5.0
16,Catch Me If You Can (2002),5.0
34,"Godfather: Part II, The (1974)",5.0
70,Titanic (1997),5.0
69,There's Something About Mary (1998),5.0
21,"Dark Knight, The (2008)",4.785714
33,"Godfather, The (1972)",4.75
