---
# Cairo University Faculty of Engineering
## Machine Learning
## Assignment 7

---
Please write your full name here
- **Name** : Ibrahim Mohamed

In [138]:
import numpy as np
import pandas as pd

### 1. Read the data

In [139]:
# import the data
movies = pd.read_csv('data/movies.csv')
ratings = pd.read_csv('data/ratings.csv')

### 2. Exploring the data

In [140]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [141]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### 3. Data Preprocessing

In [142]:
# drop the timestamp column
ratings.drop('timestamp', axis=1, inplace=True)

# drop the genres column
movies.drop('genres', axis=1, inplace=True)

In [143]:
print('Number of users = ', ratings.userId.nunique())
print('Number of movies = ', ratings.movieId.nunique())

Number of users =  610
Number of movies =  9724


In [144]:
# get the first 200 users
ratings = ratings[ratings.userId <= 200]

In [145]:
# merge the two dataframes
movies_ratings = pd.merge(ratings, movies, on='movieId')

In [146]:
movies_ratings.head()

Unnamed: 0,userId,movieId,rating,title
0,1,1,4.0,Toy Story (1995)
1,5,1,4.0,Toy Story (1995)
2,7,1,4.5,Toy Story (1995)
3,15,1,2.5,Toy Story (1995)
4,17,1,4.5,Toy Story (1995)


We need to take care as these ratings are not accurate because some movies have few ratings so we need to make sure that we are not considering movies with few ratings


In [147]:
# Get the number of reviews given to each movie
top_rated = movies_ratings.groupby('movieId').size().reset_index(name='rating_count')

# Filter movies with over 20 ratings in descending order
top_rated = top_rated[top_rated['rating_count'] > 20]
top_rated.sort_values('rating_count', ascending=False, inplace=True)

In [148]:
movies_ratings = movies_ratings[movies_ratings['movieId'].isin(top_rated['movieId'])]

In [149]:
print('Number of users = ', movies_ratings.userId.nunique())
print('Number of movies = ', movies_ratings.movieId.nunique())

Number of users =  198
Number of movies =  287


### Get the Similarity Score between the users & movies


In [150]:
# create a pivot table to get the ratings of each user for each movie
pivot_table = movies_ratings.pivot_table(index='userId', columns='movieId', values='rating')
pivot_table.head()

movieId,1,2,6,7,10,11,16,17,19,21,...,59315,60069,68157,68954,69122,72998,79132,91529,99114,109487
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,4.5,,,,4.0,3.5,3.5,3.0
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,3.0,...,,,,,,,,,,
5,4.0,,,,,,,,,4.0,...,,,,,,,,,,


In [151]:
# fill the missing values with 0
pivot_table = pivot_table.fillna(0)

In [152]:
from sklearn.metrics.pairwise import cosine_similarity

# get the cosine similarity between the users
user_cosine_similarity = cosine_similarity(pivot_table)

# get the cosine similarity between the movies
movies_cosine_similarity = cosine_similarity(pivot_table.T)

In [153]:
# convert the cosine similarity to a dataframe to see the similarity scores between the users
user_cosine_similarity_df = pd.DataFrame(user_cosine_similarity, index=pivot_table.index, columns=pivot_table.index)
movies_cosine_similarity_df = pd.DataFrame(movies_cosine_similarity, index=pivot_table.columns, columns=pivot_table.columns)

user_cosine_similarity_df.head(10)


userId,1,2,3,4,5,6,7,8,9,10,...,191,192,193,194,195,196,197,198,199,200
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.037354,0.085398,0.445768,0.248501,0.272933,0.283794,0.23154,0.163338,0.057914,...,0.183733,0.177892,0.26473,0.107127,0.417716,0.165295,0.237922,0.507726,0.464626,0.35442
2,0.037354,1.0,0.0,0.011914,0.032843,0.032521,0.023597,0.047312,0.0,0.217943,...,0.042264,0.0,0.041582,0.0,0.111246,0.213271,0.047852,0.085147,0.103454,0.155851
3,0.085398,0.0,1.0,0.0,0.166852,0.059479,0.0,0.144217,0.0,0.0,...,0.0,0.0,0.144857,0.0,0.083045,0.0,0.0,0.069213,0.08925,0.0
4,0.445768,0.011914,0.0,1.0,0.143811,0.166084,0.251298,0.124302,0.033719,0.103929,...,0.109169,0.0445,0.2341,0.105946,0.407147,0.082855,0.130666,0.413061,0.471924,0.318739
5,0.248501,0.032843,0.166852,0.143811,1.0,0.541198,0.146181,0.523611,0.0,0.075837,...,0.395522,0.253008,0.141393,0.0,0.199531,0.142444,0.206872,0.221728,0.246605,0.194297
6,0.272933,0.032521,0.059479,0.166084,0.541198,1.0,0.169422,0.63362,0.0,0.076525,...,0.418897,0.498027,0.109853,0.107003,0.209103,0.163423,0.143152,0.269644,0.273918,0.25348
7,0.283794,0.023597,0.0,0.251298,0.146181,0.169422,1.0,0.156227,0.203362,0.325378,...,0.081317,0.169393,0.282111,0.0,0.267463,0.112758,0.160529,0.34277,0.340121,0.487154
8,0.23154,0.047312,0.144217,0.124302,0.523611,0.63362,0.156227,1.0,0.0,0.052023,...,0.358589,0.488178,0.176528,0.0,0.188033,0.140103,0.193534,0.259524,0.247131,0.235028
9,0.163338,0.0,0.0,0.033719,0.0,0.0,0.203362,0.0,1.0,0.13397,...,0.059808,0.0,0.08406,0.0,0.071323,0.051249,0.0,0.207247,0.126786,0.200431
10,0.057914,0.217943,0.0,0.103929,0.075837,0.076525,0.325378,0.052023,0.13397,1.0,...,0.011618,0.056633,0.090138,0.0,0.134805,0.202429,0.093396,0.150426,0.103668,0.366869


In [154]:
movies_cosine_similarity_df.head(10)

movieId,1,2,6,7,10,11,16,17,19,21,...,59315,60069,68157,68954,69122,72998,79132,91529,99114,109487
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.364634,0.327928,0.270701,0.352899,0.245691,0.240085,0.309381,0.383177,0.314843,...,0.234873,0.199627,0.210979,0.246652,0.228514,0.228129,0.244432,0.250658,0.242432,0.209214
2,0.364634,1.0,0.278936,0.240488,0.34259,0.334386,0.216116,0.144009,0.344587,0.219876,...,0.234106,0.214452,0.224016,0.307064,0.286514,0.321419,0.254434,0.268707,0.336075,0.262833
6,0.327928,0.278936,1.0,0.275939,0.304328,0.358692,0.500971,0.277037,0.281046,0.438093,...,0.191202,0.163585,0.220964,0.152767,0.193568,0.226812,0.192702,0.231025,0.190655,0.170448
7,0.270701,0.240488,0.275939,1.0,0.263707,0.428225,0.140414,0.293515,0.253973,0.348902,...,0.04053,0.027472,0.03733,0.024964,0.070391,0.03623,0.032816,0.033671,0.026482,0.0
10,0.352899,0.34259,0.304328,0.263707,1.0,0.345764,0.269904,0.292537,0.352139,0.219544,...,0.141756,0.10643,0.078231,0.14432,0.143295,0.149755,0.105132,0.138895,0.100751,0.031529
11,0.245691,0.334386,0.358692,0.428225,0.345764,1.0,0.31114,0.324717,0.337001,0.529989,...,0.081979,0.050823,0.075507,0.052342,0.143986,0.071493,0.067455,0.069213,0.055717,0.0
16,0.240085,0.216116,0.500971,0.140414,0.269904,0.31114,1.0,0.302576,0.395251,0.255459,...,0.225522,0.203236,0.345011,0.217168,0.276929,0.265345,0.27485,0.273005,0.280906,0.191939
17,0.309381,0.144009,0.277037,0.293515,0.292537,0.324717,0.302576,1.0,0.237369,0.407341,...,0.047536,0.049013,0.079227,0.053447,0.05034,0.05261,0.058547,0.043096,0.031176,0.006769
19,0.383177,0.344587,0.281046,0.253973,0.352139,0.337001,0.395251,0.237369,1.0,0.341195,...,0.203139,0.171669,0.171437,0.238604,0.241137,0.240706,0.198263,0.276363,0.295057,0.239871
21,0.314843,0.219876,0.438093,0.348902,0.219544,0.529989,0.255459,0.407341,0.341195,1.0,...,0.036531,0.030569,0.022431,0.016667,0.028656,0.055634,0.016432,0.023417,0.020801,0.024275


### TOP SIMILAR MOVIES


In [155]:
def topSimilarMovies(movie_id, number_of_similar_movies):

    # check if the movie_id exists in the selected movies
    if movie_id not in movies_cosine_similarity_df.columns:
        return 'Movie not found in the database'
    else:
        # get the top similar movies
        top_movies_idx = movies_cosine_similarity_df[movie_id].sort_values(ascending=False).head(number_of_similar_movies + 1)

        # get the titles of the top similar movies
        top_movies = []
        for item in top_movies_idx.index:
            movie = movies_ratings[movies_ratings['movieId'] == item]['title'].values[0]
            top_movies.append(movie)

        # remove the first movie as it is the same movie
        top_movies.pop(0)
        return top_movies

In [156]:
# get the move titles of movie_id = 91529
my_movie_id = 91529
testing_movie = movies_ratings[movies_ratings['movieId'] == my_movie_id]['title'].tolist()[0]
num_of_movies = 10

# print the top similar moviesA
print(f'The  selected movie is {testing_movie} and the top {num_of_movies} similar movies are: ')
for item in topSimilarMovies(my_movie_id, num_of_movies):
    print(item)

The  selected movie is Dark Knight Rises, The (2012) and the top 10 similar movies are: 
Dark Knight, The (2008)
Up (2009)
Inception (2010)
Django Unchained (2012)
WALL·E (2008)
Inglourious Basterds (2009)
Interstellar (2014)
Iron Man (2008)
Batman Begins (2005)
Hangover, The (2009)


### RECOMMENDATION SYSTEM BASED ON USER

In [157]:
def recommendMovieToUser(userId, number_of_similar_movies):

    if userId not in user_cosine_similarity_df.index:
        return 'User not found in the database'

    # get the movies that the user has watched
    user_movies = movies_ratings[movies_ratings['userId'] == userId]['movieId'].tolist()

    # get the similarity scores between the user and all other users
    user_similarity = user_cosine_similarity_df[userId].sort_values(ascending=False)

    # get the top 10 similar users
    top_similar_users = user_similarity.head(10).index.tolist()

    # get the movies that the top 10 similar users have watched
    top_similar_users_movies = movies_ratings[movies_ratings['userId'].isin(top_similar_users)]['movieId'].tolist()

    # get the movies that the top 10 similar users have watched but the user hasn't
    recommended_movies = list(set(top_similar_users_movies) - set(user_movies))

    # get the titles of the recommended movies
    recommended_movies_titles = []
    for movie_id in recommended_movies:
        movie_title = movies_ratings[movies_ratings['movieId'] == movie_id]['title'].tolist()[0]
        recommended_movies_titles.append(movie_title)

    # return the top 5 recommended movies
    return recommended_movies_titles[:number_of_similar_movies]

In [158]:
# get the top 10 recommended movies for user_id = 200
user_id = 200
num_of_movies = 5
print(f'The top {num_of_movies} recommended movies for user_id = {user_id} are: ')
for item in recommendMovieToUser(user_id, num_of_movies):
    print(item)

The top 5 recommended movies for user_id = 200 are: 
Jumanji (1995)
Hangover, The (2009)
Heat (1995)
Sabrina (1995)
Robin Hood: Men in Tights (1993)
