### <font color='orange'>Impoting libraries and datasets</font>

In [2]:
from math import sqrt
import pandas as pd

movies_df = pd.read_csv('movie.csv')
ratings_df = pd.read_csv('rating.csv')

In [3]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


### <font color='orange'>Data Pre-processing</font>

##### By looking at movies_df we find out that title and year are in one column. So we need to separate them ! In addition, genres need to be separated into a list.

In [5]:
# Using regular expressions to find a year stored between parentheses

# We specify the parantheses so we don't conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))', expand=False)

# Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)', expand=False)

# Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '', regex=True)

# Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df.title.apply(lambda x: x.strip())

# Deleting | from genras columns
movies_df.genres = movies_df.genres.str.split('|')

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


##### In order to implement recommendation systems, we need to use one-hot encoding. so we add m columns (m = number of unique genras) to the dataset. For each movie we assign 0 if it doesnt have the genra and we assign 1 if it does have the genra.

In [6]:
def one_hot_encoding(df: pd.DataFrame):
    movies_with_genres_df = df.copy()

    for index, row in df.iterrows():
        for genre in row.genres:
            movies_with_genres_df.at[index, genre] = 1

    # Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
    movies_with_genres_df.fillna(0, inplace=True)

    return movies_with_genres_df

movies_with_genres_df = one_hot_encoding(movies_df)
movies_with_genres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


##### By looking at the rating_csv we can see timestamp column which is not neccessary to hold. So we drop it.  

In [7]:
# Droping timestamp column
ratings_df.drop('timestamp', axis='columns', inplace=True)

##### Taking user id to give recommendation to

In [8]:
user_id_recom = int(input('User id to give recommendation to = '))

query_user = ratings_df[ratings_df['userId'] == user_id_recom]

query_names = movies_df[movies_df.movieId.isin(query_user.movieId.tolist())]

query_movies = pd.merge(query_user, query_names)

query_movies.drop(['genres', 'year', 'userId'], axis='columns', inplace=True)
query_movies.head()

User id to give recommendation to = 1


Unnamed: 0,movieId,rating,title
0,2,3.5,Jumanji
1,29,3.5,"City of Lost Children, The (Cité des enfants p..."
2,32,3.5,Twelve Monkeys (a.k.a. 12 Monkeys)
3,47,3.5,Seven (a.k.a. Se7en)
4,50,3.5,"Usual Suspects, The"


### <font color='orange'>Content-based recommendation system</font>

In [9]:
# Filtering out the movies from movies_with_genres_df
user_movies = movies_with_genres_df[movies_with_genres_df['movieId'].isin(
    query_movies['movieId'].tolist()
)]

# Resetting the index to avoid future issues
user_movies = user_movies.reset_index(drop=True)

# Dropping unnecessary issues due to memory and to avoid issues
user_genre_df = user_movies.drop(['movieId', 'title', 'genres', 'year'], axis='columns')

user_genre_df.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Turn each genre into weights by using dot product between a matrix and a vector
user_profile = user_genre_df.T.dot(query_movies['rating'])
user_profile.head(len(user_profile))

Adventure             276.5
Animation              36.5
Children               68.5
Comedy                153.0
Fantasy               261.5
Romance                43.5
Drama                 162.0
Action                246.0
Crime                  80.0
Thriller              158.0
Horror                168.5
Mystery                65.0
Sci-Fi                148.5
IMAX                    8.5
Documentary             0.0
War                    33.0
Musical                11.0
Western                13.5
Film-Noir               0.0
(no genres listed)      0.0
dtype: float64

##### Knowing user profile (weights for every genre of the user's preferences), we can recommend movies that satisfy the user's preferences.

In [11]:
# Now let's get the genres of every movie in our original dataframe
genre_df = movies_with_genres_df.set_index(movies_with_genres_df['movieId'])

# Droping the unnecessary information
genre_df.drop(['movieId', 'title', 'genres', 'year'], axis='columns', inplace=True)

# Getting recommendation by multiplying each row in genre_df with the user_profile and summing the row 
# to get wieght to recommend the movies
recommendation_df = ((genre_df * user_profile).sum(axis='columns')) / user_profile.sum()

# Sort it by descending order
recommendation_df = recommendation_df.sort_values(ascending=False)

# Getting only top 3 movies to recommend to user
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(3).keys())]

Unnamed: 0,movieId,title,genres,year
4922,5018,Motorama,"[Adventure, Comedy, Crime, Drama, Fantasy, Mys...",1991
11487,49593,She,"[Action, Adventure, Drama, Fantasy, Horror, Ro...",1965
16024,81132,Rubber,"[Action, Adventure, Comedy, Crime, Drama, Film...",2010


### <font color='orange'>Collaborative filtering recommendation system</font>

In [13]:
# With the movie ID's in our query, we can now get the subset of users that have watched and reviewed 
# the movies in our query.
user_subset = ratings_df[ratings_df['movieId'].isin(query_movies['movieId'].tolist())]

# We now group up the rows by user ID.
user_subset_group = user_subset.groupby(['userId'])

# Sorting users with movie most in common with the input will have priority
user_subset_group = sorted(user_subset_group, key=lambda x: len(x[1]), reverse=True)

# This limit is imposed because we don't want to waste too much time going through every single user
user_subset_group = user_subset_group[0:100]

##### Calculate the Pearson Correlation between input user and subset group, and store it in a dictionary

In [14]:
def calculate_persona_corr(user_subset_group, input_movies):
    # Store the Pearson Correlation in a dictionary, where the key is the user Id and the
    # value is the coefficient
    pearson_corr_dict = {}

    for name, group in user_subset_group:
        
        # Let's start by sorting the input and current user group so the values aren't mixed up later on
        group = group.sort_values(by = 'movieId')
        input_movies = input_movies.sort_values(by = 'movieId')

        # Get the N for the formula
        n_ratings = len(group)

        # Get the review scores for the movies that they both have in common
        temp_df = input_movies[input_movies['movieId'].isin(group['movieId'].tolist())]

        # And then store them in a temporary buffer variable in a list format to facilitate future calculations
        temp_rating_list = temp_df['rating'].tolist()

        # Let's also put the current user group reviews in a list format
        temp_group_list = group['rating'].tolist()

        # Now let's calculate the pearson correlation between two users, so called, x and y
        Sxx = sum([i**2 for i in temp_rating_list]) - pow(sum(temp_rating_list), 2) / float(n_ratings)
        Syy = sum([i**2 for i in temp_group_list]) - pow(sum(temp_group_list), 2) / float(n_ratings)
        Sxy = sum(i * j for i, j in zip(temp_rating_list, temp_group_list)) - sum(temp_rating_list) * sum(temp_group_list) / float(n_ratings)

        # If the denominator is different than zero, then divide, else, 0 correlation.
        if Sxx != 0 and Syy != 0:
            pearson_corr_dict[name] = Sxy / sqrt(Sxx * Syy)
        else:
            pearson_corr_dict[name] = 0

    return pearson_corr_dict

In [20]:
pearson_corr_dict = calculate_persona_corr(user_subset_group, query_movies)

pearson_df = pd.DataFrame.from_dict(pearson_corr_dict, orient='index')
pearson_df.columns = ['similarityIndex']
pearson_df['userId'] = pearson_df.index
pearson_df.index = range(len(pearson_df))

# Now we save top 3 users that are most similar to the input.
top_users = pearson_df.sort_values(by = 'similarityIndex', ascending = False)[0:50]

# Calculating similarityIndex column using preason corrolation 
top_users_rating = top_users.merge(ratings_df, left_on = 'userId', right_on = 'userId', how = 'inner')

# Multiplies the similarity by the user's ratings
top_users_rating['weightedRating'] = top_users_rating['similarityIndex'] * top_users_rating['rating']

# Applies a sum to the top_users after grouping it up by userId
temp_top_users_rating = top_users_rating.groupby('movieId').sum()[['similarityIndex', 'weightedRating']]
temp_top_users_rating.columns = ['sum_similarityIndex', 'sum_weightedRating']

# Creates an empty dataframe
recommendation_df = pd.DataFrame()

# Now we take the weighted average
recommendation_df['weighted average recommendation score'] = temp_top_users_rating['sum_weightedRating'] / temp_top_users_rating['sum_similarityIndex']
recommendation_df['movieId'] = temp_top_users_rating.index

# Now print top 3 movies that the algorithm recommended.
recommendation_df = recommendation_df.sort_values(by = 'weighted average recommendation score', ascending = False)

movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(3)['movieId'].tolist())]

Unnamed: 0,movieId,title,genres,year
7129,7241,Kanal,"[Drama, War]",1957
22776,108979,Cowboy Bebop,"[Action, Adventure, Animation, Crime, Sci-Fi]",1998
24366,115467,Harmontown,[Documentary],2014
