# 04 - Making a Movie Recommendation Engine using Collaborative Filtering

#### Amin Khoeini

***

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

In [2]:
# Load the processed review table. Later cells read its 'User_ID', 'movie' and 'rating' columns.
# NOTE(review): this is an absolute, machine-specific path; it must be adjusted before the notebook can run elsewhere.
review = pd.read_csv('/Users/Amin/Documents/GitHub/Review-Sentiment-Analysis-with-Recommendation-System/data/process_db.csv')

### Create a movie-movie based collaborative filter

In [7]:
# Keep one rating per (user, movie) pair: when a user reviewed the same movie
# several times, only the last row for that pair survives.
review_db = review.drop_duplicates(keep='last', subset=['User_ID', 'movie'])

# Reshape into a users x movies matrix whose cells hold the rating
# (NaN wherever the user did not rate the movie).
user_ratings_table = review_db.pivot(values='rating', index='User_ID', columns='movie')

# Centre each user's ratings on that user's own mean rating, then treat every
# missing rating as 0, i.e. "exactly average for this user".
avg_ratings = user_ratings_table.mean(axis='columns')
user_ratings_table_centered = user_ratings_table.sub(avg_ratings, axis='index')
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

# A movie-movie recommender compares movies, so movies must be the rows.
movie_ratings_centered = user_ratings_table_normed.transpose()

# Pairwise cosine similarity between every pair of movie rows.
similarities = cosine_similarity(movie_ratings_centered)

# Label both axes of the similarity matrix with the movie titles.
cosine_similarity_df = pd.DataFrame(
    data=similarities,
    columns=movie_ratings_centered.index,
    index=movie_ratings_centered.index,
)

In [10]:
# Preview the first rows of the movie-by-movie similarity matrix.
cosine_similarity_df.head()

movie,'71,10 Cloverfield Lane,10 Things I Hate About You,10 to Midnight,"10,000 BC",11:14,12 Angry Men,12 Rounds,12 Years a Slave,127 Hours,...,Zombie Holocaust,Zombieland,Zookeeper,Zoolander,Zoolander 2,Zootopia,Zulu,eXistenZ,xXx,xXx: State of the Union
movie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,1.0,0.027435,0.006637,0.002536,-0.034801,0.023863,0.027004,0.000796,0.040356,-0.009749,...,-0.000248,-0.003792,0.06393,-0.031174,-0.033842,-0.003267,0.039651,0.003959,-0.010022,-0.035523
10 Cloverfield Lane,0.027435,1.0,-0.01885,0.03879,-0.00462,-0.002397,0.015046,0.105461,0.068392,0.095517,...,-0.020941,0.038561,-0.0408,0.014036,-0.165862,0.224455,0.007429,-0.000169,-0.053879,0.028403
10 Things I Hate About You,0.006637,-0.01885,1.0,-0.004741,-0.019572,0.012431,0.001817,0.017871,-0.010542,-0.039761,...,-0.006155,-0.019597,0.001586,0.015207,0.008581,-0.026059,0.004373,-0.013133,0.030601,0.009236
10 to Midnight,0.002536,0.03879,-0.004741,1.0,0.045483,-0.006371,-0.042825,0.03833,-0.027867,-0.002168,...,0.033822,-0.039527,0.01193,-0.009751,-0.001438,-0.038987,-0.06972,-0.028081,0.007123,0.006028
"10,000 BC",-0.034801,-0.00462,-0.019572,0.045483,1.0,-0.053256,-0.052687,0.059664,-0.059677,-0.047247,...,-0.012842,-0.064621,0.052611,-0.04674,0.038984,-0.033714,-0.059481,-0.018592,0.057841,0.080234


In [8]:
def recommendation_movie(movie_title, n=30, similarity_df=None):
    """Return the titles most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Row label of the movie in the similarity matrix. It must match the
        stored label exactly, including any whitespace.
    n : int, default 30
        Maximum number of titles to return.
    similarity_df : pandas.DataFrame, optional
        Movie-by-movie similarity matrix labelled with titles on both axes.
        Defaults to the notebook-level ``cosine_similarity_df``.

    Returns
    -------
    list
        Up to ``n`` titles ordered from most to least similar. The queried
        movie itself is never included.

    Raises
    ------
    KeyError
        If ``movie_title`` is not a row label of the similarity matrix.
    """
    if similarity_df is None:
        similarity_df = cosine_similarity_df

    # Similarity of the target movie to every movie.
    cosine_similarity_series = similarity_df.loc[movie_title]

    # Exclude the query movie by label, not by position: when another movie
    # ties with it at similarity 1.0 the query is not guaranteed to sort first,
    # so slicing off element 0 could drop a genuine neighbour and keep the
    # query itself in the results.
    ordered_similarities = (
        cosine_similarity_series
        .drop(labels=movie_title, errors='ignore')
        .sort_values(ascending=False)
        .head(n)
    )

    return ordered_similarities.index.tolist()

In [11]:
# Example: titles most similar to one movie.
# NOTE(review): titles appear to be stored with a trailing space (see the output), so the lookup key includes it — confirm against the data.
recommendation_movie('10 to Midnight ')

['Attack of the Crab Monsters ',
 'Death Wish 4: The Crackdown ',
 'Shock Waves ',
 'Kingdom of the Spiders ',
 'Invasion U.S.A. ',
 'Airport 1975 ',
 'King of the Zombies ',
 'Attack of the Giant Leeches ',
 'The Screaming Skull ',
 'Beyond the Valley of the Dolls ',
 'Night of the Lepus ',
 'Blood Feast ',
 'Mike and Dave Need Wedding Dates ',
 'Killers from Space ',
 "The Brain That Wouldn't Die ",
 'The Driller Killer ',
 'Robot Monster ',
 'Attack of the Killer Tomatoes! ',
 'Creature from the Haunted Sea ',
 'A Bucket of Blood ',
 'The Beastmaster ',
 'Xanadu ',
 'The Giant Gila Monster ',
 'Billy Jack ',
 'Head ',
 'The Beast of Yucca Flats ',
 'The Black Hole ',
 'Happy Birthday to Me ',
 "Coogan's Bluff ",
 'The Green Berets ']

### Create a user-user based collaborative filter

In [4]:
def recommendation_user(user_id, n_users=50, n_movies=30):
    """Recommend movies for ``user_id`` from the ratings of similar users.

    Each movie is scored by the average of
    ``similarity(user, neighbour) * neighbour's mean-centred rating`` over the
    neighbours that actually rated it. Movies no neighbour rated get no score
    and are left out.

    Parameters
    ----------
    user_id
        Value of ``User_ID`` identifying the target user.
    n_users : int, default 50
        Number of most-similar users whose ratings are used.
    n_movies : int, default 30
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``n_movies`` movie titles, highest score first. Empty when there
        are no other users or none of the neighbours rated any movie.

    Raises
    ------
    KeyError
        If ``user_id`` does not occur in the review table.
    """
    # If a user reviewed a movie more than once, keep only the last review.
    review_db = review.drop_duplicates(subset=['User_ID', 'movie'], keep='last')

    # Users x movies matrix of ratings (NaN where the user did not rate the movie).
    user_ratings_table = review_db.pivot(index='User_ID', columns='movie', values='rating')

    # Centre every user's ratings on that user's mean rating.
    avg_ratings = user_ratings_table.mean(axis=1)
    user_ratings_table_centered = user_ratings_table.sub(avg_ratings, axis=0)

    # The zero-filled copy is used only for the similarity computation.
    user_ratings_table_normed = user_ratings_table_centered.fillna(0)

    # Only the target user's row of the user-user similarity matrix is needed,
    # so compare that single row against all users instead of all-vs-all.
    target_vector = user_ratings_table_normed.loc[[user_id]]
    similarities_user = cosine_similarity(target_vector, user_ratings_table_normed)[0]
    user_cosine_similarity_series = pd.Series(
        similarities_user, index=user_ratings_table_normed.index
    )

    # Remove the target user by label (a positional slice is wrong when another
    # user ties at similarity 1.0), then keep the n_users closest neighbours.
    similar_users = (
        user_cosine_similarity_series
        .drop(labels=user_id)
        .sort_values(ascending=False)
        .head(n_users)
    )
    if similar_users.empty:
        return []

    # Score from the centred table that still has NaN for "not rated"; using
    # the zero-filled table would count every unrated movie as a neutral rating.
    similar_movie_df = user_ratings_table_centered.loc[similar_users.index]

    # Weight each neighbour's ratings by that neighbour's similarity.
    weighted_ratings = similar_movie_df.mul(similar_users, axis=0)

    # mean() skips NaN, so this is sum(similarity * rating) / number of raters;
    # movies with no rater come out as NaN and are dropped.
    movie_scores = weighted_ratings.mean(axis=0).dropna()

    ranked_item_score = movie_scores.sort_values(ascending=False)

    return ranked_item_score.index[:n_movies].tolist()

### Combine the results of the two filters together:

In [25]:
def recommender(user_id, movie_title, fallback_size=6):
    """Combine the movie-movie and user-user filters into one recommendation list.

    Parameters
    ----------
    user_id
        Identifier passed to ``recommendation_user``.
    movie_title : str
        Title passed to ``recommendation_movie``.
    fallback_size : int, default 6
        Number of titles returned when the two filters share no movie.

    Returns
    -------
    list
        The movies recommended by both filters, in movie-similarity order.
        If there is no overlap, the first ``fallback_size`` titles of the
        movie-movie list followed by the user-user list.
    """
    # Evaluate each filter exactly once; the user-user filter rebuilds its
    # rating table on every call, so repeating it is expensive.
    movie_recs = recommendation_movie(movie_title)
    user_recs = list(recommendation_user(user_id))

    # Keep the titles both filters agree on. Iterating over the ranked movie
    # list (rather than intersecting two sets) preserves the similarity order
    # and makes the result deterministic between runs.
    user_rec_set = set(user_recs)
    final = [title for title in movie_recs if title in user_rec_set]
    if final:
        return final

    # No overlap: fall back to the movie-movie list, topped up with the
    # user-user list if the movie-movie list is shorter than fallback_size.
    return (movie_recs + user_recs)[:fallback_size]


In [24]:
# Example: combined recommendation for user 2343 based on the movie 'Scary Movie 3 '.
recommender(2343,'Scary Movie 3 ')

['Scary Movie 4 ',
 'Sleepaway Camp III: Teenage Wasteland ',
 'Gothika ',
 'Scary Movie 2 ',
 'Paranormal Activity 3 ',
 'The Texas Chainsaw Massacre ']