# Recommendation using Count Vectorization

In [2]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [29]:
# Importing the necessary libraries

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv('../Data/Metadata.csv')

In [5]:
df.shape

(19991, 14)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19991 entries, 0 to 19990
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 19991 non-null  int64  
 1   title              19991 non-null  object 
 2   genres             19991 non-null  object 
 3   original_language  19991 non-null  object 
 4   overview           19991 non-null  object 
 5   popularity         19991 non-null  float64
 6   release_date       19991 non-null  object 
 7   vote_average       19991 non-null  float64
 8   vote_count         19991 non-null  float64
 9   keywords           19991 non-null  object 
 10  year               19991 non-null  float64
 11  cast               19991 non-null  object 
 12  director           19991 non-null  object 
 13  score              19991 non-null  float64
dtypes: float64(5), int64(1), object(8)
memory usage: 2.1+ MB


In [7]:
df.sort_values('score', ascending=False, inplace=True)

In [8]:
df.columns

Index(['id', 'title', 'genres', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count', 'keywords', 'year',
       'cast', 'director', 'score'],
      dtype='object')

In [9]:
temp_df = df[['id','title','genres','director','cast','keywords']]

In [10]:
#converting the names and keyword instances into lowercase and strip all the spaces between them

temp_df.loc[:, 'cast'] = temp_df['cast'].str.lower()
temp_df.loc[:, 'genres'] = temp_df['genres'].str.lower().str.replace('-', '')
temp_df.loc[:, 'director'] = temp_df['director'].str.lower().str.replace(' ', '')

In [11]:
#Combine title, synopsis, and Genre
temp_df['soup'] = temp_df.apply(lambda row: f"Genres: {row['genres']}. Keywords: {row['keywords']}. Cast: {row['cast']}. Director: {row['director']}.", axis=1)
temp_df['soup'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['soup'] = temp_df.apply(lambda row: f"Genres: {row['genres']}. Keywords: {row['keywords']}. Cast: {row['cast']}. Director: {row['director']}.", axis=1)


"Genres: crime comedy. Keywords: hotel newyear'seve witch bet hotelroom sperm anthology losangelescalifornia hoodlum multiplestorylines womandirector. Cast: timroth jenniferbeals antoniobanderas. Director: allisonanders\nalexandrerockwell\nrobertrodriguez\nquentintarantino."

In [12]:
temp_df['soup'][temp_df['id'] == 862].values[0]

'Genres: animation adventure family comedy. Keywords: martialarts jealousy friendship bullying elementaryschool friends rivalry rescue mission buddy walkietalkie toycar boynextdoor newtoy neighborhood toycomestolife resourcefulness. Cast: tomhanks timallen donrickles. Director: johnlasseter.'

In [13]:
temp_df.drop(columns=['title', 'director', 'cast', 'genres', 'keywords'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df.drop(columns=['title', 'director', 'cast', 'genres', 'keywords'], inplace=True)


In [14]:
final_df = pd.merge(df,temp_df, left_on=['id'], right_on=['id'], how='left')

In [15]:
final_df.head()

Unnamed: 0,id,title,genres,original_language,overview,popularity,release_date,vote_average,vote_count,keywords,year,cast,director,score,soup
0,278,The Shawshank Redemption,Drama Crime,English,Framed in the 1940s for the double murder of h...,98.11,1994-09-23,8.702,23389.0,prison corruption policebrutality basedonnovel...,1994.0,TimRobbins MorganFreeman BobGunton,Frank Darabont,8.646118,Genres: drama crime. Keywords: prison corrupti...
1,238,The Godfather,Drama Crime,English,Spanning the years 1945 to 1955 a chronicle of...,170.99,1972-03-14,8.695,19845.0,basedonnovelorbook lossoflovedone loveatfirsts...,1972.0,MarlonBrando AlPacino JamesCaan,Francis Ford Coppola,8.629637,Genres: drama crime. Keywords: basedonnovelorb...
2,424,Schindler's List,Drama History War,English,The true story of how businessman Oskar Schind...,64.841,1993-12-15,8.572,14388.0,basedonnovelorbook factory concentrationcamp h...,1993.0,LiamNeeson BenKingsley RalphFiennes,Steven Spielberg,8.487763,Genres: drama history war. Keywords: basedonno...
3,240,The Godfather Part II,Drama Crime,English,In the continuing saga of the Corleone crime f...,62.665,1974-12-20,8.598,10630.0,italy italianamerican cuba symbolism gangster ...,1974.0,AlPacino RobertDuvall DianeKeaton,Francis Ford Coppola,8.484184,Genres: drama crime. Keywords: italy italianam...
4,155,The Dark Knight,Drama Action Crime Thriller,English,Batman raises the stakes in his war on crime. ...,126.226,2008-07-16,8.513,31145.0,joker sadism chaos secretidentity crimefighter...,2008.0,ChristianBale HeathLedger MichaelCaine,Christopher Nolan,8.474344,Genres: drama action crime thriller. Keywords:...


### Getting vectors from CountVectorizor

In [16]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(final_df['soup'])

In [17]:
count_matrix.shape

(19991, 51324)

In [18]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [19]:
indices = pd.Series(final_df.index, index=final_df['title'])

In [20]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim = cosine_sim):

    if title not in indices:
        raise ValueError(f"Title '{title}' not found in indices")
    
    # Get the index of the movie that matches the title
    idx = indices[title]  

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))   

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) 

    # Get the scores of the most similar movies
    sim_scores = sim_scores[1:10 + 1]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]  

    # Getting the weighted ratings and popularity of the movies
    movies = final_df.iloc[movie_indices][['title', 'year', 'score', 'original_language', 'popularity']]

    # Sort movies first by 'Score' in descending order, then by 'popularity' in descending order
    movies = movies.sort_values(by=['score', 'popularity'], ascending=[False, False])

    return movies

## Testing the setup

In [21]:
get_recommendations('Interstellar')

Unnamed: 0,title,year,score,original_language,popularity
270,The Martian,2015.0,7.660003,English,48.685
459,Moon,2009.0,7.457344,English,18.077
9524,Elysium,2013.0,6.489363,English,32.843
12620,Marooned,1969.0,6.419507,English,7.631
12901,Destination Moon,1950.0,6.413027,English,7.952
13221,JUNG_E,2023.0,6.404144,Korean,711.941
16085,Capsule,2015.0,6.310491,English,6.265
16948,Stowaway,2021.0,6.269068,English,15.621
17051,Love,2011.0,6.263009,English,9.394
18195,Lucy in the Sky,2019.0,6.172745,English,10.44


In [22]:
get_recommendations('Inception')

Unnamed: 0,title,year,score,original_language,popularity
6500,Terror of Mechagodzilla,1975.0,6.563182,Japanese,10.546
12326,Brainstorm,1983.0,6.426445,English,9.253
12349,Buck Rogers in the 25th Century,1979.0,6.425872,English,5.001
14646,Invasion: Planet Earth,2019.0,6.362298,English,6.137
14655,Star Wars: The Rise of Skywalker,2019.0,6.361936,English,62.836
16028,Moontrap: Target Earth,2017.0,6.313594,English,33.402
17321,The Call Up,2016.0,6.246159,English,8.993
18006,Transmorphers,2007.0,6.193359,English,4.498
18381,Journey 2: The Mysterious Island,2012.0,6.15055,English,76.422
18883,Max Steel,2016.0,6.062716,English,22.236


In [23]:
get_recommendations('The Dark Knight Rises')

Unnamed: 0,title,year,score,original_language,popularity
4,The Dark Knight,2008.0,8.474344,English,126.226
262,Batman Begins,2005.0,7.666825,English,93.893
1605,Batman: Assault on Arkham,2014.0,7.001524,English,14.829
5827,The Last Days of American Crime,2020.0,6.585388,English,14.945
6606,City on Fire,1987.0,6.560384,Cantonese,10.057
8073,Across 110th Street,1972.0,6.521387,English,7.078
8991,NH10,2015.0,6.500675,Hindi,4.279
11810,Defendor,2009.0,6.437744,English,9.129
13233,Kick-Ass 2,2013.0,6.403846,English,31.942
16183,The Courier,2019.0,6.306215,English,22.071


In [24]:
get_recommendations('Hulk')

Unnamed: 0,title,year,score,original_language,popularity
1519,Injustice,2021.0,7.024843,English,45.986
1749,X-Men,2000.0,6.963155,English,1.423
6953,Hulk vs. Thor,2009.0,6.550188,English,12.939
9475,Hulk: Where Monsters Dwell,2016.0,6.490451,English,8.846
11214,Krrish,2006.0,6.451521,Hindi,7.992
12328,Avengers Confidential: Black Widow & Punisher,2014.0,6.426286,Japanese,37.412
12329,Superman: Brainiac Attacks,2006.0,6.426267,English,9.532
17716,The Incredible Hulk,2008.0,6.218239,English,59.44
19545,Steel,1997.0,5.867048,English,12.966
19988,Batman & Robin,1997.0,4.544024,English,36.578


In [25]:
get_recommendations('Your Name')

Unnamed: 0,title,year,score,original_language,popularity
2248,The Anthem of the Heart,2015.0,6.863976,Japanese,25.317
2377,Children Who Chase Lost Voices,2011.0,6.843095,Japanese,24.701
4397,I've Always Liked You,2016.0,6.648975,Japanese,21.744
4429,The Moment You Fall in Love,2016.0,6.647458,Japanese,22.387
4861,Until Forever,2016.0,6.625323,English,22.492
6114,Orange: Future,2016.0,6.575597,Japanese,5.132
6891,Her Blue Sky,2019.0,6.551745,Japanese,20.975
7231,"Love, Chunibyo & Other Delusions! Rikka Version",2013.0,6.54229,Japanese,15.995
7479,The Lost Husband,2020.0,6.535928,English,8.044
10982,Midori,1992.0,6.456913,Japanese,19.336


In [26]:
get_recommendations('The Godfather')

Unnamed: 0,title,year,score,original_language,popularity
3,The Godfather Part II,1974.0,8.484184,English,62.665
425,The Irishman,2019.0,7.495517,English,27.739
703,The Godfather Part III,1990.0,7.306118,English,67.934
1910,The Godfather Trilogy: 1901-1980,1992.0,6.927905,English,11.684
3349,Shoot the Piano Player,1960.0,6.728088,French,9.386
4661,Piranhas,2019.0,6.635774,Italian,10.606
5289,Salvatore Giuliano,1962.0,6.605344,Italian,5.686
5808,Gangubai Kathiawadi,2022.0,6.585808,Hindi,10.512
6368,The Banishment,2008.0,6.567257,Russian,5.719
8812,The Gambler,1974.0,6.504228,English,8.158


In [27]:
get_recommendations('Suzume')

Unnamed: 0,title,year,score,original_language,popularity
2377,Children Who Chase Lost Voices,2011.0,6.843095,Japanese,24.701
2684,One Piece: Strong World,2009.0,6.804882,Japanese,31.282
3062,Naruto Shippuden the Movie: The Will of Fire,2009.0,6.758979,Japanese,47.101
3131,Drifting Home,2022.0,6.751755,Japanese,126.618
3950,Inuyasha the Movie 2: The Castle Beyond the Lo...,2002.0,6.677386,Japanese,26.874
4922,Inuyasha the Movie 4: Fire on the Mystic Island,2004.0,6.622518,Japanese,28.915
8352,Pokémon the Movie: White - Victini and Zekrom,2011.0,6.514447,Japanese,13.737
8482,Naruto the Movie: Legend of the Stone of Gelel,2005.0,6.511735,Japanese,34.361
8528,Pokémon the Movie: Black - Victini and Reshiram,2011.0,6.510666,Japanese,14.226
8696,Modest Heroes,2018.0,6.506988,Japanese,25.804


Now we have implemented the Count Vectorizer on the Metadata.csv, in the next we will dealing xgboost for recommandation