# MOVIE RECOMMENDATION PROJECT

# INTRODUCTION



# OBJECTIVE

# DATA PREPARATION

In [234]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [235]:
df = pd.read_csv('C:/Users/emeka/DATASETS/netflix_titles.csv')

In [236]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [237]:

df['listed_in'] = df['listed_in'].str.replace(' ', '')
#get series of lists by by applying split operation
df['listed_in'] = df['listed_in'].str.split(',')



# Getting distinct listed_in types for generating columns of genre type.
genre_columns = list(set([j for i in df['listed_in'].tolist() for j in i]))

# Iterating over every list to create and fill values into columns.
for j in genre_columns:
    df[j] = 0
for i in range(df.shape[0]):
    for j in genre_columns:
        if(j in df['listed_in'].iloc[i]):
            df.loc[i,j] = 1

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,...,AnimeFeatures,Action&Adventure,IndependentMovies,InternationalTVShows,TVSci-Fi&Fantasy,AnimeSeries,Documentaries,Dramas,ClassicMovies,Faith&Spirituality
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,...,0,0,0,0,0,0,1,0,0,0
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,...,0,0,0,1,0,0,0,0,0,0
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,...,0,0,0,1,0,0,0,0,0,0
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,...,0,0,0,0,0,0,0,0,0,0
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,...,0,0,0,1,0,0,0,0,0,0


the dataset was made up of 12 columns and 8807 rows. After extracting distinct movie genres and dropping the listed_in column, we now have 53 columns.

In [238]:
# dropping 'listed_in' columns as it has already been one hot encoded.
df.drop('listed_in',axis=1,inplace=True)

In [239]:
# # Instantiate the vectorizer object to the vectorizer variable and remove stop words
vectorizer = TfidfVectorizer(stop_words='english')

In [240]:
#Replace NaN with an empty string
df['description'] = df['description'].fillna('')

In [241]:
# Fit and transform the description column
vectorized_data = vectorizer.fit_transform(df['description'])

In [242]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(vectorized_data, vectorized_data)

In [243]:
indices = pd.Series(df.index, index=df['title'])

In [217]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with the chosen movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [244]:
get_recommendations('Our Godfather')

29                         Paranoia
630             Killing Them Softly
208     Once Upon a Time in Mumbaai
2412                       365 Days
3452                 Peaky Blinders
8272                   The Departed
841          Gatao - The Last Stray
5752                       Spotless
4182                           Soni
1263                      No Escape
Name: title, dtype: object

# Content Filtering based on other attributes

Other Movie attributesb such as title, cast, listedin, description and director will be used to calculate its cosine similarity with another movie.

Firstly the data has to be pre-processed using NLP to obtain only one column that contains all the attributes (in words) of each movie. After that, this information is converted into numbers by vectorization, where scores are assigned to each word. Subsequently cosine similarities can be calculated.

In [219]:
dataframe = pd.read_csv('C:/Users/emeka/DATASETS/netflix_titles.csv')

In [220]:
#fill all missing values with empty string
df_1 = dataframe.fillna('')


In [221]:
#create a function to convert all identified columns to lower case and replace empty spaces with empty strings
def clean_data(x):
        return str.lower(x.replace(" ", ""))

In [222]:
attributes = ['title', 'director', 'cast', 'listed_in', 'description']

In [223]:
df_1 = df_1[attributes]
df_1

Unnamed: 0,title,director,cast,listed_in,description
0,Dick Johnson Is Dead,Kirsten Johnson,,Documentaries,"As her father nears the end of his life, filmm..."
1,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...","International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...","Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,Jailbirds New Orleans,,,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...","International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...
8802,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...","Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,Zombie Dumb,,,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...","Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...","Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [224]:
for attribute in attributes:
    df_1[attribute] = df_1[attribute].apply(clean_data)


In [225]:
def create_soup(x):
    return x['director'] + ' ' + x['cast'] + ' ' +x['listed_in']+' '+ x['description']

In [226]:
df_1['soup'] = df_1.apply(create_soup, axis=1)

In [227]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_1['soup'])

In [228]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [229]:
df_1=df_1.reset_index()
indices = pd.Series(df_1.index, index=df_1['title'])

In [230]:
def get_recommendations_new(title, cosine_sim=cosine_sim):
    title=title.replace(' ','').lower()
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df_1['title'].iloc[movie_indices]

In [231]:
get_recommendations_new('PK', cosine_sim2)

1114                       3idiots
8391      thelegendofmichaelmishra
6907                        haapus
4790               anthonykaunhai?
1022                taarezameenpar
4507                         sanju
6439                 chaldharpakad
195     emi:liyahaitochukanapadega
2720                       dostana
4427                 chancepedance
Name: title, dtype: object