# <p style="background-color: #37E1C2  ; font-family: Helvetica, fantasy; line-height: 1.3; font-size: 26px; letter-spacing: 3px; text-align: center; font-weight: bold; color: #ffffff">Quick TEXT-BASED Recommendation System: Movies Dataset</p>

![](https://www.freeiconspng.com/thumbs/movie-icon/movie-icon-27.png)

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the TMDb movies CSV; the file's first column holds the row index.
DATA_PATH = '../input/tmdb-top-10000-popular-movies-dataset/TMDb_updated.CSV'
df = pd.read_csv(DATA_PATH, index_col=0)

# Report (rows, columns), then preview the first five rows.
print(df.shape)
df.head()

(10000, 5)


Unnamed: 0,title,overview,original_language,vote_count,vote_average
0,Ad Astra,"The near future, a time when both hope and har...",en,2853,5.9
1,Bloodshot,"After he and his wife are murdered, marine Ray...",en,1349,7.2
2,Bad Boys for Life,Marcus and Mike are forced to confront new thr...,en,2530,7.1
3,Ant-Man,Armed with the astonishing ability to shrink i...,en,13611,7.1
4,Percy Jackson: Sea of Monsters,"In their quest to confront the ultimate evil, ...",en,3542,5.9


In [3]:
# Count the missing values in each column.
df.isna().sum()

title                 0
overview             30
original_language     0
vote_count            0
vote_average          0
dtype: int64

In [4]:
# Drop the rows whose 'overview' is missing (the only column with nulls above).
# Restricting to `subset` keeps the drop tied to the column the recommender
# actually uses, and rebinding instead of `inplace=True` keeps the cell
# idempotent and chain-friendly.
df = df.dropna(subset=['overview'])

In [5]:
# Build TF-IDF features from the 'overview' text of every movie.
tfidf = TfidfVectorizer(
    min_df=4,    # ignore terms that appear in fewer than 4 overviews
    max_df=0.7,  # ignore terms that appear in more than 70% of overviews
)

# Learn the vocabulary and produce the (movies x terms) TF-IDF matrix.
overviews = df['overview']
vectorized_data = tfidf.fit_transform(overviews)

In [6]:
# Turn vectorized_data into a DataFrame with the vocabulary terms as columns
# and the movie titles as the index.
#
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# NOTE: .toarray() densifies the sparse TF-IDF matrix (movies x terms).
tfidf_df = pd.DataFrame(vectorized_data.toarray(),
                        columns=feature_names)
tfidf_df.index = df['title']
tfidf_df

Unnamed: 0_level_0,000,007,10,100,11,11th,12,12th,13,13th,...,zebra,zero,zeus,zoe,zombie,zombies,zone,zones,zoo,zooey
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ad Astra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bloodshot,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bad Boys for Life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ant-Man,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Percy Jackson: Sea of Monsters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cargo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Good Night,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The World Is Yours,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Grand Seduction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Compute the pairwise cosine similarity between every pair of movies,
# using their TF-IDF rows as the feature vectors.
cosine_similarity_array = cosine_similarity(X=tfidf_df)

In [8]:
# Wrap the similarity matrix in a DataFrame labelled by movie title on both
# axes, so similarities can be looked up by name.
movie_titles = tfidf_df.index
cosine_sim_df = pd.DataFrame(
    data=cosine_similarity_array,
    index=movie_titles,
    columns=movie_titles,
)
cosine_sim_df

title,Ad Astra,Bloodshot,Bad Boys for Life,Ant-Man,Percy Jackson: Sea of Monsters,Birds of Prey (and the Fantabulous Emancipation of One Harley Quinn),Live Free or Die Hard,Cold Blood,Underwater,The Platform,...,Attack on Titan,Pokémon: The Rise of Darkrai,Eagle vs Shark,High Flying Bird,Zapped!,Cargo,The Good Night,The World Is Yours,The Grand Seduction,Woochi: The Demon Slayer
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ad Astra,1.000000,0.032822,0.006165,0.008447,0.006024,0.004912,0.032813,0.020560,0.050126,0.054077,...,0.020882,0.057016,0.011806,0.000000,0.008515,0.068984,0.020412,0.018521,0.014346,0.006578
Bloodshot,0.032822,1.000000,0.029857,0.047118,0.020810,0.019567,0.100522,0.120275,0.009876,0.010735,...,0.005413,0.011970,0.053973,0.093831,0.060153,0.015773,0.154289,0.051140,0.019583,0.033893
Bad Boys for Life,0.006165,0.029857,1.000000,0.048757,0.046121,0.023651,0.011685,0.000000,0.004135,0.007587,...,0.000000,0.005700,0.024760,0.000000,0.008817,0.006573,0.000000,0.036263,0.004249,0.004751
Ant-Man,0.008447,0.047118,0.048757,1.000000,0.019959,0.021598,0.015862,0.061623,0.038073,0.007773,...,0.006014,0.028940,0.013177,0.050987,0.012473,0.016969,0.038265,0.030695,0.089500,0.043655
Percy Jackson: Sea of Monsters,0.006024,0.020810,0.046121,0.019959,1.000000,0.015563,0.040686,0.019989,0.020162,0.027631,...,0.002048,0.031944,0.016348,0.018754,0.004547,0.008968,0.028091,0.018597,0.009282,0.006482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Cargo,0.068984,0.015773,0.006573,0.016969,0.008968,0.025985,0.055242,0.009983,0.046565,0.042709,...,0.024243,0.056612,0.034292,0.000000,0.018119,1.000000,0.023545,0.051968,0.006144,0.024750
The Good Night,0.020412,0.154289,0.000000,0.038265,0.028091,0.053849,0.096785,0.111569,0.026576,0.069315,...,0.013046,0.034897,0.078972,0.083816,0.067363,0.023545,1.000000,0.050389,0.022797,0.011627
The World Is Yours,0.018521,0.051140,0.036263,0.030695,0.018597,0.042879,0.035019,0.015502,0.002651,0.011379,...,0.009083,0.008549,0.009902,0.016353,0.024885,0.051968,0.050389,1.000000,0.000000,0.003046
The Grand Seduction,0.014346,0.019583,0.004249,0.089500,0.009282,0.000000,0.017725,0.025587,0.029325,0.024095,...,0.005938,0.056895,0.011419,0.022504,0.019293,0.006144,0.022797,0.000000,1.000000,0.004441


In [9]:
# Recommend the four movies whose overviews are most similar to the watched one.
watched_title = 'Bad Boys for Life'

# Remove the watched movie explicitly instead of slicing off the first result:
# if another movie ties at similarity 1.0 (e.g. an identical overview), the
# watched title is not guaranteed to sort first and could be recommended back.
recommended_titles = (
    cosine_sim_df.loc[watched_title]
    .drop(watched_title)
    .sort_values(ascending=False)
    .head(4)
    .index
)

print(
    f"Let's suppose the user watched '{watched_title}', the most similar movies "
    f"that can be recommended based on movie's overview are: {recommended_titles}"
)

Let's suppose the user watched 'Bad Boys for Life', the most similar movies that can be recommended based on movie's overview are: Index(['Bad Boys II', 'Scarface', 'Ride Along 2', 'Bad Boys'], dtype='object', name='title')


In [10]:
# Show the five highest similarity scores for 'Bad Boys for Life'
# (the movie itself is included, with a score of 1.0).
(
    cosine_sim_df
    .loc['Bad Boys for Life']
    .sort_values(ascending=False)
    .head()
)

title
Bad Boys for Life    1.000000
Bad Boys II          0.329347
Scarface             0.197642
Ride Along 2         0.183544
Bad Boys             0.175233
Name: Bad Boys for Life, dtype: float64