<a href="https://colab.research.google.com/github/ALEE0001/pp_movie_recommendation/blob/main/netflix_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# 중요: Go to Kaggle, sign up, and create your api token. Then download the json token file.
# 중요: Create directory called .kaggle in your google drive and put the json file in there.

#**Setting Up Environment and Downloading Data**

In [42]:
# Setting up Data Path

from google.colab import drive

# Mount your Google Drive.
drive.mount("/content/drive")

kaggle_creds_path = "/content/drive/MyDrive/.kaggle/kaggle.json"

! pip install kaggle --quiet

! mkdir -p ~/.kaggle
! cp /content/drive/MyDrive/.kaggle/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! mkdir kaggle_data

# Unmount your Google Drive
drive.flush_and_unmount()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mkdir: cannot create directory ‘kaggle_data’: File exists


In [43]:
def kaggle_download(author, competition_name):

  ! kaggle datasets download -d {author + "/" + competition_name} -p kaggle_data
  ! unzip -q kaggle_data/{competition_name}.zip -d kaggle_data
  ! rm kaggle_data/{competition_name}.zip

In [44]:
kaggle_download("shivamb", "netflix-shows")
kaggle_download("ashirwadsangwan", "imdb-dataset")

Downloading netflix-shows.zip to kaggle_data
  0% 0.00/1.34M [00:00<?, ?B/s]
100% 1.34M/1.34M [00:00<00:00, 145MB/s]
replace kaggle_data/netflix_titles.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: Downloading imdb-dataset.zip to kaggle_data
 99% 1.10G/1.11G [00:11<00:00, 145MB/s]
100% 1.11G/1.11G [00:11<00:00, 107MB/s]
replace kaggle_data/name.basics.tsv/data.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

#**Read Files, Cleanse, and Join**

In [86]:
! pip install scikit-learn==1.1.2
! pip install scipy==1.9.1
! pip install missingpy



In [106]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from missingpy import MissForest

In [5]:
df_netflix = pd.read_csv("kaggle_data/netflix_titles.csv")

df_netflix = df_netflix.rename(columns={"rating" : "age_rating"})
df_netflix["title"] = df_netflix["title"].str.lower()
df_netflix["release_year"] = df_netflix["release_year"].astype(str)
df_netflix["director"] = df_netflix["director"].str.lower()
df_netflix = df_netflix.drop_duplicates(subset=["title", "release_year", "age_rating"])

In [6]:
df_netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,age_rating,duration,listed_in,description
0,s1,Movie,dick johnson is dead,kirsten johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,blood & water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,ganglands,julien leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,jailbirds new orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,kota factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [7]:
df_temp1 = pd.read_csv("kaggle_data/title.basics.tsv/data.tsv", sep='\t')
df_temp2 = pd.read_csv("kaggle_data/title.ratings.tsv/data.tsv", sep='\t')

df_imdb = df_temp1.merge(df_temp2, on="tconst", how="inner")
del df_temp1
del df_temp2

  df_temp1 = pd.read_csv("kaggle_data/title.basics.tsv/data.tsv", sep='\t')


In [8]:
# Reading the file in chunks for memory management:
file_path = "kaggle_data/title.principals.tsv/data.tsv"

# Set the chunk size
chunk_size = 10000

# Initialize an empty list to store the dataframes for each chunk
df_temp3 = []

# Iterate through the file in chunks and append them to the list
for chunk in pd.read_csv(file_path, delimiter='\t', chunksize=chunk_size):
    chunk = chunk[["tconst", "nconst", "category"]]
    chunk = chunk[chunk["category"] == "director"]
    df_temp3.append(chunk)

# Clear from memory
del chunk

# Concatenate the list of dataframes into the final dataframe
df_temp3 = pd.concat(df_temp3, ignore_index=True)

In [9]:
df_imdb = df_imdb.merge(df_temp3, on="tconst", how="left")
del df_temp3

df_temp4 = pd.read_csv("kaggle_data/name.basics.tsv/data.tsv", sep='\t')[["nconst", "primaryName"]]
df_imdb = df_imdb.merge(df_temp4, on="nconst", how="left")
del df_temp4

# Cleansing
df_imdb["primaryTitle"] = df_imdb["primaryTitle"].str.lower()
df_imdb["primaryName"] = df_imdb["primaryName"].str.lower()
df_imdb = df_imdb \
  .drop_duplicates(subset=["tconst", "primaryTitle", "startYear"]) \
  .rename(columns={"averageRating" : "averageMovieRating"}) \
  .rename(columns={"primaryName" : "director"}) \
  [["primaryTitle", "startYear", "director", "averageMovieRating", "numVotes"]]

In [10]:
df_imdb.head()

Unnamed: 0,primaryTitle,startYear,director,averageMovieRating,numVotes
0,carmencita,1894,william k.l. dickson,5.7,1990
1,le clown et ses chiens,1892,émile reynaud,5.8,265
2,pauvre pierrot,1892,émile reynaud,6.5,1856
3,un bon bock,1892,émile reynaud,5.5,178
4,blacksmith scene,1893,william k.l. dickson,6.2,2641


In [11]:
# Find duplicate records based on title, year, director.
grouped_imdb = df_imdb[["primaryTitle", "startYear", "director", "averageMovieRating"]] \
  .groupby(["primaryTitle", "startYear", "director"],dropna=False, as_index=False) \
  .count()

multiple_records = grouped_imdb[grouped_imdb["averageMovieRating"] > 1].drop(columns="averageMovieRating")

In [12]:
# Merge df1 with df2 and identify matching rows using "_merge" column
merged_df = df_imdb.merge(multiple_records, on=["primaryTitle", "startYear", "director"], how='left', indicator=True)

# Remove duplicate records from df_imdb
df_imdb_final = merged_df[merged_df['_merge'] == 'left_only'].drop(columns='_merge')

In [13]:
# Join everything together
df_final = df_netflix \
  .merge(df_imdb_final, left_on=["title", "release_year", "director"], right_on=["primaryTitle", "startYear", "director"], how="left") \
  .drop(["primaryTitle", "startYear"], axis=1)

In [14]:
df_final.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,age_rating,duration,listed_in,description,averageMovieRating,numVotes
0,s1,Movie,dick johnson is dead,kirsten johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",,
1,s2,TV Show,blood & water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",,
2,s3,TV Show,ganglands,julien leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,,
3,s4,TV Show,jailbirds new orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",6.6,277.0
4,s5,TV Show,kota factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,,


#**Cleansing**

In [145]:
# Cleansing
df_model = df_final.copy()

In [146]:
# Convert columns to lowercase wherever possible
character_columns = df_model.select_dtypes(include='object')
df_model[character_columns.columns] = character_columns.apply(lambda x: x.str.lower())

In [147]:
# Separate duration and season by data type (tv show, movie)
df_model["duration"] = df_model["duration"].str.replace(r"\D", "", regex=True)
df_model["seasons"] = df_model["duration"].where(df_model["type"] == "tv show")
df_model.loc[df_model["type"] == "tv show", "duration"] = np.nan

In [148]:
# Create year added & month added
df_model["date_added"] = pd.to_datetime(df_model["date_added"])
df_model["year_added"] = df_model["date_added"].dt.strftime("%Y")
df_model["month_added"] = pd.to_datetime(df_model["date_added"], format='%B').dt.month
df_model = df_model.drop("date_added", axis=1)

In [149]:
# Convert appropriate columns to numeric
numeric_cols = ["release_year", "duration", "averageMovieRating", "numVotes", "seasons", "month_added", "year_added"]
df_model[numeric_cols] = df_model[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [150]:
dict_age_rating = {"tv-y" : 1,      # All Age
                   "g" : 1,         # All Age
                   "tv-g": 1,       # All Age
                   "tv-y7": 2,      # 7 and up
                   "tv-y7-fv": 2,   # 7 and up
                   "pg": 3,         # Parental Guidance
                   "tv-pg": 3,      # Parental Guidance
                   "pg-13": 4,      # 13 and up
                   "tv-14": 5,      # 14 and up
                   "tv-ma": 6,      # 17 and up
                   "nc-17": 6,      # 17 and up
                   "r": 7}          # 17 and up, but separating it on my discretion

df_model["age_rating"] = df_model["age_rating"].map(dict_age_rating)

In [151]:
df_model.head()

Unnamed: 0,show_id,type,title,director,cast,country,release_year,age_rating,duration,listed_in,description,averageMovieRating,numVotes,seasons,year_added,month_added
0,s1,movie,dick johnson is dead,kirsten johnson,,united states,2020,4.0,90.0,documentaries,"as her father nears the end of his life, filmm...",,,,2021.0,9.0
1,s2,tv show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021,6.0,,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t...",,,2.0,2021.0,9.0
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021,6.0,,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...,,,1.0,2021.0,9.0
3,s4,tv show,jailbirds new orleans,,,,2021,6.0,,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo...",6.6,277.0,1.0,2021.0,9.0
4,s5,tv show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021,6.0,,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...,,,2.0,2021.0,9.0


#**Feature Engineering - Impute**

In [152]:
# Dummify genres in prep for RF imputer, and save it to separate dataframe

strings_to_remove = ["tv", "shows", "movies", "show", "series", "features", " "]

# Define a function to remove specified strings
def remove_strings(genre):
    for remove_str in strings_to_remove:
        genre = genre.replace(remove_str, "").strip()
    return genre

# Apply the function to the "listed_in" column
df_model["listed_in"] = df_model["listed_in"].apply(lambda x: remove_strings(x))
df_model["listed_in"] = df_model["listed_in"].str.strip()

# Get unique genres after removing the specified strings
unique_genres = set(df_model["listed_in"].str.split(",").sum())

df_genres = df_model["listed_in"].str.get_dummies(",")

# OneHotEncode other categorical variables
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(df_model[["type", "country"]])
df_others = pd.DataFrame(encoded_data.toarray(), columns=encoder.get_feature_names_out(["type", "country"]))

df_impute = pd.concat([df_model, df_others, df_genres], axis=1)
df_impute = df_impute.drop(["type", "country", "listed_in"], axis=1)

In [153]:
df_impute.head()

Unnamed: 0,show_id,title,director,cast,release_year,age_rating,duration,description,averageMovieRating,numVotes,...,reality,romantic,sci-fi&fantasy,science&nature,spanish-language,sports,stand-upcomedy,stand-upcomedy&talk,teen,thrillers
0,s1,dick johnson is dead,kirsten johnson,,2020,4.0,90.0,"as her father nears the end of his life, filmm...",,,...,0,0,0,0,0,0,0,0,0,0
1,s2,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",2021,6.0,,"after crossing paths at a party, a cape town t...",,,...,0,0,0,0,0,0,0,0,0,0
2,s3,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",2021,6.0,,to protect his family from a powerful drug lor...,,,...,0,0,0,0,0,0,0,0,0,0
3,s4,jailbirds new orleans,,,2021,6.0,,"feuds, flirtations and toilet talk go down amo...",6.6,277.0,...,1,0,0,0,0,0,0,0,0,0
4,s5,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",2021,6.0,,in a city of coaching centers known to train i...,,,...,0,1,0,0,0,0,0,0,0,0


In [156]:
l_char_cols_names = df_impute.select_dtypes(include='object').columns.tolist()
i_char_cols = [index for index, dtypes in enumerate(df_impute.dtypes) if dtypes == "object"]

In [158]:
print(l_char_cols_names, i_char_cols)

['show_id', 'title', 'director', 'cast', 'description'] [0, 1, 2, 3, 7]


In [159]:
df_impute = df_impute.drop(l_char_cols_names, axis=1)

In [162]:
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

# Random Forest Regression to Impute Missing Values
imputer = MissForest(max_iter=10, criterion=["squared_error"], min_impurity_decrease=0.0001)

imputed_data = imputer.fit_transform(df_impute)
df_imputed = pd.DataFrame(imputed_data, columns=df_impute.columns)

# Renable warnings
warnings.resetwarnings()

Iteration: 0
Iteration: 1
Iteration: 2


In [285]:
# Replace original variables with imputed variables
df_model_final = df_model.copy()
df_model_final = df_model_final.drop(["averageMovieRating", "numVotes"], axis=1)
df_model_final = pd.concat([df_model_final, df_imputed[["averageMovieRating", "numVotes"]]], axis=1)

In [286]:
# Create binned movie rating
df_model_final["bin_rating"] = df_model_final["averageMovieRating"]
kbin = KBinsDiscretizer(n_bins=10, encode="ordinal", strategy="uniform")
df_model_final["bin_rating"] = kbin.fit_transform(df_model_final[["bin_rating"]])

  and should_run_async(code)


In [287]:
# Reformat to one decimal point
df_model_final["averageMovieRating"] = df_model_final["averageMovieRating"].apply(lambda x: round(x, 1))

# Reformat to unit point
format_cols = ["release_year", "age_rating", "duration", "seasons", "year_added", "month_added", "bin_rating"]
df_model_final[format_cols] = df_model_final[format_cols].applymap(lambda x: int(x) if not pd.isnull(x) else None)

In [288]:
# Final data
df_model_final.sort_values(by="averageMovieRating", ascending=False).head()

Unnamed: 0,show_id,type,title,director,cast,country,release_year,age_rating,duration,listed_in,description,seasons,year_added,month_added,averageMovieRating,numVotes,bin_rating
7746,s7748,tv show,planet earth ii,,david attenborough,united kingdom,2016,1.0,,"british,docu,science&nature","in this sequel to the emmy-winning ""planet ear...",1.0,2017.0,12.0,9.5,152304.0,9
6341,s6342,tv show,blue planet ii,,david attenborough,united kingdom,2017,1.0,,"british,docu,science&nature",this sequel to the award-winning nature series...,1.0,2018.0,12.0,9.3,43917.0,9
3938,s3939,tv show,our planet,,david attenborough,"united states, united kingdom",2019,3.0,,"docu,science&nature",experience our planet's natural beauty and exa...,1.0,2019.0,4.0,9.3,47648.0,9
3071,s3072,tv show,reply 1988,,"lee hye-ri, park bo-gum, ryu jun-yeol, ko kyou...",south korea,2015,5.0,,"international,korean,romantic",take a nostalgic trip back to the late 1980s t...,1.0,2020.0,1.0,9.2,9222.0,9
8351,s8354,tv show,the hunt,,david attenborough,united kingdom,2015,3.0,,"british,docu,science&nature",witness some of the most riveting showdowns in...,1.0,2017.0,5.0,9.2,4252.0,9


In [289]:
print(f"(Raw DF)      Total Number of Records: {len(df_netflix)}")
print(f"(Raw DF)      Total Number of Missing Records for averageMovieRating: {df_model['averageMovieRating'].isna().sum()}")

print(f"(Cleansed DF) Total Number of Records: {len(df_netflix)}")
print(f"(Cleansed DF) Total Number of Missing Records for averageMovieRating: {df_model_final['averageMovieRating'].isna().sum()}")

(Raw DF)      Total Number of Records: 8805
(Raw DF)      Total Number of Missing Records for averageMovieRating: 3926
(Cleansed DF) Total Number of Records: 8805
(Cleansed DF) Total Number of Missing Records for averageMovieRating: 0


In [290]:
df_model_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8805 entries, 0 to 8804
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   show_id             8805 non-null   object 
 1   type                8805 non-null   object 
 2   title               8805 non-null   object 
 3   director            6172 non-null   object 
 4   cast                7980 non-null   object 
 5   country             7974 non-null   object 
 6   release_year        8805 non-null   int64  
 7   age_rating          8715 non-null   float64
 8   duration            6127 non-null   float64
 9   listed_in           8805 non-null   object 
 10  description         8805 non-null   object 
 11  seasons             2675 non-null   float64
 12  year_added          8795 non-null   float64
 13  month_added         8795 non-null   float64
 14  averageMovieRating  8805 non-null   float64
 15  numVotes            8805 non-null   float64
 16  bin_ra

#**EDA**

In [None]:
!pip install pandas-profiling
from pandas_profiling import ProfileReport

In [206]:
prof = ProfileReport(df_model_final)
prof

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [207]:
from sklearn.feature_extraction.text import TfidfVectorizer



In [209]:
df_model_final.head()

  and should_run_async(code)


Unnamed: 0,show_id,type,title,director,cast,country,release_year,age_rating,duration,listed_in,description,seasons,year_added,month_added,averageMovieRating,numVotes,bin_rating
0,s1,movie,dick johnson is dead,kirsten johnson,,united states,2020,4.0,90.0,documentaries,"as her father nears the end of his life, filmm...",,2021.0,9.0,6.724845,3750.09,6.0
1,s2,tv show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021,6.0,,"international,dramas,mysteries","after crossing paths at a party, a cape town t...",2.0,2021.0,9.0,6.876004,10381.94,6.0
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021,6.0,,"crime,international,action&adventure",to protect his family from a powerful drug lor...,1.0,2021.0,9.0,6.92081,14364.68,6.0
3,s4,tv show,jailbirds new orleans,,,,2021,6.0,,"docu,reality","feuds, flirtations and toilet talk go down amo...",1.0,2021.0,9.0,6.6,277.0,6.0
4,s5,tv show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021,6.0,,"international,romantic,comedies",in a city of coaching centers known to train i...,2.0,2021.0,9.0,6.223327,10710.88,5.0


In [284]:
df_model_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8805 entries, 0 to 8804
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   show_id             8805 non-null   object 
 1   type                8805 non-null   object 
 2   title               8805 non-null   object 
 3   director            6172 non-null   object 
 4   cast                7980 non-null   object 
 5   country             7974 non-null   object 
 6   release_year        8805 non-null   int64  
 7   age_rating          8715 non-null   float64
 8   duration            6127 non-null   float64
 9   listed_in           8805 non-null   object 
 10  description         8805 non-null   object 
 11  seasons             2675 non-null   float64
 12  year_added          8795 non-null   float64
 13  month_added         8795 non-null   float64
 14  averageMovieRating  8805 non-null   object 
 15  numVotes            8805 non-null   float64
 16  bin_ra

  and should_run_async(code)


#**Recommendation**

In [291]:
df_out = df_model_final.copy()

df_out = df_out.fillna("")

features = ["title", "type", "director", "cast", "age_rating", "duration", "seasons", "listed_in", "description"]

df_out[features] = df_out[features].applymap(str)

df_out["concat_text"] = df_out[features].apply(lambda row: " ".join(row), axis=1)

vectorizer = TfidfVectorizer(max_df=0.7,            # drop words that occur in more than X percent of documents
                             min_df=1,              # only use words that appear at least X times
                             stop_words="english",  # remove stop words
                             lowercase=True,        # Convert everything to lower case
                             use_idf=True,          # Use idf
                             norm=u"l2",            # Normalization
                             smooth_idf=True        # Prevents divide-by-zero errors
                            )

text = df_out["concat_text"]

tfidf = vectorizer.fit_transform(text)

In [292]:
print(f"shape: {tfidf.shape}")
print(tfidf[0])

shape: (8805, 53064)
  (0, 22111)	0.2654772341972783
  (0, 15592)	0.15672509023211859
  (0, 20385)	0.12950182464674362
  (0, 50805)	0.17847463212060277
  (0, 10030)	0.24499604618233817
  (0, 22451)	0.24499604618233817
  (0, 11741)	0.14022499550621717
  (0, 44794)	0.21995999717290832
  (0, 16307)	0.18062453340664744
  (0, 27334)	0.09710586529929292
  (0, 14706)	0.16997417106462215
  (0, 32773)	0.2404411851878485
  (0, 15917)	0.12749617521297682
  (0, 13123)	0.09787512945083007
  (0, 415)	0.14853665011544473
  (0, 25371)	0.43584277665743665
  (0, 31762)	0.04022748112749547
  (0, 11714)	0.16997417106462215
  (0, 23637)	0.4709233285124305
  (0, 12617)	0.22214974756597808


  and should_run_async(code)


In [293]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf, tfidf)

In [294]:
indices = pd.Series(df_out.index, index=df_out['title']).drop_duplicates()

In [295]:
df_out.head()

  and should_run_async(code)


Unnamed: 0,show_id,type,title,director,cast,country,release_year,age_rating,duration,listed_in,description,seasons,year_added,month_added,averageMovieRating,numVotes,bin_rating,concat_text
0,s1,movie,dick johnson is dead,kirsten johnson,,united states,2020,4.0,90.0,documentaries,"as her father nears the end of his life, filmm...",,2021.0,9.0,6.7,3750.09,6,dick johnson is dead movie kirsten johnson 4....
1,s2,tv show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021,6.0,,"international,dramas,mysteries","after crossing paths at a party, a cape town t...",2.0,2021.0,9.0,6.9,10381.94,6,"blood & water tv show ama qamata, khosi ngema..."
2,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021,6.0,,"crime,international,action&adventure",to protect his family from a powerful drug lor...,1.0,2021.0,9.0,6.9,14364.68,6,ganglands tv show julien leclercq sami bouajil...
3,s4,tv show,jailbirds new orleans,,,,2021,6.0,,"docu,reality","feuds, flirtations and toilet talk go down amo...",1.0,2021.0,9.0,6.6,277.0,6,"jailbirds new orleans tv show 6.0 1.0 docu,..."
4,s5,tv show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021,6.0,,"international,romantic,comedies",in a city of coaching centers known to train i...,2.0,2021.0,9.0,6.2,10710.88,5,"kota factory tv show mayur more, jitendra kum..."


In [324]:
def get_recommendations(title, min_release_year=1900, min_rating=1, cosine_sim=cosine_sim):
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df_out.iloc[movie_indices].loc[(df_out["averageMovieRating"] >= min_rating) & (df_out["release_year"] >= min_release_year) , ["title", "type", "director", "cast", "country", "release_year", "age_rating", "averageMovieRating"]]

In [None]:
# np.set_printoptions(threshold=np.inf)
print(df_model["title"].unique())

In [326]:
df_out[df_out["title"] == "the lord of the rings: the return of the king"]

Unnamed: 0,show_id,type,title,director,cast,country,release_year,age_rating,duration,listed_in,description,seasons,year_added,month_added,averageMovieRating,numVotes,bin_rating,concat_text
8402,s8405,movie,the lord of the rings: the return of the king,peter jackson,"elijah wood, ian mckellen, liv tyler, viggo mo...","new zealand, united states",2003,4.0,201.0,"action&adventure,sci-fi&fantasy",aragorn is revealed as the heir to the ancient...,,2020.0,1.0,9.0,1905684.0,9,the lord of the rings: the return of the king ...


In [327]:
get_recommendations("terminator salvation", min_release_year=2010, min_rating=5)

  and should_run_async(code)


Unnamed: 0,title,type,director,cast,country,release_year,age_rating,averageMovieRating
3551,the dark crystal: age of resistance,tv show,,"taron egerton, nathalie emmanuel, anya taylor-...","united kingdom, united states",2019,3.0,8.4
763,sweet tooth,tv show,,"nonso anozie, christian convery, adeel akhtar,...",united states,2021,5.0,7.8
2950,dragonheart: vengeance,movie,ivan silvestrini,"joseph millson, jack kane, helena bonham carte...",united states,2020,4.0,5.4
