### Import Dependencies

In [None]:
import kagglehub
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the Netflix titles dataset from a CSV file into a pandas DataFrame.
# The path is relative, so 'netflix_titles.csv' must be present in the kernel's working directory.
df = pd.read_csv('netflix_titles.csv')

# Preview the first 5 rows (the default row count of head())
# to check the available columns and get a feel for the values they hold
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# Print a structural summary of the DataFrame: number of entries, column names,
# non-null counts, data types and memory usage.
# Comparing each non-null count with the total entry count shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [None]:
# Keep only the text columns used to describe a title:
# title, director, cast and description.
# .copy() makes the result an independent DataFrame, so the column assignments
# performed in later cells do not raise pandas' SettingWithCopyWarning.
df = df[['title', 'director', 'cast', 'description']].copy()

# Count the missing values in each remaining column
df.isnull().sum()

Unnamed: 0,0
title,0
director,2634
cast,825
description,0


In [None]:
# 'director' and 'cast' are the columns with missing values.
# Replace each NaN with a single space so the columns stay plain strings
# and can be concatenated with the other text columns without producing NaN.
for column in ('director', 'cast'):
  df[column] = df[column].fillna(' ')

In [None]:
# Build 'str_info': one space-separated string per row made of the title,
# director, cast and description. This combined text is what gets vectorized
# with TF-IDF to compare titles by their content.
text_columns = ['title', 'director', 'cast', 'description']

combined_text = df[text_columns[0]]
for column in text_columns[1:]:
  combined_text = combined_text + ' ' + df[column]

df['str_info'] = combined_text
df['str_info']

Unnamed: 0,str_info
0,Dick Johnson Is Dead Kirsten Johnson As her ...
1,"Blood & Water Ama Qamata, Khosi Ngema, Gail ..."
2,"Ganglands Julien Leclercq Sami Bouajila, Tracy..."
3,"Jailbirds New Orleans Feuds, flirtations a..."
4,"Kota Factory Mayur More, Jitendra Kumar, Ran..."
...,...
8802,"Zodiac David Fincher Mark Ruffalo, Jake Gyllen..."
8803,Zombie Dumb While living alone in a spooky...
8804,"Zombieland Ruben Fleischer Jesse Eisenberg, Wo..."
8805,"Zoom Peter Hewitt Tim Allen, Courteney Cox, Ch..."


In [None]:
# The corpus is the combined text of every title, one document per row
corpus = df['str_info']

# TF-IDF vectorizer with default settings: turns raw text into numerical features
tfidf = TfidfVectorizer()

# fit_transform() learns the vocabulary and IDF weights from the corpus and
# returns a sparse matrix with one row per title and one column per vocabulary term,
# each cell holding the TF-IDF weight of that term in that title's text
matrix_tfidf = tfidf.fit_transform(corpus)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 350550 stored elements and shape (8807, 53215)>

In [None]:
# print() on the sparse matrix lists its stored entries as
# (row, column) coordinates together with their TF-IDF values
print(matrix_tfidf)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 350550 stored elements and shape (8807, 53215)>
  Coords	Values
  (0, 12550)	0.2150775093357489
  (0, 23636)	0.45593698797406484
  (0, 22569)	0.08837949344601481
  (0, 11645)	0.16456449180807528
  (0, 25371)	0.42196777097488825
  (0, 3250)	0.08650048102773192
  (0, 20460)	0.0787916947340342
  (0, 15869)	0.1234400444540753
  (0, 32794)	0.23278609582689921
  (0, 47349)	0.07830669861706371
  (0, 14650)	0.16456449180807528
  (0, 33973)	0.04786915226976016
  (0, 20802)	0.12894611903489175
  (0, 27337)	0.09401811257946623
  (0, 16263)	0.17487548357344307
  (0, 44874)	0.21295753467704076
  (0, 11672)	0.13576326473631503
  (0, 21958)	0.05183130312493094
  (0, 22446)	0.23719581755263408
  (0, 2301)	0.04417612908305705
  (0, 9957)	0.23719581755263408
  (0, 50928)	0.17279408824201017
  (0, 47832)	0.0447699409585643
  (0, 20362)	0.125381784474899
  (0, 47378)	0.12416430002039143
  :	:
  (8806, 527)	0.09685236398138443
  (8806, 24661)	0.

In [None]:
# Compute the cosine similarity between every pair of rows of the TF-IDF matrix.
# Entry [i, j] is the similarity between title i and title j; the diagonal is 1 (each title vs. itself).
# NOTE: the result is a dense square array with one row and one column per title
# (8807 x 8807 here, roughly 0.6 GB as float64), so memory grows quadratically with the catalogue size.
similarity_matrix = cosine_similarity(matrix_tfidf)
similarity_matrix

array([[1.00000000e+00, 9.01842810e-03, 1.82149243e-02, ...,
        7.86996905e-03, 1.17852298e-02, 2.61162831e-02],
       [9.01842810e-03, 1.00000000e+00, 8.27694492e-04, ...,
        9.93767487e-04, 0.00000000e+00, 0.00000000e+00],
       [1.82149243e-02, 8.27694492e-04, 1.00000000e+00, ...,
        5.29920652e-03, 5.65743735e-03, 3.08586401e-02],
       ...,
       [7.86996905e-03, 9.93767487e-04, 5.29920652e-03, ...,
        1.00000000e+00, 4.89109552e-02, 3.02687493e-03],
       [1.17852298e-02, 0.00000000e+00, 5.65743735e-03, ...,
        4.89109552e-02, 1.00000000e+00, 2.57099650e-03],
       [2.61162831e-02, 0.00000000e+00, 3.08586401e-02, ...,
        3.02687493e-03, 2.57099650e-03, 1.00000000e+00]])

### Movie recommendation: print the `n_recomendations` titles with the highest similarity to a given title

In [None]:
def movie_recommendation(title, n_recomendations=5, min_score=0.05, data=None, similarities=None):
  """Print the titles most similar to ``title``.

  Parameters
  ----------
  title : str
      Title to look up; it must match a value of the ``title`` column exactly
      (the comparison is case-sensitive). If several rows share the title,
      the first one is used.
  n_recomendations : int, default 5
      Maximum number of recommendations to print.
  min_score : float, default 0.05
      Candidates are kept only if their similarity is strictly greater
      than this threshold.
  data : pandas.DataFrame, optional
      Frame with a ``title`` column. Defaults to the notebook-level ``df``.
  similarities : 2-D array-like, optional
      Pairwise similarity scores whose row/column order follows the row
      order of ``data``. Defaults to the notebook-level ``similarity_matrix``.

  Returns
  -------
  str or None
      ``'Movie not found'`` when ``title`` is not in the catalogue;
      otherwise ``None`` (the recommendations are printed, not returned).
  """
  # Fall back to the notebook-level objects when no explicit inputs are given
  catalog = df if data is None else data
  score_table = similarity_matrix if similarities is None else similarities

  # Locate the title by row POSITION rather than by index label: the similarity
  # matrix and .iloc are positional, so labels are only correct by coincidence
  # when the frame happens to carry a default RangeIndex.
  positions = (catalog['title'] == title).to_numpy().nonzero()[0]
  if len(positions) == 0:
    return 'Movie not found'
  idx_movie = int(positions[0])

  # Keep every other title whose similarity exceeds the threshold
  candidates = [
    (idx, score)
    for idx, score in enumerate(score_table[idx_movie])
    if idx != idx_movie and score > min_score
  ]

  # Highest similarity first, truncated to the requested number of results
  top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:n_recomendations]

  # Display the recommendations; \033[1m / \033[0m switch bold text on and off
  print("\033[1mRecomended movies:\033[0m")
  for idx, score in top_matches:
    print("\033[1mtitle: \033[0m" + catalog.iloc[idx]['title'] + f" ---> \033[1msimilarity:\033[0m {round(score, 4)}")



# Example: show the 5 titles most similar to 'Iron Man: Armored Adventures'
movie_recommendation('Iron Man: Armored Adventures', n_recomendations=5)

[1mRecomended movies:[0m
[1mtitle: [0mAvengers Climate Conundrum ---> [1msimilarity:[0m 0.1191
[1mtitle: [0mIron Man & Captain America: Heroes United ---> [1msimilarity:[0m 0.1152
[1mtitle: [0mMarvel's Iron Man & Hulk: Heroes United ---> [1msimilarity:[0m 0.1079
[1mtitle: [0mDEATH NOTE ---> [1msimilarity:[0m 0.1003
[1mtitle: [0mTobot Galaxy Detectives ---> [1msimilarity:[0m 0.0996
