In [1]:
import re
import string
import unicodedata

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
# Make sure the NLTK 'stopwords' corpus is available locally; the later
# stopwords.words('english') call reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the catalogue from a CSV given by a path relative to the working
# directory, then preview the first rows.
df = pd.read_csv("NetflixSimple.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# Replace missing values in the free-text columns with empty strings so the
# string concatenation in the next step never meets NaN.
text_columns = ['description', 'listed_in', 'cast', 'director']
df[text_columns] = df[text_columns].fillna("")

In [4]:
# Join genres, synopsis, cast and director into a single space-separated
# text field per title, then preview it next to the title.
df['combined_text'] = df['listed_in'].str.cat(
    [df['description'], df['cast'], df['director']],
    sep=" ",
)
df[['title', 'combined_text']].head()

Unnamed: 0,title,combined_text
0,3%,"International TV Shows, TV Dramas, TV Sci-Fi &..."
1,7:19,"Dramas, International Movies After a devastati..."
2,23:59,"Horror Movies, International Movies When an ar..."
3,9,"Action & Adventure, Independent Movies, Sci-Fi..."
4,21,Dramas A brilliant group of students become ca...


In [5]:
# Materialise the English stop-word list once as a set, so the per-token
# `in` check inside preprocess() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
def preprocess(text):
    """Normalise raw text into a string of lowercase, stop-word-free tokens.

    Steps: lowercase, strip diacritics, replace every character other than
    ``a-z`` and space with a space, split on whitespace, and drop tokens that
    appear in the module-level ``stop_words`` collection.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN from a missing cell) is
        treated as empty text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ("" if none remain).
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Decompose accented letters ("ã" -> "a" + combining tilde) and drop the
    # combining marks. Without this the regex below turns each accented letter
    # into a space, splitting names such as "João" into "jo" and "o".
    # Letters with no decomposition (e.g. "ø", "ß") are still removed below.
    text = "".join(
        ch for ch in unicodedata.normalize('NFKD', text)
        if not unicodedata.combining(ch)
    )
    text = re.sub(r'[^a-z ]', ' ', text)
    words = [w for w in text.split() if w not in stop_words]
    return " ".join(words)
# Clean every row's combined text. The result goes into a new column, so the
# unprocessed 'combined_text' stays available for comparison.
df['processed_text'] = df['combined_text'].apply(preprocess)
df[['title', 'processed_text']].head()

Unnamed: 0,title,processed_text
0,3%,international tv shows tv dramas tv sci fi fan...
1,7:19,dramas international movies devastating earthq...
2,23:59,horror movies international movies army recrui...
3,9,action adventure independent movies sci fi fan...
4,21,dramas brilliant group students become card co...


In [6]:
# Vectorise the cleaned text with TF-IDF over unigrams and bigrams.
# min_df=2 ignores terms that occur in fewer than two documents, and
# max_features=3000 caps the vocabulary size (hence 3000 output columns).
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1, 2), min_df=2)
tfidf_matrix = tfidf.fit_transform(df['processed_text'])
print("TF-IDF shape:", tfidf_matrix.shape)

TF-IDF shape: (7787, 3000)


In [7]:
# Project the TF-IDF matrix onto 100 components with truncated SVD; the
# clustering and similarity steps below work on this compressed matrix.
# random_state is fixed so the decomposition is repeatable between runs.
svd_model = TruncatedSVD(n_components=100, random_state=1)
compressed_features = svd_model.fit_transform(tfidf_matrix)
print("Compressed Feature Shape:", compressed_features.shape)

Compressed Feature Shape: (7787, 100)


In [8]:
# Cluster the titles in the compressed feature space. min() caps the cluster
# count at the number of rows, so KMeans is never asked for more clusters
# than there are samples.
total_clusters = min(6, len(df))
# NOTE(review): n_init is left at the library default, which is believed to
# differ between scikit-learn releases -- consider pinning it explicitly so
# group_id assignments are reproducible; confirm against the installed version.
cluster_model = KMeans(n_clusters=total_clusters, random_state=1)
df['group_id'] = cluster_model.fit_predict(compressed_features)
print("Clusters Used:", total_clusters)
# The bare expression below displays the whole frame (pandas truncates the
# rendering); add .head() if only a preview is wanted.
df[['title', 'group_id']]

Clusters Used: 6


Unnamed: 0,title,group_id
0,3%,1
1,7:19,3
2,23:59,3
3,9,3
4,21,3
...,...,...
7782,Zozo,3
7783,Zubaan,3
7784,Zulu Man in Japan,3
7785,Zumbo's Just Desserts,1


In [9]:
# Precompute cosine similarity between every pair of rows; suggest_titles()
# indexes this matrix by row position.
# NOTE(review): the result is a dense n_rows x n_rows array (about 60.6M
# entries for the 7787 rows reported above) -- watch memory on larger data.
similarity_scores = cosine_similarity(compressed_features)
def suggest_titles(movie_name, limit=3):
    """Return the titles most similar to ``movie_name``.

    The title is looked up in the module-level ``df`` and every other row is
    ranked by its score in the module-level ``similarity_scores`` matrix.

    Parameters
    ----------
    movie_name : str
        Title to look up; must match a value in ``df['title']`` exactly.
    limit : int, default 3
        Maximum number of suggestions. Values below 1 give an empty list.

    Returns
    -------
    list of str or str
        Titles ordered from most to least similar, or the string
        ``"Movie not found"`` when the title is not in ``df``.
    """
    # Resolve the row *position*: similarity_scores is a positional array,
    # whereas df.index holds labels that need not equal positions.
    matches = np.flatnonzero((df['title'] == movie_name).to_numpy())
    if matches.size == 0:
        return "Movie not found"
    position = int(matches[0])  # first occurrence if the title is duplicated

    scores = np.asarray(similarity_scores[position])
    # Stable descending sort: tied scores keep their row order.
    order = np.argsort(-scores, kind='stable')
    # Exclude the query row by position instead of assuming it ranks first;
    # a tie with another row, or a zero self-similarity, would otherwise
    # return the query title itself and drop a real neighbour.
    order = order[order != position][:max(limit, 0)]

    return df['title'].iloc[order].tolist()

In [11]:
# Try the recommender on one title, using the default limit of 3 suggestions.
suggest_titles("3 Idiots")

['Dil Dhadakne Do', 'PK', 'Upstarts']

In [12]:
# Spot-check cluster assignments next to type and genres for the first ten rows.
df[['title', 'type', 'listed_in', 'group_id']].head(10).reset_index(drop=True)

Unnamed: 0,title,type,listed_in,group_id
0,3%,TV Show,"International TV Shows, TV Dramas, TV Sci-Fi &...",1
1,7:19,Movie,"Dramas, International Movies",3
2,23:59,Movie,"Horror Movies, International Movies",3
3,9,Movie,"Action & Adventure, Independent Movies, Sci-Fi...",3
4,21,Movie,Dramas,3
5,46,TV Show,"International TV Shows, TV Dramas, TV Mysteries",1
6,122,Movie,"Horror Movies, International Movies",3
7,187,Movie,Dramas,3
8,706,Movie,"Horror Movies, International Movies",3
9,1920,Movie,"Horror Movies, International Movies, Thrillers",3
