In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Location of the input CSV, kept in one named constant so it is easy to find.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
NETFLIX_CSV = "C://Users/admin/Documents/Netflix/netflix_forsql.csv"
df = pd.read_csv(NETFLIX_CSV)

In [8]:
# Count missing values per column (isna is the method isnull aliases).
df.isna().sum()

show_id                   0
type                      0
title                     0
director                  0
country                   0
date_added                0
date(Corrected format)    0
release_year              0
rating                    0
duration                  0
listed_in                 0
dtype: int64

In [12]:
# Fill missing values in the text columns with a placeholder label.
# The result is assigned back to df rather than calling
# df[col].fillna(..., inplace=True): that form mutates a temporary column
# selection and, under pandas copy-on-write, leaves df itself unchanged.
text_cols = ['director', 'listed_in', 'country']
df[text_cols] = df[text_cols].fillna('Not Given')

In [16]:
# Parse the 'date(Corrected format)' column into datetimes.
# errors='coerce' turns any value that cannot be parsed into NaT.
date_col = 'date(Corrected format)'
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [24]:
# Convert 'duration' to minutes
def convert_duration(x, minutes_per_season=60):
    """Convert a duration label into a number of minutes.

    Parameters
    ----------
    x : str
        Duration label such as ``"90 min"`` or ``"2 Seasons"``. The leading
        whitespace-separated token must be an integer.
    minutes_per_season : int, optional
        Minutes attributed to a single season. The default of 60 keeps the
        original rough assumption (1 season = 60 mins).

    Returns
    -------
    int or float
        Minutes as an ``int``; ``np.nan`` when ``x`` is not a string, contains
        neither ``'min'`` nor ``'Season'``, or does not start with an integer.
    """
    # Missing values arrive as float NaN / None; treat anything non-string as unknown.
    if not isinstance(x, str):
        return np.nan

    if 'min' in x:
        factor = 1
    elif 'Season' in x:
        factor = minutes_per_season
    else:
        return np.nan

    # Only a malformed leading token can fail here, so catch just ValueError
    # instead of a bare except that would also hide KeyboardInterrupt etc.
    try:
        return int(x.split()[0]) * factor
    except ValueError:
        return np.nan

# Apply the converter element-wise to every duration label.
df['duration_mins'] = df['duration'].map(convert_duration)

In [30]:
# Calendar year taken from the parsed date column.
parsed_dates = df['date(Corrected format)']
df['year'] = parsed_dates.dt.year

# Number of comma-separated genre entries per row
# (a string with k commas splits into k + 1 parts).
df['num_genres'] = df['listed_in'].map(lambda genres: str(genres).count(',') + 1)

In [41]:
# Build the text fed to TF-IDF: missing genres become an empty string and
# commas are replaced by spaces (literal replacement, no regex involved).
genre_text = df['listed_in'].fillna('')
df['genres_clean'] = genre_text.str.replace(',', ' ', regex=False)

In [46]:
# Vectorise the cleaned genre strings with TF-IDF, dropping English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['genres_clean'])

# Pairwise cosine similarity of every row against every other row
# (passing a single matrix compares it with itself).
cosine_sim = cosine_similarity(tfidf_matrix)

In [51]:
# Recommendation function
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the five titles whose genres are most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up; matching against ``df['title']`` is case-insensitive.
        If several rows share the title, the first one is used.
    cosine_sim : array-like, optional
        Square similarity matrix in which row ``i`` corresponds to the
        ``i``-th row of ``df`` (by position).

    Returns
    -------
    pandas.DataFrame or str
        Up to five rows of ``df`` with the columns ``title``, ``listed_in`` and
        ``type``, ordered by decreasing similarity, or the string
        ``"Title not found in the dataset."`` when no row matches.
    """
    wanted = title.lower()
    # Work with positions, not index labels: cosine_sim and .iloc are both
    # positional, so this stays correct even if df has a non-default index.
    hits = np.flatnonzero((df['title'].str.lower() == wanted).to_numpy())
    if hits.size == 0:
        return "Title not found in the dataset."
    idx = int(hits[0])

    # Exclude the query row explicitly. Slicing off the first sorted entry is
    # wrong when other titles have identical genres: they tie with the query
    # at the top score and the stable sort may place one of them first, which
    # would drop that title and recommend the query title to itself.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:5]
    rec_indexes = [pos for pos, _ in sim_scores]
    return df[['title', 'listed_in', 'type']].iloc[rec_indexes]


In [59]:
# Example lookup: print a header, then the recommendations for one title.
print("Recommendations for 'Narcos':")
narcos_recs = get_recommendations('Narcos')
print(narcos_recs)

Recommendations for 'Narcos':
                       title  \
7219      Queen of the South   
7342          Altered Carbon   
7356          Narcos: Mexico   
7656                 Shooter   
7670  Marvel's Jessica Jones   

                                             listed_in     type  
7219  Crime TV Shows, TV Action & Adventure, TV Dramas  TV Show  
7342  Crime TV Shows, TV Action & Adventure, TV Dramas  TV Show  
7356  Crime TV Shows, TV Action & Adventure, TV Dramas  TV Show  
7656  Crime TV Shows, TV Action & Adventure, TV Dramas  TV Show  
7670  Crime TV Shows, TV Action & Adventure, TV Dramas  TV Show  


In [61]:
# Clustering Netflix titles: select the numeric inputs for the model.
# Missing values are replaced by 0.
# NOTE(review): a missing 'year' therefore becomes year 0 -- confirm that is intended.
feature_cols = ['duration_mins', 'year', 'num_genres']
features = df[feature_cols].fillna(0)

In [66]:
# Standardise each feature column; the fitted scaler is kept for later use.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

In [73]:
# Fit KMeans on the scaled features and store each row's cluster label.
# NOTE(review): n_init is left at the library default, which has changed
# between scikit-learn releases -- consider pinning it for reproducibility.
N_CLUSTERS = 5
RANDOM_STATE = 42
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
kmeans.fit(scaled_features)
df['cluster'] = kmeans.labels_
