In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Set Plot Style
plt.style.use('seaborn-v0_8-darkgrid')

# Load and Clean Data
def clean_data(df):
    df['director'] = df['director'].fillna('No Director')
    df['cast'] = df['cast'].fillna('No Cast')
    df['country'] = df['country'].fillna('No Country')
    df['rating'] = df['rating'].fillna(method='ffill')
    df['description'] = df['description'].str.replace('[^a-zA-Z]', ' ', regex=True).str.lower()
    df['listed_in'] = df['listed_in'].fillna('No Genre')  # Handle genres
    return df

# Preprocess Text and Reduce Dimensions
def preprocess_descriptions(df):
    tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
    tfidf_matrix = tfidf.fit_transform(df['description'])
    svd = TruncatedSVD(n_components=50, random_state=42)
    reduced = svd.fit_transform(tfidf_matrix)
    return tfidf, tfidf_matrix, reduced

# Clustering Function
def perform_clustering(data, method='kmeans', n_clusters=4):
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(data)
    return model, labels

# Recommendation Function
def recommend_titles(title, df, tfidf_matrix, labels, n_recommendations=5):
    if title not in df['title'].values:
        return f"Title '{title}' not found in the dataset."
    
    idx = df.index[df['title'] == title].tolist()[0]
    title_cluster = labels[idx]
    cluster_indices = df.index[labels == title_cluster].tolist()
    
    similarities = cosine_similarity(tfidf_matrix[idx:idx+1], tfidf_matrix[cluster_indices]).flatten()
    similar_indices = np.argsort(similarities)[::-1][1:n_recommendations+1]
    recommended_titles = df.iloc[[cluster_indices[i] for i in similar_indices]][['title', 'listed_in']].values
    
    return recommended_titles

# Cluster Profiling: Analyze Genres and Countries
def analyze_cluster_profiles(df, labels, n_clusters=4):
    profiles = []
    for i in range(n_clusters):
        cluster_data = df[labels == i]
        # Top genres
        genres = cluster_data['listed_in'].str.split(', ', expand=True).stack().value_counts().head(5)
        # Top countries
        countries = cluster_data['country'].str.split(', ', expand=True).stack().value_counts().head(5)
        profiles.append({
            'Cluster': i,
            'Top Genres': genres.to_dict(),
            'Top Countries': countries.to_dict()
        })
        print(f"\nCluster {i} Profile:")
        print(f"Top Genres: {', '.join([f'{g} ({c})' for g, c in genres.items()])}")
        print(f"Top Countries: {', '.join([f'{c} ({n})' for c, n in countries.items()])}")
    return profiles

# Main Analysis
def main():
    # Load Data
    try:
        netflix = pd.read_csv("C:\\Users\\abhiv\\Downloads\\NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv")
    except FileNotFoundError:
        print("Error: 'NETFLIX_MOVIES_AND_TV_SHOWS.csv' not found. Please ensure the file is in the same directory.")
        return
    
    netflix = clean_data(netflix)
    
    # Preprocess and Cluster
    tfidf, tfidf_matrix, tfidf_reduced = preprocess_descriptions(netflix)
    kmeans_model, kmeans_labels = perform_clustering(tfidf_reduced, method='kmeans', n_clusters=4)
    netflix['KMeans_Cluster'] = kmeans_labels
    
    # Evaluate Clustering
    print("Silhouette Score (KMeans):", silhouette_score(tfidf_reduced, kmeans_labels))
    
    # Top Terms per Cluster
    feature_names = tfidf.get_feature_names_out()
    centers = kmeans_model.cluster_centers_
    print("\nTop Terms per KMeans Cluster:")
    for i in range(centers.shape[0]):
        top_indices = centers[i].argsort()[-10:][::-1]
        top_terms = [feature_names[j] for j in top_indices]
        print(f"Cluster {i}: {', '.join(top_terms)}")
    
    # Interactive Cluster Visualization
    tsvd = TruncatedSVD(n_components=2)
    projected = tsvd.fit_transform(tfidf_matrix)
    plot_df = pd.DataFrame({
        'Component 1': projected[:, 0],
        'Component 2': projected[:, 1],
        'Cluster': kmeans_labels,
        'Title': netflix['title'],
        'Description': netflix['description'],
        'Genre': netflix['listed_in']
    })
    fig = px.scatter(plot_df, x='Component 1', y='Component 2', color='Cluster',
                     hover_data=['Title', 'Description', 'Genre'],
                     title="KMeans Clusters of Netflix Content",
                     color_continuous_scale='Viridis')
    fig.update_layout(showlegend=True)
    fig.update_xaxes(title_text="Component 1")
    fig.update_yaxes(title_text="Component 2")
    fig.update_traces(marker=dict(size=8))
    fig.write_html("netflix_clusters.html")  # Save for portfolio
    
    # Cluster Profiling
    print("\nCluster Profiles:")
    profiles = analyze_cluster_profiles(netflix, kmeans_labels)
    
    # Genre Distribution Bar Chart
    genre_counts = []
    for i in range(4):
        cluster_data = netflix[netflix['KMeans_Cluster'] == i]
        top_genres = cluster_data['listed_in'].str.split(', ', expand=True).stack().value_counts().head(5)
        for genre, count in top_genres.items():
            genre_counts.append({'Cluster': f'Cluster {i}', 'Genre': genre, 'Count': count})
    
    genre_df = pd.DataFrame(genre_counts)
    fig_genre = px.bar(genre_df, x='Cluster', y='Count', color='Genre',
                       title="Top Genres by Cluster",
                       barmode='group',
                       color_discrete_sequence=px.colors.qualitative.Set2)
    fig_genre.update_layout(xaxis_title="Cluster", yaxis_title="Number of Titles")
    fig_genre.write_html("netflix_genre_distribution.html")  # Save for portfolio
    
    # Save Profiles to CSV
    profile_df = pd.DataFrame([
        {'Cluster': p['Cluster'], 'Top Genres': str(p['Top Genres']), 'Top Countries': str(p['Top Countries'])}
        for p in profiles
    ])
    profile_df.to_csv("netflix_cluster_profiles.csv", index=False)
    
    # Example Recommendation
    sample_title = netflix['title'].iloc[0]  # Pick first title for demo
    print(f"\nRecommendations for '{sample_title}':")
    recommendations = recommend_titles(sample_title, netflix, tfidf_matrix, kmeans_labels)
    if isinstance(recommendations, str):
        print(recommendations)
    else:
        for i, (title, genres) in enumerate(recommendations, 1):
            print(f"{i}. {title} (Genres: {genres})")
    
    # Save Recommendations to CSV
    if not isinstance(recommendations, str):
        rec_df = pd.DataFrame(recommendations, columns=['Title', 'Genres'])
        rec_df.to_csv(f"recommendations_{sample_title.replace('/', '_')}.csv", index=False)

if __name__ == "__main__":
    main()

  super()._check_params_vs_input(X, default_n_init=10)


Silhouette Score (KMeans): 0.09886674954224818

Top Terms per KMeans Cluster:
Cluster 0: abandoned, accident, accidentally, africa, adventure, alien, america, age, animated, agents
Cluster 1: abandoned, accidentally, actor, accused, actress, action, activist, adventures, ago, apart
Cluster 2: abandoned, academy, adventures, age, aged, activist, accused, act, adventure, alien
Cluster 3: abandoned, act, affair, ancient, action, accused, arranged, actor, actress, army

Cluster Profiles:

Cluster 0 Profile:
Top Genres: Dramas (154), International Movies (136), Comedies (129), International TV Shows (80), Independent Movies (71)
Top Countries: United States (300), India (47), Canada (45), United Kingdom (41), No Country (33)

Cluster 1 Profile:
Top Genres: International Movies (195), Dramas (189), International TV Shows (116), Comedies (113), Documentaries (84)
Top Countries: United States (274), India (68), No Country (57), United Kingdom (55), Spain (25)

Cluster 2 Profile:
Top Genres: In