# In [None]:
# Netflix Recommendation System


# Importing essential libraries for data processing, recommendation, and visualization
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import warnings
warnings.filterwarnings('ignore')

# Setting up a professional visualization theme
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("muted")
# Apply the shared figure defaults (size, title/label font sizes, font family)
# in a single rcParams update.
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'axes.titlesize': 16,
    'axes.labelsize': 12,
    'font.family': 'Arial',
})

# --- 1. Data Loading and Preprocessing ---
# Read the raw Netflix catalogue into a DataFrame
df = pd.read_csv('netflixData.csv')

# Print a quick summary: dimensions, per-column dtypes, and the first rows
print("Dataset Overview:")
print(f"Shape: {df.shape}")
for heading, payload in (
    ("\nColumn Types:", df.dtypes),
    ("\nFirst 5 Rows:", df.head()),
):
    print(heading)
    print(payload)

# Handle missing values
# Each filled column is assigned back to the frame. Calling
# `fillna(..., inplace=True)` on a column selection is chained in-place
# assignment: it is deprecated in pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled.
missing_value_defaults = {
    'Director': 'Unknown',
    'Cast': 'Unknown',
    'Description': '',
    'Genres': '',
}
for column, default in missing_value_defaults.items():
    df[column] = df[column].fillna(default)

# Clean text data
def clean_text(text):
    """Normalise a text value for feature extraction.

    Non-string input yields an empty string. For strings, every character
    that is neither a word character nor whitespace is removed, then the
    result is lower-cased and stripped of leading/trailing whitespace.
    """
    if isinstance(text, str):
        without_punctuation = re.sub(r'[^\w\s]', '', text)
        return without_punctuation.lower().strip()
    return ''

# Build one text field per title from the cleaned genre, description, cast
# and director columns; this is the input to the recommender.
FEATURE_COLUMNS = ('Genres', 'Description', 'Cast', 'Director')


def _combine_row_features(row):
    """Join the cleaned feature columns of one row with single spaces."""
    return ' '.join(clean_text(row[column]) for column in FEATURE_COLUMNS)


df['Combined_Features'] = df.apply(_combine_row_features, axis=1)

# --- 2. Exploratory Data Analysis (EDA) ---
# Genre distribution
# Split the comma-separated genre strings into one label per entry. Titles
# whose genre was missing were filled with '' earlier in the script; drop that
# placeholder so it is not counted (and plotted) as a genre of its own.
genre_labels = df['Genres'].str.split(', ', expand=True).stack()
genres = genre_labels[genre_labels != ''].value_counts().head(10)
fig1 = px.bar(x=genres.index, y=genres.values,
              title='Top 10 Genres on Netflix',
              labels={'x': 'Genre', 'y': 'Number of Titles'},
              color=genres.index, color_discrete_sequence=px.colors.qualitative.Set2)
fig1.update_layout(showlegend=False, height=500, title_x=0.5)
fig1.show()

# Layout shared by the two charts below: fixed height, centred title
chart_layout = {'height': 500, 'title_x': 0.5}

# Rating distribution: histogram of titles per content rating
fig2 = px.histogram(
    df,
    x='Rating',
    title='Distribution of Content Ratings',
    labels={'Rating': 'Content Rating', 'count': 'Number of Titles'},
    color_discrete_sequence=['#636EFA'],
)
fig2.update_layout(**chart_layout)
fig2.show()

# Content type analysis: share of each value in the 'Content Type' column
fig3 = px.pie(
    df,
    names='Content Type',
    title='Movies vs. TV Shows',
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig3.update_layout(**chart_layout)
fig3.show()

# --- 3. Recommendation System ---
# Create TF-IDF matrix for combined features
# English stop words are removed; terms appearing in more than 80% of the
# documents or in fewer than 2 documents are ignored.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.8, min_df=2)
tfidf_matrix = tfidf.fit_transform(df['Combined_Features'])

# Compute cosine similarity between every pair of titles
similarity = cosine_similarity(tfidf_matrix)

# Create title-to-index mapping
# De-duplicate on the Title index, keeping the first row for each title.
# `Series.drop_duplicates()` compares the Series *values* (the row labels
# taken from df.index), so it would leave repeated titles in place and a
# lookup of such a title would return several rows instead of one.
indices = pd.Series(df.index, index=df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

def netflix_recommendation(title, similarity=similarity, top_n=10):
    """Recommend the titles most similar to ``title``.

    Args:
        title: Exact title to look up in the module-level ``indices`` mapping.
        similarity: Pairwise similarity scores; row ``i`` holds the scores of
            the title at position ``i`` of ``df`` against every other title.
            Defaults to the module-level similarity matrix.
        top_n: Maximum number of recommendations to return.

    Returns:
        A DataFrame with the columns 'Title', 'Genres', 'Content Type',
        'Imdb Score', 'Duration' and 'Similarity Score', ordered from most to
        least similar, or a message string when ``title`` is not in the
        dataset.
    """
    if title not in indices:
        return f"Title '{title}' not found in the dataset."

    index = indices[title]
    # A title that occurs more than once makes the lookup return several
    # rows; fall back to the first occurrence.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    scores = np.asarray(similarity[index])
    # Stable descending sort: tied scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query title by position instead of assuming it is ranked
    # first; ties (e.g. identical feature text) or an all-zero feature vector
    # can place another row ahead of it.
    ranked = ranked[ranked != index][:max(top_n, 0)]

    # Copy the selection so the extra column is added to an independent frame
    # rather than to a slice of `df`.
    recommendations = df.iloc[ranked][
        ['Title', 'Genres', 'Content Type', 'Imdb Score', 'Duration']
    ].copy()
    recommendations['Similarity Score'] = scores[ranked]
    return recommendations

# Demonstrate the recommender on a single sample title
example_title = '#Alive'
print(f"\nRecommendations for '{example_title}':")
print(netflix_recommendation(example_title))

# --- 4. Key Insights ---
print("\nKey Insights for Presentation:")
# Report the row count of the loaded frame instead of a hard-coded figure, so
# the statement stays correct if the dataset file changes.
print(f"1. Dataset: Contains {len(df)} Netflix titles with features like genres, descriptions, cast, and directors.")
print("2. Genre Trends: International Movies, Dramas, and Comedies dominate the platform.")
print("3. Content Ratings: TV-MA and TV-14 are the most common ratings, indicating mature audiences.")
print("4. Recommendation System: Uses genres, descriptions, cast, and directors for accurate content-based recommendations.")
print("5. Future Work: Incorporate user viewing history or collaborative filtering for personalized recommendations.")

# --- 5. Save Outputs ---
# Write the processed frame (including the combined feature column) to CSV
df.to_csv('processed_netflix_data.csv', index=False)
print("\nProcessed dataset saved as 'processed_netflix_data.csv'.")

# Export each interactive figure to its own standalone HTML file
for figure, html_path in (
    (fig1, 'genre_distribution.html'),
    (fig2, 'rating_distribution.html'),
    (fig3, 'content_type.html'),
):
    figure.write_html(html_path)
print("Interactive visualizations saved as HTML files.")