In [None]:
# %% [markdown]
"""
# Netflix Content Analysis (2008-2021)

This notebook analyzes Netflix's catalog of TV shows and movies collected from Flixable, covering:
- Content types and categories
- Release trends over time
- Geographic distribution
- Ratings and durations
"""

# %%
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Configure display settings
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)
# Truncate the text shown in any single cell to 50 characters.
pd.set_option('display.max_colwidth', 50)
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# %% [markdown]
"""
## Data Loading and Initial Inspection
"""
# %%
def load_netflix_data(filepath):
    """Read the Netflix catalog CSV and return it as a DataFrame.

    The 'Release_Date' column is handed to pandas for date parsing, and the
    frame is checked for the columns the rest of the notebook relies on.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when the file is missing, a required
        column is absent, or any other error occurs while reading. In the
        failure cases a message is printed instead of raising.
    """
    expected = ('Title', 'Category', 'Type', 'Release_Date', 'Rating')

    try:
        catalog = pd.read_csv(filepath, parse_dates=['Release_Date'])
        print(f"Dataset loaded successfully with {len(catalog.index)} titles")

        # Basic data validation: collect every expected column that is absent.
        absent = [name for name in expected if name not in catalog.columns]
        if absent:
            # Raised here on purpose; the generic handler below reports it.
            raise ValueError(f"Missing required columns: {absent}")
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
        return None
    except Exception as exc:
        print(f"An error occurred while loading the data: {exc}")
        return None
    else:
        return catalog

# Load the catalog. load_netflix_data prints a message and returns None when
# loading fails, so stop here with a clear error instead of letting later
# cells fail with an opaque AttributeError on None.
netflix_df = load_netflix_data("Netflix_Dataset.csv")
if netflix_df is None:
    raise RuntimeError(
        "Netflix_Dataset.csv could not be loaded; see the message printed above."
    )

# %% [markdown]
"""
## Data Quality Assessment
"""
# %%
def assess_data_quality(df):
    """Report data-quality problems and return a cleaned copy of the catalog.

    Steps performed, in order:
      1. Count fully duplicated rows and drop them (a message is printed
         when any are found).
      2. Print the per-column missing-value counts (only columns that have
         missing values) and draw a heatmap of missing cells.
      3. If a 'Duration' column exists, split it on spaces into
         'Duration_Minutes' (the first token converted to a number, NaN when
         it is not numeric) and 'Duration_Unit' (the second token).
         NOTE(review): the first token is taken whatever the unit is, so
         'Duration_Minutes' is only a minute count for rows whose
         'Duration_Unit' is a minute unit - confirm against the data.
      4. Derive 'Release_Year' from 'Release_Date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Catalog frame; must contain a 'Release_Date' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with duplicates removed and the derived columns added.
        The frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is left untouched and re-running
    # the cell always starts from the same input.
    cleaned = df.copy()

    # Check for duplicates
    duplicate_count = cleaned.duplicated().sum()
    if duplicate_count > 0:
        print(f"Found {duplicate_count} duplicate records - removing them")
        cleaned = cleaned.drop_duplicates()

    # Check for missing values
    print("\n=== Missing Values Summary ===")
    missing_values = cleaned.isnull().sum()
    print(missing_values[missing_values > 0])

    # Visualize missing data
    plt.figure(figsize=(12, 6))
    sns.heatmap(cleaned.isnull(), cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap', fontsize=14)
    plt.show()

    # Prepare duration analysis
    if 'Duration' in cleaned.columns:
        # reindex guarantees that columns 0 and 1 both exist, even when no
        # value contains a space (the split would then yield one column only).
        duration_split = (
            cleaned['Duration'].str.split(' ', expand=True).reindex(columns=[0, 1])
        )
        cleaned['Duration_Minutes'] = pd.to_numeric(duration_split[0], errors='coerce')
        cleaned['Duration_Unit'] = duration_split[1]

    # Extract release year
    release_dates = cleaned['Release_Date']
    if not pd.api.types.is_datetime64_any_dtype(release_dates):
        # read_csv leaves the column as plain objects when it cannot parse
        # every value; convert explicitly so the .dt accessor below works.
        had_value = release_dates.notna()
        release_dates = pd.to_datetime(
            release_dates.astype(str).str.strip(), errors='coerce'
        )
        unparsed = int((had_value & release_dates.isna()).sum())
        if unparsed:
            print(f"Warning: {unparsed} Release_Date values could not be parsed")
    cleaned['Release_Year'] = release_dates.dt.year

    return cleaned

# Run the quality checks / cleaning step on the loaded frame, rebind the
# notebook-level name to the result, and preview its first rows.
netflix_df = assess_data_quality(netflix_df)
display(netflix_df.head())

# %% [markdown]
"""
## Content Analysis
"""
# %%
def analyze_netflix_content(df):
    """Print and display a numbered series of analyses of the catalog.

    The function only reads from ``df``; each section prints a heading and
    then prints, displays, or plots its result:

      1. Show_Id / Director of titles containing 'House of Cards'
      2. Bar chart of title counts per 'Release_Year'
      3. Share of movies vs TV shows, plus a count plot
      4. Movies with Release_Year 2000
      5. TV shows whose Country is exactly 'India'
      6. Ten most frequent directors
      7. Comedy movies OR titles whose Country is exactly 'United Kingdom'
      8. Titles whose Cast contains 'Tom Cruise'
      9. Rating counts, TV-14 movies from Canada, R-rated TV shows after 2018
     10. Longest and average duration, computed over movies only
     11. Country with the most TV shows
     12. Five titles with the smallest Release_Year
     13. Drama movies OR Kids' TV shows

    Parameters
    ----------
    df : pandas.DataFrame
        Catalog frame with the columns 'Title', 'Show_Id', 'Director',
        'Category', 'Type', 'Country', 'Cast', 'Rating', 'Duration' and
        'Release_Year'. 'Duration_Minutes' is used when present.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Category masks reused by several sections below.
    is_movie = df['Category'] == 'Movie'
    is_tv_show = df['Category'] == 'TV Show'

    # 1. House of Cards details
    print("\n=== 'House of Cards' Details ===")
    house_of_cards = df[df['Title'].str.contains('House of Cards', case=False, na=False)]
    display(house_of_cards[['Show_Id', 'Director']])

    # 2. Release trends by year
    print("\n=== Content Release Trends ===")
    yearly_releases = df['Release_Year'].value_counts().sort_index()
    plt.figure(figsize=(14, 6))
    yearly_releases.plot(kind='bar', color='#E50914')
    plt.title('Netflix Content Releases by Year', fontsize=14)
    plt.xlabel('Year')
    plt.ylabel('Number of Titles Released')
    plt.show()

    # 3. Movies vs TV Shows
    print("\n=== Movies vs TV Shows ===")
    category_dist = df['Category'].value_counts(normalize=True)
    # .get avoids a KeyError when one of the categories is absent from the data.
    print(f"Movies: {category_dist.get('Movie', 0.0):.1%}")
    print(f"TV Shows: {category_dist.get('TV Show', 0.0):.1%}")
    sns.countplot(data=df, x='Category', palette=['#E50914', '#221F1F'])
    plt.title('Content by Category')
    plt.show()

    # 4. Movies released in 2000
    print("\n=== Movies Released in 2000 ===")
    movies_2000 = df[is_movie & (df['Release_Year'] == 2000)]
    print(f"Found {len(movies_2000)} movies from 2000")
    display(movies_2000[['Title', 'Director', 'Rating']])

    # 5. Indian TV Shows (exact match on Country)
    print("\n=== TV Shows from India ===")
    india_tv = df[is_tv_show & (df['Country'] == 'India')]
    print(f"Found {len(india_tv)} Indian TV shows")
    display(india_tv[['Title', 'Release_Year', 'Duration']])

    # 6. Top Directors
    print("\n=== Top 10 Directors ===")
    top_directors = df['Director'].value_counts().head(10)
    display(top_directors.to_frame('Number of Titles'))

    # 7. Combined filters (Comedies or UK)
    print("\n=== Comedies Movies OR UK Content ===")
    combined_filter = df[
        (is_movie & (df['Type'] == 'Comedies')) |
        (df['Country'] == 'United Kingdom')
    ]
    display(combined_filter[['Title', 'Category', 'Type', 'Country']].head(10))

    # 8. Tom Cruise appearances
    print("\n=== Titles Featuring Tom Cruise ===")
    tom_cruise = df[df['Cast'].str.contains('Tom Cruise', na=False)]
    print(f"Tom Cruise appears in {len(tom_cruise)} titles")
    display(tom_cruise[['Title', 'Category', 'Release_Year']])

    # 9. Rating analysis
    print("\n=== Rating Distribution ===")
    rating_counts = df['Rating'].value_counts()
    display(rating_counts)

    # 9.1 TV-14 Movies in Canada
    print("\n=== TV-14 Rated Movies in Canada ===")
    tv14_ca = df[is_movie & (df['Rating'] == 'TV-14') & (df['Country'] == 'Canada')]
    print(f"Found {len(tv14_ca)} TV-14 movies from Canada")

    # 9.2 R-Rated TV Shows after 2018
    print("\n=== R-Rated TV Shows After 2018 ===")
    r_tv = df[is_tv_show & (df['Rating'] == 'R') & (df['Release_Year'] > 2018)]
    print(f"Found {len(r_tv)} R-rated TV shows since 2019")

    # 10. Duration analysis
    print("\n=== Duration Analysis ===")
    # The figures are labelled as movie statistics, so restrict them to movie
    # rows; computing them over every title mixes in other categories, whose
    # Duration_Minutes value is presumably not a minute count (e.g. seasons).
    if 'Duration_Minutes' in df.columns:
        movie_minutes = df.loc[is_movie, 'Duration_Minutes'].dropna()
    else:
        movie_minutes = pd.Series(dtype='float64')
    if movie_minutes.empty:
        print("No movie durations available")
    else:
        print(f"Longest Movie: {movie_minutes.max()} minutes")
        print(f"Average Movie Duration: {movie_minutes.mean():.1f} minutes")

    # 11. Country with most TV Shows
    print("\n=== Country with Most TV Shows ===")
    tv_by_country = df.loc[is_tv_show, 'Country'].value_counts()
    if tv_by_country.empty:
        print("No TV shows with a known country")
    else:
        # .iloc[0] is explicit positional access; tv_by_country[0] on a
        # label-indexed Series relies on a deprecated positional fallback.
        print(f"Top country: {tv_by_country.index[0]} with {tv_by_country.iloc[0]} shows")

    # 12. Sorting by year
    print("\n=== Earliest Released Titles ===")
    display(df.sort_values('Release_Year').head(5)[['Title', 'Category', 'Release_Year']])

    # 13. Complex filtering
    print("\n=== Dramas Movies OR Kids' TV Shows ===")
    complex_filter = df[
        (is_movie & (df['Type'] == 'Dramas')) |
        (is_tv_show & (df['Type'] == "Kids' TV"))
    ]
    display(complex_filter[['Title', 'Category', 'Type']].head(10))

    return df

# Run the content analyses; the function returns the frame it was given, so
# this rebinding leaves netflix_df pointing at the same data.
netflix_df = analyze_netflix_content(netflix_df)

# %% [markdown]
"""
## Advanced Visualizations
"""
# %%
def create_advanced_visualizations(df):
    """Draw three summary charts of the catalog, one figure each.

    1. Horizontal count plot of 'Rating', most frequent rating first.
    2. Histogram (30 bins) of 'Duration_Minutes' for rows whose 'Category'
       is 'Movie'.
    3. Horizontal bar chart of the ten most frequent 'Country' values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing the 'Rating', 'Category', 'Duration_Minutes' and
        'Country' columns.

    Returns
    -------
    None
        The figures are shown as a side effect.
    """

    def _new_figure(width):
        # Every chart in this function shares the same height.
        plt.figure(figsize=(width, 6))

    def _finish(title, x_label):
        # Title, x-axis label, then render the current figure.
        plt.title(title, fontsize=14)
        plt.xlabel(x_label)
        plt.show()

    # Content by rating
    _new_figure(14)
    ratings_by_frequency = df['Rating'].value_counts().index
    sns.countplot(data=df, y='Rating', order=ratings_by_frequency, palette='Reds_r')
    _finish('Content Distribution by Rating', 'Number of Titles')

    # Duration distribution
    _new_figure(12)
    movies_only = df[df['Category'] == 'Movie']
    sns.histplot(data=movies_only, x='Duration_Minutes', bins=30, color='#E50914')
    _finish('Movie Duration Distribution (Minutes)', 'Duration (Minutes)')

    # Top production countries
    country_counts = df['Country'].value_counts()
    leading_countries = country_counts.head(10)
    _new_figure(12)
    leading_countries.plot.barh(color='#221F1F')
    _finish('Top 10 Production Countries', 'Number of Titles')

# Render the rating, movie-duration and top-country charts for the catalog.
create_advanced_visualizations(netflix_df)

# %% [markdown]
"""
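## Key Findings Summary

*Template: the bracketed values below are placeholders. Replace them with the figures produced by the cells above after running the notebook.*
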
1. **Content Growth**:
   - Peak release year: [YEAR] with [X] titles
   - [Y]% growth since [EARLIEST_YEAR]

2. **Catalog Composition**:
   - Movies: [A]%
   - TV Shows: [B]%
   - Most common rating: [RATING1]

3. **Geographic Insights**:
   - Top production country: [COUNTRY1]
   - [COUNTRY2] specializes in [CONTENT_TYPE]

4. **Creative Talent**:
   - Top director: [DIRECTOR] with [N] titles
   - Tom Cruise appears in [M] titles

5. **Duration Patterns**:
   - Longest movie: [MAX] minutes
   - Average movie length: [AVG] minutes
"""