In [None]:
# %% [markdown]
"""
# Udemy Courses Dataset Analysis

This notebook analyzes Udemy's course catalog data including:
- Course subjects and titles
- Pricing information
- Subscription numbers
- Publication dates
- Difficulty levels
"""

# %%
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Configure pandas display settings for the whole notebook
pd.set_option('display.max_columns', None)  # no limit on the number of columns shown
pd.set_option('display.max_colwidth', 100)  # cap displayed cell text at 100 characters
# Render floats with thousands separators and zero decimals. This applies to
# every float pandas displays, so fractional values (e.g. averages) show rounded.
pd.set_option('display.float_format', '{:,.0f}'.format)

# %% [markdown]
"""
## Data Loading and Initial Inspection
"""
# %%
def load_udemy_data(filepath):
    """Load the Udemy course CSV and check it has the columns the analysis uses.

    Args:
        filepath: Path to the CSV file. The ``published_timestamp`` column is
            parsed as a datetime while reading.

    Returns:
        The loaded DataFrame, or ``None`` when the file is missing, cannot be
        read, or lacks a required column. In the failure cases a message
        describing the problem is printed instead of raising.
    """
    required_columns = ['course_title', 'subject', 'is_paid', 'price',
                        'num_subscribers', 'level', 'published_timestamp']
    try:
        df = pd.read_csv(filepath, parse_dates=['published_timestamp'])

        # Validate before announcing success, so a file that fails the check
        # never prints a misleading "loaded successfully" line.
        missing_cols = [col for col in required_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required columns: {missing_cols}")

        print(f"Dataset loaded successfully with {df.shape[0]} courses and {df.shape[1]} features")
        return df
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
    except Exception as e:
        # Best-effort loader: report the problem and fall through to None.
        print(f"An error occurred while loading the data: {e}")
    return None

# Load the dataset. load_udemy_data prints a message and returns None on failure,
# so udemy_df is None here if the CSV could not be read or validated.
udemy_df = load_udemy_data("Udemy_Dataset.csv")

# %% [markdown]
"""
## Data Preparation
"""
# %%
def prepare_udemy_data(df):
    """Return a prepared copy of the Udemy data; the input frame is left untouched.

    Steps applied to the copy:
      * ``published_timestamp`` is converted to datetime if it is not already.
      * ``publication_year`` is added, taken from ``published_timestamp``.
      * ``price`` is converted to float when it is a string column, after
        stripping ``$`` and ``,`` characters.

    Args:
        df: DataFrame with at least ``published_timestamp`` and ``price`` columns.

    Returns:
        A new DataFrame with the transformations above applied.

    Raises:
        TypeError: If ``df`` is None (e.g. when loading the data failed).
    """
    if df is None:
        # Same exception type as subscripting None would raise, but with a
        # message that points at the actual cause.
        raise TypeError(
            "prepare_udemy_data expected a DataFrame but got None; "
            "check that the dataset was loaded successfully."
        )

    # Work on a copy so re-running the cell (or reusing the raw frame) is safe.
    df = df.copy()

    # Ensure proper data types
    if not pd.api.types.is_datetime64_any_dtype(df['published_timestamp']):
        df['published_timestamp'] = pd.to_datetime(df['published_timestamp'])

    # Extract year from timestamp
    df['publication_year'] = df['published_timestamp'].dt.year

    # Clean price column if it arrived as text (e.g. "$1,200").
    # Raw string: "\$" is not a valid escape in a normal string literal.
    if pd.api.types.is_string_dtype(df['price']):
        df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    return df

# Run the preparation step and preview the first rows of the result.
udemy_df = prepare_udemy_data(udemy_df)
display(udemy_df.head())

# %% [markdown]
"""
## Course Catalog Analysis
"""
# %%
def analyze_udemy_catalog(df):
    """Print and display a ten-part overview of the Udemy course catalog.

    Sections, in order: subjects offered, subject with the most courses,
    free vs. paid share, top 5 and bottom 5 courses by subscribers,
    Graphic Design courses priced under $100, courses with "Python" in the
    title, courses published in 2015, and subscriber statistics per level.

    Args:
        df: DataFrame with the columns ``subject``, ``is_paid``,
            ``course_title``, ``price``, ``num_subscribers``, ``level`` and
            ``publication_year``.

    Returns:
        The same DataFrame that was passed in, unmodified.

    Note:
        Tables are rendered with ``display``, which is expected to be
        available in the namespace (as it is in a Jupyter/IPython session).
    """

    # 1. Different subjects offered
    print("\n=== Subjects Offered ===")
    subjects = df['subject'].unique()
    print(f"Udemy offers courses in {len(subjects)} subjects:")
    print(subjects)

    # 2. Subject with most courses
    print("\n=== Subject Popularity ===")
    subject_counts = df['subject'].value_counts()
    max_subject = subject_counts.idxmax()
    print(f"'{max_subject}' has the most courses: {subject_counts.max()}")
    display(subject_counts.to_frame('Number of Courses'))

    # 3 & 4. Free vs Paid courses
    print("\n=== Pricing Analysis ===")
    pricing_dist = df['is_paid'].value_counts(normalize=True)
    # .get with a default: a catalog that is entirely paid (or entirely free)
    # has only one key in the distribution, and direct indexing would raise.
    free_share = pricing_dist.get(False, 0.0)
    paid_share = pricing_dist.get(True, 0.0)
    print(f"Free courses: {free_share:.1%}")
    print(f"Paid courses: {paid_share:.1%}")

    # 5 & 6. Top and least selling courses
    print("\n=== Top 5 Selling Courses ===")
    display(df.nlargest(5, 'num_subscribers')[['course_title', 'subject', 'num_subscribers']])

    print("\n=== 5 Least Selling Courses ===")
    display(df.nsmallest(5, 'num_subscribers')[['course_title', 'subject', 'num_subscribers']])

    # 7. Graphic Design courses under $100
    print("\n=== Affordable Graphic Design Courses (<$100) ===")
    graphic_design = df[(df['subject'] == 'Graphic Design') & (df['price'] < 100)]
    print(f"Found {len(graphic_design)} courses priced under $100")
    display(graphic_design[['course_title', 'price', 'num_subscribers']].sort_values('price'))

    # 8. Python-related courses
    print("\n=== Python-Related Courses ===")
    # na=False: a missing title counts as "no match" instead of producing a
    # NaN in the mask, which boolean indexing rejects.
    is_python = df['course_title'].str.contains('Python', case=False, na=False)
    python_courses = df[is_python]
    print(f"Found {len(python_courses)} Python-related courses")
    display(python_courses[['course_title', 'subject', 'num_subscribers']].sort_values('num_subscribers', ascending=False))

    # 9. Courses published in 2015
    print("\n=== Courses Published in 2015 ===")
    courses_2015 = df[df['publication_year'] == 2015]
    print(f"Found {len(courses_2015)} courses published in 2015")
    display(courses_2015[['course_title', 'subject', 'num_subscribers']].sort_values('num_subscribers', ascending=False))

    # 10. Max subscribers by course level
    print("\n=== Maximum Subscribers by Course Level ===")
    level_stats = df.groupby('level')['num_subscribers'].agg(['max', 'mean', 'count'])
    level_stats.columns = ['Max Subscribers', 'Average Subscribers', 'Course Count']
    display(level_stats.sort_values('Max Subscribers', ascending=False))

    return df

# analyze_udemy_catalog returns the frame it was given, so this rebinding
# keeps the same data; the call is made for its printed/displayed report.
udemy_df = analyze_udemy_catalog(udemy_df)

# %% [markdown]
"""
## Advanced Analyses
"""
# %%
def advanced_udemy_analyses(df):
    """Show price statistics and publication trends for the Udemy catalog.

    Produces three outputs, in order:
      1. Min / median / max price per subject, for paid courses only.
      2. A bar chart of the number of courses published per year.
      3. A stacked area chart of courses per subject per publication year.

    Args:
        df: DataFrame with the columns ``is_paid`` (boolean), ``subject``,
            ``price`` and ``publication_year``.

    Returns:
        The same DataFrame that was passed in, unmodified.

    Note:
        Uses ``plt`` (matplotlib.pyplot) and ``display`` from the notebook
        namespace.
    """

    # Price distribution analysis
    print("\n=== Price Distribution ===")
    paid_courses = df[df['is_paid']]
    price_stats = paid_courses.groupby('subject')['price'].agg(['min', 'median', 'max'])
    display(price_stats.style.format('${:.2f}'))

    # Publication trends over time
    print("\n=== Publication Trends ===")
    publication_trends = df.groupby('publication_year').size()
    ax = publication_trends.plot(kind='bar', title='Courses Published by Year')
    ax.set_xlabel('Publication Year')
    ax.set_ylabel('Number of Courses')
    plt.show()

    # Subject popularity over time
    print("\n=== Subject Popularity Over Time ===")
    # fill_value=0: a subject with no courses in a given year is an explicit
    # zero rather than a missing value in the stacked chart's input.
    subject_yearly = df.groupby(['publication_year', 'subject']).size().unstack(fill_value=0)
    ax = subject_yearly.plot(kind='area', stacked=True, figsize=(12, 6))
    ax.set_title('Course Subjects by Publication Year')
    ax.set_xlabel('Publication Year')
    ax.set_ylabel('Number of Courses')
    plt.show()

    return df

# advanced_udemy_analyses returns the frame it was given; the call is made
# for the table and charts it renders.
udemy_df = advanced_udemy_analyses(udemy_df)

# %% [markdown]
"""
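## Key Findings Summary

> **Note:** Bracketed values such as `[SUBJECT1]` and `[X]` are placeholders. Fill them in from the outputs above after running the notebook.
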
1. **Catalog Composition**:
   - Most popular subject: [SUBJECT1] with [X] courses
   - [Y]% of courses are paid, average price: $[Z]

2. **Popular Courses**:
   - Top course: [COURSE1] with [A] subscribers
   - Python-related courses: [B] total offerings

3. **Trends**:
   - Peak publication year: [YEAR] with [C] courses
   - Growing subject: [SUBJECT2] with [D]% annual increase

4. **Pricing Insights**:
   - Most expensive subject: [SUBJECT3] (max $[E])
   - Best value: [SUBJECT4] courses under $[F]
"""

# %% [markdown]
"""
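## Recommendations

> **Note:** Bracketed items are placeholders to be replaced with the specific subjects and courses identified in the analysis above.
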
1. Consider developing more courses in [high-demand subject]
2. Review pricing strategy for [specific subject] courses
3. Promote older (2015) high-quality courses that may be buried
4. Analyze why [low-performing courses] have few subscribers
"""