In [None]:
"""
Author: Ahmed Alghaith
Date: August 2025
"""

# Exploratory Data Analysis for Music Streaming Churn

This notebook provides comprehensive exploratory data analysis for the music streaming churn prediction dataset.

**Author:** Ahmed Alghaith  
**Date:** August 2025

## Setup and Data Loading

In [None]:
# Import required modules
# The conventional aliases used by the rest of the notebook (pd, np, plt, sns) are
# imported explicitly so a reader can see where they come from. The wildcard import
# is kept, and placed afterwards, so anything `utils` exports under the same names
# still takes precedence exactly as before.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from utils import *  # noqa: F401,F403 - supplies setup_plotting_style()
from MusicStreamingEventProcessor import MusicStreamingEventProcessor

# Setup plotting style
setup_plotting_style()

# Location of the event log, read below as line-delimited JSON (lines=True).
# Replace with your actual file path.
DATA_PATH = 'customer_churn.json'

print("📥 Loading customer churn event data...")
try:
    events_df = pd.read_json(DATA_PATH, lines=True)
except FileNotFoundError:
    print("❌ Data file not found. Please update DATA_PATH in this cell.")
    print(f"💡 Expected file: '{DATA_PATH}'")
    # Re-raise after the hint: the following cells read `events_df`, so carrying on
    # without it would only surface later as an unrelated NameError.
    raise
else:
    print("✅ Data loaded successfully!")
    print(f"📊 Loaded {len(events_df):,} events")

## Basic Data Exploration

In [None]:
# Headline facts about the raw event log: shape, distinct users, available columns
print(f"📊 Dataset shape: {events_df.shape}")
print(f"👥 Unique users: {events_df['userId'].nunique()}")
print(f"📋 Columns: {events_df.columns.tolist()}")

# Preview of the first rows (rich display rather than plain text)
print("\n🔍 Sample Data:")
display(events_df.head())

# Summary statistics as produced by DataFrame.describe()
print("\n📈 Basic Statistics:")
display(events_df.describe())

# Null count per column; only columns with at least one null are reported
print("\n❓ Missing Values:")
missing_data = events_df.isna().sum()
missing_summary = missing_data.loc[missing_data.gt(0)]
if missing_summary.empty:
    print("✅ No missing values found")
else:
    print(missing_summary)

## User Behavior Analysis

In [None]:
# How often is each page visited?
print("🎵 User Activity Analysis")

if 'page' not in events_df.columns:
    print("⚠️ 'page' column not found in the data")
else:
    # Number of events recorded for each distinct page value
    page_counts = events_df['page'].value_counts()
    print(f"\n📊 Total unique pages: {len(page_counts)}")
    print("\nTop 10 most visited pages:")
    print(page_counts.head(10))

    # Bar chart of the first 15 entries, drawn on an explicitly created axes
    fig, ax = plt.subplots(figsize=(12, 6))
    page_counts.head(15).plot(kind='bar', ax=ax)
    ax.set_title('Top 15 Most Visited Pages')
    ax.set_xlabel('Page')
    ax.set_ylabel('Number of Visits')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Subscription Level Analysis

In [None]:
# Subscription level breakdown, followed by a per-user look at level changes
if 'level' not in events_df.columns:
    print("⚠️ 'level' column not found in the data")
else:
    # Counted over rows of events_df, i.e. per event rather than per user
    level_distribution = events_df['level'].value_counts()
    print("📊 Subscription Level Distribution:")
    print(level_distribution)

    # Pie chart of the same counts; the default y-label is blanked out
    fig, ax = plt.subplots(figsize=(8, 6))
    level_distribution.plot(kind='pie', autopct='%1.1f%%', ax=ax)
    ax.set_title('Distribution of Subscription Levels')
    ax.set_ylabel('')
    plt.show()

    # One row per user: first level seen, last level seen, number of distinct levels
    user_levels = events_df.groupby('userId')['level'].agg(['first', 'last', 'nunique'])
    # A user with more than one distinct level has changed subscription at least once
    level_changers = user_levels['nunique'].gt(1).sum()
    changer_pct = level_changers / len(user_levels) * 100
    print(f"\n👥 Users who changed subscription levels: {level_changers}")
    print(f"📈 Percentage of level changers: {changer_pct:.2f}%")

## Feature Engineering and User Aggregation

In [None]:
# Initialize and run the processor
print("🏭 Processing events to create user features...")
processor = MusicStreamingEventProcessor()

# Drop any result left over from an earlier execution of this cell. Later cells
# decide whether features exist via `'user_features_df' in locals()`; without this,
# a failed re-run would leave stale features in the namespace and those guards
# would silently analyse outdated data.
globals().pop('user_features_df', None)

try:
    user_features_df = processor.process_events_to_features(events_df)

    print(f"\n👥 User features shape: {user_features_df.shape}")
    print(f"📊 Features created: {len(user_features_df.columns)}")

    # Display column names as a numbered list
    print("\n🔢 Feature columns:")
    for i, col in enumerate(user_features_df.columns, 1):
        print(f"  {i:2d}. {col}")

    # Display sample of user features
    print("\n📋 Sample User Features:")
    display(user_features_df.head())

except Exception as e:
    # Deliberately best-effort: report the failure and let the notebook continue.
    # The exception type is included because the message alone is often ambiguous
    # (e.g. a KeyError's text is just the missing column name).
    print(f"❌ Error processing events: {type(e).__name__}: {e}")
    print("💡 Please check your data format and try again")

## Churn Analysis

In [None]:
# Churn label distribution across users
if 'user_features_df' not in locals() or 'churn' not in user_features_df.columns:
    print("⚠️ Churn column not found. Run feature engineering first.")
else:
    churn_distribution = user_features_df['churn'].value_counts()
    # Count for label 1; falls back to 0 when that label never occurs
    churned_count = churn_distribution.get(1, 0)

    print("📊 Churn Distribution:")
    print(churn_distribution)
    print(f"📈 Churn Rate: {churned_count / len(user_features_df) * 100:.2f}%")

    # Bar chart of the label counts
    fig, ax = plt.subplots(figsize=(8, 6))
    churn_distribution.plot(kind='bar', ax=ax)
    ax.set_title('Churn Distribution')
    ax.set_xlabel('Churn (0=No, 1=Yes)')
    ax.set_ylabel('Number of Users')
    plt.xticks(rotation=0)
    plt.show()

    # Extra detail is only printed when more than one label value is present
    if len(churn_distribution) > 1:
        active_count = churn_distribution.get(0, 0)
        print("\n💡 Churn Insights:")
        print(f"   • Churned users: {churned_count:,}")
        print(f"   • Active users: {active_count:,}")
        # The denominator is floored at 1 so the ratio never divides by zero
        print(f"   • Class balance ratio: {active_count / max(churned_count, 1):.2f}:1")

## Feature Correlations

In [None]:
# Pairwise correlations between the numeric user features
if 'user_features_df' not in locals():
    print("⚠️ User features not available. Run feature engineering first.")
else:
    numeric_features = user_features_df.select_dtypes(include=[np.number])

    if numeric_features.shape[1] <= 1:
        print("⚠️ Insufficient numeric features for correlation analysis")
    else:
        fig, ax = plt.subplots(figsize=(12, 10))
        correlation_matrix = numeric_features.corr()

        # Heatmap: masking the upper triangle (diagonal included) leaves only the
        # lower triangle visible, so each pair is drawn once
        mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
        sns.heatmap(
            correlation_matrix,
            mask=mask,
            annot=True,
            fmt='.2f',
            cmap='coolwarm',
            center=0,
            square=True,
            linewidths=0.5,
            cbar_kws={'shrink': 0.8},
            ax=ax,
        )
        ax.set_title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.show()

        # Features ranked by absolute correlation with churn (the sign is discarded)
        if 'churn' in correlation_matrix.columns:
            churn_correlations = correlation_matrix['churn'].abs().sort_values(ascending=False)
            print("\n🎯 Top features correlated with churn:")
            is_other_feature = churn_correlations.index != 'churn'
            top_correlations = churn_correlations.loc[is_other_feature].head(10)
            for feature, corr in top_correlations.items():
                print(f"  {feature}: {corr:.4f}")

        # Feature pairs whose |r| exceeds 0.8 (potential multicollinearity).
        # Each unordered pair is visited once, row by row, via the indices of the
        # strict upper triangle.
        print("\n🔗 Highly correlated feature pairs (|r| > 0.8):")
        feature_names = correlation_matrix.columns
        corr_values = correlation_matrix.to_numpy()
        high_corr_pairs = [
            (feature_names[row], feature_names[col], corr_values[row, col])
            for row, col in zip(*np.triu_indices(len(feature_names), k=1))
            if abs(corr_values[row, col]) > 0.8
        ]

        if not high_corr_pairs:
            print("  ✅ No highly correlated pairs found")
        else:
            for feat1, feat2, corr_val in high_corr_pairs:
                print(f"  {feat1} ↔ {feat2}: {corr_val:.4f}")

## Summary and Next Steps

In [None]:
# Wrap-up: report what was produced and list the suggested follow-up actions
print("📋 EXPLORATORY DATA ANALYSIS SUMMARY")
print("=" * 50)

if 'user_features_df' not in locals():
    # Feature engineering never produced a result: print a troubleshooting checklist
    print(
        "❌ Data processing incomplete",
        "📝 Please check:",
        "  1. Data file path and format",
        "  2. Required columns are present",
        "  3. No critical errors in processing",
        sep="\n",
    )
else:
    print(f"✅ Successfully processed {len(events_df):,} events")
    print(f"✅ Created features for {len(user_features_df):,} users")
    print(f"✅ Generated {len(user_features_df.columns)} features")

    if 'churn' in user_features_df.columns:
        # Mean of the churn column, reported as the churn rate
        churn_rate = user_features_df['churn'].mean()
        print(f"✅ Detected churn rate: {churn_rate:.2%}")

        # Rates between 5% and 50% (inclusive) are reported as suitable
        rate_in_range = 0.05 <= churn_rate <= 0.50
        print(
            "✅ Churn rate is suitable for machine learning"
            if rate_in_range
            else "⚠️ Churn rate may need adjustment for optimal model performance"
        )

    print(
        "\n🎯 Ready for model training!",
        "📝 Next steps:",
        "  1. Run Training.ipynb for model development",
        "  2. Consider feature selection based on correlations",
        "  3. Handle class imbalance if needed",
        "  4. Evaluate multiple algorithms",
        sep="\n",
    )