# Exploratory Data Analysis
## Discovering the Story Behind Our Data

**Objective**: Perform initial exploration of raw data to understand its characteristics, identify quality issues, and set the foundation for our data storytelling narrative.

---

## 1. Setup and Imports

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Statistical analysis
from scipy import stats

# Custom modules
import sys
import os
sys.path.append('../src')
from utils.data_utils import load_data, get_data_quality_report
from visualization.storytelling_viz import StorytellingVisualizer

# Configuration
# Plot appearance: apply the matplotlib style sheet first, then set the
# seaborn palette (order kept deliberately; the palette is applied last).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display options: no limit on displayed columns, row display capped at 100.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("✅ Libraries imported successfully!")

## 2. Data Loading

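Load your raw dataset here. The cell below generates a synthetic demonstration dataset; to use your own data, uncomment the `load_data` call and replace the file path with your actual data file.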

In [None]:
# Load raw data
# TODO: Replace with your actual data file path
# raw_data = load_data('../data/raw/your_dataset.csv')

# Demonstration fallback: synthesise a reproducible dataset instead of
# reading a file. Everything below draws from the seeded global NumPy RNG,
# so the order of the random calls determines the generated values.
np.random.seed(42)
n_samples = 1000

demo_columns = {}
demo_columns['feature_1'] = np.random.normal(100, 15, n_samples)
demo_columns['feature_2'] = np.random.exponential(2, n_samples)
demo_columns['feature_3'] = np.random.choice(['A', 'B', 'C', 'D'], n_samples)
demo_columns['feature_4'] = np.random.uniform(0, 1, n_samples)
demo_columns['target'] = np.random.choice([0, 1], n_samples)
raw_data = pd.DataFrame(demo_columns)

# Deliberately inject data quality issues for the rest of the notebook.

# 1) Missing values: blank out feature_1 in 10% of the rows.
n_missing = int(0.1 * n_samples)
missing_indices = np.random.choice(n_samples, size=n_missing, replace=False)
raw_data.loc[missing_indices, 'feature_1'] = np.nan

# 2) Outliers: overwrite feature_2 in 5% of the rows with a single extreme
#    value (five times the column's 95th percentile).
n_outliers = int(0.05 * n_samples)
outlier_indices = np.random.choice(n_samples, size=n_outliers, replace=False)
extreme_value = raw_data['feature_2'].quantile(0.95) * 5
raw_data.loc[outlier_indices, 'feature_2'] = extreme_value

# 3) Duplicates: append a copy of the first 50 rows and renumber the index.
raw_data = pd.concat([raw_data, raw_data.head(50)], ignore_index=True)

memory_kb = raw_data.memory_usage(deep=True).sum() / 1024
print("📊 Data loaded successfully!")
print(f"Shape: {raw_data.shape}")
print(f"Memory usage: {memory_kb:.2f} KB")

## 3. Initial Data Overview

In [None]:
# First look at the frame: dimensions, column names, dtypes and leading rows
frame_shape = raw_data.shape
column_names = list(raw_data.columns)

print("=== DATASET INFO ===")
print(f"Shape: {frame_shape}")
print(f"Columns: {column_names}")

print("\n=== DATA TYPES ===")
print(raw_data.dtypes)

print("\n=== FIRST 5 ROWS ===")
display(raw_data.head(5))

In [None]:
# Descriptive statistics across all columns (include='all')
print("=== STATISTICAL SUMMARY ===")
summary_table = raw_data.describe(include='all')
display(summary_table)

## 4. Data Quality Assessment

This is where we start building our story: identifying the problems in the raw data.

In [None]:
# Build the data quality report once, then print its headline figures
quality_report = get_data_quality_report(raw_data)

print("=== DATA QUALITY REPORT ===")
print(f"📏 Shape: {quality_report['shape']}")
print(f"🔄 Duplicates: {quality_report['duplicates']}")
print(f"💾 Memory Usage: {quality_report['memory_usage'] / 1024:.2f} KB")

# List only the columns that actually have missing entries
print("\n❌ Missing Values:")
for column_name, n_missing in quality_report['missing_values'].items():
    if not n_missing > 0:
        continue
    pct_missing = quality_report['missing_percentage'][column_name]
    print(f"  {column_name}: {n_missing} ({pct_missing:.1f}%)")

# Number of columns the report assigned to each type group
numeric_total = len(quality_report['numeric_columns'])
categorical_total = len(quality_report['categorical_columns'])
datetime_total = len(quality_report['datetime_columns'])

print("\n📊 Column Types:")
print(f"  Numeric: {numeric_total}")
print(f"  Categorical: {categorical_total}")
print(f"  Datetime: {datetime_total}")

## 5. Visual Data Quality Assessment

In [None]:
# Create visualizations to highlight data quality issues.
# Layout (2x2): missing-value map, histogram of the first numeric column,
# bar chart of the first categorical column, numeric correlation heatmap.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Raw Data Quality Issues - The Story Begins', fontsize=16, fontweight='bold')

# 1. Missing values heatmap (one cell per row/column; True = missing)
sns.heatmap(raw_data.isnull(), cbar=True, ax=axes[0, 0], cmap='Reds')
axes[0, 0].set_title('Missing Values Pattern')
axes[0, 0].set_xlabel('Columns')
axes[0, 0].set_ylabel('Rows (sample)')

# 2. Distribution of numeric features (only the first numeric column is drawn)
numeric_cols = raw_data.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
    raw_data[numeric_cols[0]].hist(bins=30, ax=axes[0, 1], alpha=0.7, color='red', edgecolor='black')
    axes[0, 1].set_title(f'Distribution of {numeric_cols[0]} (with outliers)')
    axes[0, 1].set_xlabel('Value')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].grid(True, alpha=0.3)

# 3. Categorical distribution (only the first object-dtype column is drawn)
cat_cols = raw_data.select_dtypes(include=['object']).columns
if len(cat_cols) > 0:
    raw_data[cat_cols[0]].value_counts().plot(kind='bar', ax=axes[1, 0], color='orange', alpha=0.7)
    axes[1, 0].set_title(f'Distribution of {cat_cols[0]}')
    axes[1, 0].set_xlabel('Categories')
    axes[1, 0].set_ylabel('Count')
    axes[1, 0].tick_params(axis='x', rotation=45)

# 4. Correlation heatmap (numeric features only; needs at least two columns)
numeric_data = raw_data.select_dtypes(include=[np.number])
if numeric_data.shape[1] > 1:
    correlation_matrix = numeric_data.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[1, 1])
    axes[1, 1].set_title('Feature Correlations')

plt.tight_layout()

# Save the figure BEFORE showing it. After plt.show() the figure is typically
# closed, so a later plt.savefig() would write a fresh, empty canvas. Saving
# through `fig` also pins the output to this exact figure.
figure_path = '../outputs/figures/raw_data_quality_assessment.png'
# Make sure the target folder exists so the save cannot fail on a clean checkout
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
fig.savefig(figure_path, dpi=300, bbox_inches='tight')
print("📈 Data quality visualization saved!")

plt.show()

## 6. Outlier Detection and Analysis

In [None]:
# Detect outliers in numeric columns
def detect_outliers_iqr(series, factor=1.5):
    """Flag the values of ``series`` that fall outside its IQR fences.

    The lower fence is the 25th percentile minus ``factor`` interquartile
    ranges; the upper fence is the 75th percentile plus the same margin.

    Args:
        series: Data to screen; must provide ``quantile`` and support
            element-wise ``<`` / ``>`` comparison (e.g. a pandas Series).
        factor: Fence distance as a multiple of the IQR. Defaults to 1.5.

    Returns:
        Boolean mask that is True where a value lies strictly below the
        lower fence or strictly above the upper fence.
    """
    q25, q75 = (series.quantile(q) for q in (0.25, 0.75))
    fence_margin = factor * (q75 - q25)
    below_fence = series < q25 - fence_margin
    above_fence = series > q75 + fence_margin
    return below_fence | above_fence

print("=== OUTLIER ANALYSIS ===")

# Row positions for the scatter x-axis (independent of the index labels)
row_positions = np.arange(len(raw_data))

for col in numeric_cols:
    values = raw_data[col]

    # A single mask, computed once on the full column, drives both the
    # reported count and the highlighted points. pandas ignores NaN when
    # computing quantiles and NaN never compares as below/above a fence, so
    # the count matches what a dropna()-based mask would give. Imputing
    # missing values before detection is avoided on purpose: it would shift
    # the quartiles and could highlight a different set of points than the
    # count printed below.
    outlier_mask = detect_outliers_iqr(values).to_numpy()
    outlier_count = outlier_mask.sum()
    # Percentage is relative to all rows, including rows with missing values
    outlier_percentage = (outlier_count / len(raw_data)) * 100
    print(f"{col}: {outlier_count} outliers ({outlier_percentage:.1f}%)")

    if outlier_count > 0:
        # Visualize outliers
        plt.figure(figsize=(12, 4))

        # Left panel: box plot of the non-missing values
        plt.subplot(1, 2, 1)
        plt.boxplot(values.dropna())
        plt.title(f'Box Plot: {col}')
        plt.ylabel('Value')

        # Right panel: value by row position, split so the 'Normal' series
        # really contains only the non-outlier points
        plt.subplot(1, 2, 2)
        plt.scatter(row_positions[~outlier_mask], values[~outlier_mask],
                    alpha=0.6, c='blue', label='Normal')
        plt.scatter(row_positions[outlier_mask], values[outlier_mask],
                    c='red', alpha=0.8, label='Outliers')
        plt.title(f'Outlier Detection: {col}')
        plt.xlabel('Index')
        plt.ylabel('Value')
        plt.legend()

        plt.tight_layout()
        plt.show()

        break  # Show only first column with outliers for brevity

## 7. Data Story Setup - Key Insights

Based on our exploration, let's document the key insights that will drive our data story.

In [None]:
# Summarize key findings for our data story
import json

# Numeric columns in which the IQR rule flags at least one non-missing value
columns_with_outliers = [
    col for col in numeric_cols
    if detect_outliers_iqr(raw_data[col].dropna()).sum() > 0
]

# Columns whose missing-value count in the quality report is above zero
columns_with_missing = [
    count for count in quality_report['missing_values'].values() if count > 0
]

story_insights = {
    'dataset_shape': raw_data.shape,
    'missing_data_issues': len(columns_with_missing),
    'duplicate_rows': quality_report['duplicates'],
    'outlier_columns': columns_with_outliers,
    'data_types_mixed': len(quality_report['numeric_columns']) + len(quality_report['categorical_columns'])
}

n_rows, n_cols = story_insights['dataset_shape']

print("=== DATA STORY INSIGHTS ===")
print(f"📊 Dataset contains {n_rows:,} rows and {n_cols} columns")
print(f"❌ {story_insights['missing_data_issues']} columns have missing values")
print(f"🔄 {story_insights['duplicate_rows']} duplicate rows found")
print(f"⚠️  {len(story_insights['outlier_columns'])} columns contain outliers")
print(f"🔢 Mixed data types: {story_insights['data_types_mixed']} total columns")

print("\n=== STORY NARRATIVE POINTS ===")
print("1. Raw data contains significant quality issues")
print("2. Missing values may bias analysis results")
print("3. Outliers could skew model performance")
print("4. Duplicates inflate dataset size artificially")
print("5. Data preparation is essential for reliable insights")

# Save insights for later use; default=str stringifies any value that the
# json module cannot serialise natively
with open('../outputs/reports/raw_data_insights.json', 'w') as f:
    json.dump(story_insights, f, indent=2, default=str)

print("\n💾 Insights saved for storytelling!")

## 8. Next Steps

Based on our exploration, the next notebook should focus on:

1. **Data Cleaning**: Handle missing values, remove duplicates
2. **Outlier Treatment**: Apply appropriate outlier handling strategies
3. **Feature Engineering**: Create new features, encode categorical variables
4. **Data Validation**: Ensure data quality improvements

This will set up the perfect comparison for our data storytelling narrative!

In [None]:
# Write the raw frame to CSV (without the index column) for the next notebook
raw_output_path = '../data/raw/sample_raw_data.csv'
raw_data.to_csv(raw_output_path, index=False)

print("✅ Raw data saved for processing pipeline!")
print("\n🚀 Ready to move to notebook 02_data_preparation.ipynb")