# 02 - Exploratory Data Analysis (EDA)

Analyze and visualize the master dataset: summary statistics, missing values, distributions, correlations, and time-series trends.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')
# IPython magic (not plain Python): selects matplotlib's "inline" mode for this notebook.
%matplotlib inline

## Load Master Dataset

In [None]:
# Read the processed master dataset from disk.
df = pd.read_csv('../data/processed/final_dataset.csv')

# One-shot overview: dimensions, column names, and per-column dtypes.
print(
    f"Dataset shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nData types:\n{df.dtypes}",
    "\nFirst few rows:",
    sep="\n",
)

# Last expression of the cell, so the preview is rendered as the cell output.
df.head()

## Basic Statistics

In [None]:
# Summary statistics of the dataset; as the cell's only expression, the result is shown as the cell output.
df.describe()

## Missing Values Analysis

In [None]:
# Null cells per column, and the same figure as a percentage of all rows.
missing_data = df.isnull().sum()
missing_percent = missing_data.div(len(df)).mul(100)

# Restrict the report to columns that actually contain nulls.
has_missing = missing_data > 0
missing_df = pd.DataFrame({
    'Missing_Count': missing_data[has_missing],
    'Percentage': missing_percent[has_missing],
})

print("Missing Values:")
if missing_df.empty:
    print("No missing values!")
else:
    print(missing_df)

    # Visualize: one bar per affected column, height = number of nulls.
    fig = px.bar(missing_df, x=missing_df.index, y='Missing_Count', title='Missing Values')
    fig.show()

## Distribution Analysis

In [None]:
# Plot distributions of numerical columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
print(f"Numerical columns: {list(numeric_cols)}")

if numeric_cols.empty:
    # plt.subplots() rejects a grid with zero rows, so there is nothing to draw.
    print("No numerical columns to plot.")
else:
    # One histogram per numeric column, stacked vertically.
    # squeeze=False makes subplots() always return a 2-D array of Axes; with the
    # default (squeeze=True) a single numeric column yields a bare Axes object
    # and indexing it would raise TypeError.
    fig, axes = plt.subplots(
        len(numeric_cols), 1,
        figsize=(10, 4 * len(numeric_cols)),
        squeeze=False,
    )
    # Flatten the (n, 1) grid so `axes` is a 1-D array, one entry per column.
    axes = axes.ravel()

    for ax, col in zip(axes, numeric_cols):
        # Drop NaNs explicitly: the dataset may contain missing values (see the
        # missing-values analysis) and they carry no information for a histogram.
        ax.hist(df[col].dropna(), bins=50, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

## Correlation Analysis

In [None]:
# Pairwise correlation between the numeric columns, shown as a heatmap.
correlation_matrix = df[numeric_cols].corr()

fig = px.imshow(
    correlation_matrix,
    labels={'color': 'Correlation'},
    x=numeric_cols,
    y=numeric_cols,
    color_continuous_scale='RdBu_r',
)
fig.show()

# List each strongly correlated pair once by walking only the cells above the
# diagonal; the absolute value is tested, so strong negative pairs are listed too.
print("\nHigh Correlations (> 0.7):")
corr_labels = correlation_matrix.columns
for row_pos, row_label in enumerate(corr_labels):
    for col_pos in range(row_pos + 1, len(corr_labels)):
        coeff = correlation_matrix.iloc[row_pos, col_pos]
        if abs(coeff) > 0.7:
            print(f"{row_label} <-> {corr_labels[col_pos]}: {coeff:.3f}")

## Time Series Trends (if applicable)

In [None]:
# Time-series plots are only drawn when the dataset has a 'date' column.
if 'date' in df:
    # Parse the column to datetimes, then order the rows chronologically.
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    df = df.sort_values(by='date')

    # One line chart per numeric column, with the date on the x-axis.
    for metric in numeric_cols:
        chart_title = f'{metric} Over Time'
        fig = px.line(df, x='date', y=metric, title=chart_title)
        fig.show()