# Step 2: Exploratory Data Analysis (EDA)

## Goal
Understand the data's structure, distributions, and relationships, and identify potential issues.

## Important: DO NOT remove outliers yet!
We're just exploring and identifying them. Outlier removal happens in Step 3 (Preprocessing).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Shared plotting defaults for every figure produced in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Project layout, resolved from the parent of the current working directory
project_root = Path().resolve().parent
data_raw_dir = project_root.joinpath("data", "raw")
reports_dir = project_root.joinpath("reports")
# Create the output folder (and any missing parents) if it is not there yet
reports_dir.mkdir(parents=True, exist_ok=True)

print("EDA Setup Complete!")

## 1. Load BIST Stock Data

Loading the locally saved BIST stock prices data.

In [None]:
# Load BIST stock data from the raw-data folder
stock_file = data_raw_dir / "bist_stock_prices.csv"

if not stock_file.exists():
    # Nothing to load: report where the file was expected and how to create it.
    print("‚ùå Error: bist_stock_prices.csv not found!")
    print(f"   Expected location: {stock_file}")
    print("   Please run 01_data_collection.ipynb first to download the data.")
else:
    # Parse the Date column, then order rows chronologically with a fresh index.
    raw_prices = pd.read_csv(stock_file)
    raw_prices['Date'] = pd.to_datetime(raw_prices['Date'])
    df = raw_prices.sort_values('Date').reset_index(drop=True)

    print(f"‚úÖ Successfully loaded BIST stock data!")
    print(f"   Dataset shape: {df.shape}")
    print(f"   Columns: {df.columns.tolist()}")
    print(f"   Date range: {df['Date'].min()} to {df['Date'].max()}")

    # Ticker details are only reported when the file carries a Ticker column.
    if 'Ticker' in df.columns:
        ticker_col = df['Ticker']
        print(f"   Tickers: {ticker_col.unique()}")
        print(f"   Number of unique tickers: {ticker_col.nunique()}")

    print("\nFirst few rows:")
    display(df.head(10))

## 2. Data Overview & Basic Statistics

In [None]:
# Basic information: structure, summary statistics, latest rows and dtypes
print("="*60)
print("DATA OVERVIEW")
print("="*60)

print("\nüìä Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

print("\nüìà Statistical Summary:")
display(df.describe())

print("\nüìã Sample Data (Last 10 rows):")
display(df.tail(10))

# Check data types
print("\nüîç Data Types:")
print(df.dtypes)

## 3. Missing Values Analysis

Checking for missing or null values in the dataset.

In [None]:
# Per-column missing-value counts and their share of all rows
null_mask = df.isnull()
missing_count = null_mask.sum()
missing_pct = missing_count / len(df) * 100

# Keep only columns that actually have gaps, worst offenders first
missing_df = (
    pd.DataFrame({
        'Column': missing_count.index,
        'Missing Count': missing_count.values,
        'Missing %': missing_pct.values,
    })
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)

if missing_df.empty:
    print("‚úÖ No missing values found!")
else:
    print("‚ö†Ô∏è  Missing Values Found:")
    print(missing_df)

# Visualize missing values (one heatmap cell per row/column entry)
if missing_count.sum() > 0:
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, yticklabels=False, cbar=True, cmap='viridis')
    plt.title('Missing Values Heatmap', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(reports_dir / 'missing_values_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    print("\n‚úÖ No missing values to visualize!")

## 4. Time Series Visualization - Price Trends

Visualizing stock price movements over time.

In [None]:
# Closing price over time, with the daily High-Low band shaded behind it
if 'Ticker' not in df.columns:
    # Single ticker or index: one chart for the whole frame
    df_sorted = df.sort_values('Date')
    fig, ax = plt.subplots(figsize=(14, 6))
    ax.plot(df_sorted['Date'], df_sorted['Close'],
            linewidth=1.5, label='Close Price', color='#2E86AB')
    ax.fill_between(df_sorted['Date'], df_sorted['Low'], df_sorted['High'],
                    alpha=0.3, label='High-Low Range', color='#A23B72')
    ax.set_title('BIST Stock Price Trend Over Time', fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Price (TRY)', fontsize=12)
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(reports_dir / 'price_trend.png', dpi=300, bbox_inches='tight')
    plt.show()
else:
    # Multiple tickers: one stacked panel per ticker
    tickers = df['Ticker'].unique()
    n_tickers = len(tickers)

    fig, axes = plt.subplots(n_tickers, 1, figsize=(14, 5*n_tickers))
    # With a single row, subplots() hands back one Axes instead of an array
    if n_tickers == 1:
        axes = [axes]

    for panel, ticker in zip(axes, tickers):
        ticker_data = df[df['Ticker'] == ticker].copy()
        ticker_data = ticker_data.sort_values('Date')
        dates = ticker_data['Date']

        panel.plot(dates, ticker_data['Close'], linewidth=1.5, label='Close Price')
        panel.fill_between(dates, ticker_data['Low'], ticker_data['High'],
                           alpha=0.3, label='High-Low Range')
        panel.set_title(f'{ticker} - Price Trend Over Time', fontsize=14, fontweight='bold')
        panel.set_xlabel('Date', fontsize=12)
        panel.set_ylabel('Price (TRY)', fontsize=12)
        panel.legend()
        panel.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(reports_dir / 'price_trends.png', dpi=300, bbox_inches='tight')
    plt.show()

# Trading volume over time (one line per ticker when a Ticker column exists)
if 'Volume' in df.columns:
    has_tickers = 'Ticker' in df.columns
    fig, ax = plt.subplots(figsize=(14, 6))

    if has_tickers:
        for ticker in df['Ticker'].unique():
            ticker_data = df[df['Ticker'] == ticker].sort_values('Date')
            ax.plot(ticker_data['Date'], ticker_data['Volume'], label=ticker, alpha=0.7)
    else:
        df_sorted = df.sort_values('Date')
        ax.plot(df_sorted['Date'], df_sorted['Volume'], color='#F18F01', linewidth=1.5)

    ax.set_title('Trading Volume Over Time', fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Volume', fontsize=12)
    # A legend is only meaningful when several labelled lines were drawn
    if has_tickers:
        ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_yscale('log')  # Log scale for better visualization
    plt.tight_layout()
    plt.savefig(reports_dir / 'volume_trend.png', dpi=300, bbox_inches='tight')
    plt.show()

## 5. Returns Analysis & Volatility

Calculating daily returns and analyzing price volatility.

In [None]:
# Daily returns and 30-day rolling volatility, computed on a copy of df
df_analysis = df.copy()

if 'Ticker' in df_analysis.columns:
    # Group by ticker so neither metric is computed across two different tickers
    df_analysis['Daily_Return'] = df_analysis.groupby('Ticker')['Close'].pct_change()
    rolling_std = (
        df_analysis.groupby('Ticker')['Daily_Return']
        .rolling(window=30)
        .std()
    )
    # Drop the Ticker index level so the values align with the original rows
    df_analysis['Volatility_30d'] = rolling_std.reset_index(level=0, drop=True)
else:
    df_analysis['Daily_Return'] = df_analysis['Close'].pct_change()
    # Volatility = rolling 30-row standard deviation of the daily returns
    df_analysis['Volatility_30d'] = df_analysis['Daily_Return'].rolling(window=30).std()

# Two stacked panels: returns through time (top) and their histogram (bottom)
fig, axes = plt.subplots(2, 1, figsize=(14, 10))
series_ax, hist_ax = axes
multi_ticker = 'Ticker' in df_analysis.columns

# Returns time series, expressed in percent
if multi_ticker:
    for ticker in df_analysis['Ticker'].unique():
        ticker_data = df_analysis[df_analysis['Ticker'] == ticker].sort_values('Date')
        series_ax.plot(ticker_data['Date'], ticker_data['Daily_Return'] * 100,
                       label=ticker, alpha=0.7, linewidth=1)
else:
    df_sorted = df_analysis.sort_values('Date')
    series_ax.plot(df_sorted['Date'], df_sorted['Daily_Return'] * 100,
                   color='#2E86AB', linewidth=1)

series_ax.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
series_ax.set_title('Daily Returns Over Time', fontsize=14, fontweight='bold')
series_ax.set_xlabel('Date', fontsize=12)
series_ax.set_ylabel('Daily Return (%)', fontsize=12)
if multi_ticker:
    series_ax.legend()
series_ax.grid(True, alpha=0.3)

# Returns distribution over every non-null return in the frame
returns_data = df_analysis['Daily_Return'].dropna()
hist_ax.hist(returns_data * 100, bins=100, edgecolor='black', alpha=0.7, color='#A23B72')
hist_ax.axvline(x=0, color='red', linestyle='--', linewidth=1.5, label='Zero Return')
hist_ax.set_title('Distribution of Daily Returns', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Daily Return (%)', fontsize=12)
hist_ax.set_ylabel('Frequency', fontsize=12)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / 'returns_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Returns statistics
banner = "="*60
print(banner)
print("RETURNS STATISTICS")
print(banner)

# Figures reported in percent; the leading newline on the first label
# separates the numbers from the banner.
for label, stat in (
    ("\nMean Daily Return", returns_data.mean()),
    ("Std Dev (Volatility)", returns_data.std()),
    ("Min Daily Return", returns_data.min()),
    ("Max Daily Return", returns_data.max()),
):
    print(f"{label}: {stat*100:.4f}%")

# Shape statistics are unitless and printed as-is
for label, stat in (
    ("Skewness", returns_data.skew()),
    ("Kurtosis", returns_data.kurtosis()),
):
    print(f"{label}: {stat:.4f}")

# Volatility plot -- skipped when no rolling window produced a value
if df_analysis['Volatility_30d'].notna().any():
    multi_ticker = 'Ticker' in df_analysis.columns
    fig, ax = plt.subplots(figsize=(14, 6))

    if multi_ticker:
        for ticker in df_analysis['Ticker'].unique():
            ticker_data = df_analysis[df_analysis['Ticker'] == ticker].sort_values('Date')
            ax.plot(ticker_data['Date'], ticker_data['Volatility_30d'] * 100,
                    label=ticker, alpha=0.7)
    else:
        df_sorted = df_analysis.sort_values('Date')
        ax.plot(df_sorted['Date'], df_sorted['Volatility_30d'] * 100,
                color='#F18F01', linewidth=1.5)

    ax.set_title('30-Day Rolling Volatility', fontsize=16, fontweight='bold')
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Volatility (%)', fontsize=12)
    if multi_ticker:
        ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(reports_dir / 'volatility_trend.png', dpi=300, bbox_inches='tight')
    plt.show()

## 6. Price Distributions & Outlier Detection

Analyzing price distributions and identifying potential outliers.

In [None]:
# Histograms for whichever OHLC price columns are present
price_cols = ['Open', 'High', 'Low', 'Close']
available_cols = [col for col in price_cols if col in df.columns]

if available_cols:
    n_cols = 2
    n_rows = -(-len(available_cols) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
    axes = axes.flatten()

    for panel, col in zip(axes, available_cols):
        df[col].hist(bins=50, ax=panel, edgecolor='black', alpha=0.7, color='#2E86AB')
        panel.set_title(f'Distribution of {col}', fontsize=12, fontweight='bold')
        panel.set_xlabel(f'{col} (TRY)', fontsize=11)
        panel.set_ylabel('Frequency', fontsize=11)
        panel.grid(True, alpha=0.3)

    # Hide grid slots left over when the column count is odd
    for spare in axes[len(available_cols):]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.savefig(reports_dir / 'price_distributions.png', dpi=300, bbox_inches='tight')
    plt.show()

# Box plots plus an IQR-based count of outliers (identified only, not removed)
numerical_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
available_num_cols = [col for col in numerical_cols if col in df.columns]

if available_num_cols:
    n_cols = 2
    n_rows = -(-len(available_num_cols) // n_cols)  # ceiling division

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows))
    axes = axes.flatten()

    for panel, col in zip(axes, available_num_cols):
        sns.boxplot(y=df[col], ax=panel, color='#A23B72')
        panel.set_title(f'Box Plot: {col} (Outlier Detection)', fontsize=12, fontweight='bold')
        panel.set_ylabel(f'{col}', fontsize=11)
        panel.grid(True, alpha=0.3)

    # Hide grid slots that have no column to show
    for spare in axes[len(available_num_cols):]:
        spare.set_visible(False)

    plt.tight_layout()
    plt.savefig(reports_dir / 'outliers_boxplots.png', dpi=300, bbox_inches='tight')
    plt.show()

    # IQR method: a value is an outlier when it lies more than 1.5 * IQR
    # below the first quartile or above the third quartile.
    total_rows = len(df)
    outlier_summary = []
    for col in available_num_cols:
        values = df[col]
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # NaN compares False on both sides, so missing values are not counted
        outlier_count = int(((values < lower_bound) | (values > upper_bound)).sum())
        outlier_summary.append({
            'Feature': col,
            'Outlier Count': outlier_count,
            'Outlier %': (outlier_count / total_rows) * 100,
            'Lower Bound': lower_bound,
            'Upper Bound': upper_bound,
        })

    outlier_df = pd.DataFrame(outlier_summary)
    print("\n" + "="*60)
    print("OUTLIER SUMMARY (IQR Method)")
    print("="*60)
    display(outlier_df)

## 7. Correlation Analysis

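Analyzing relationships between different price metrics. Note: the code cell for this section (the correlation matrix) appears at the end of the notebook, after the summary.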

## 7. Technical Indicators Analysis

Calculating and visualizing technical indicators for BIST-100 stock analysis.

In [None]:
# Calculate Technical Indicators
# sort_values() returns a new frame, so df itself is never modified here.
df_tech = df.sort_values('Date').reset_index(drop=True)

# Indicators are computed on a single series: prefer the BIST-100 index
# (XU100.IS); otherwise fall back to the ticker of the earliest row.
if 'Ticker' in df_tech.columns:
    selected_ticker = (
        'XU100.IS'
        if 'XU100.IS' in df_tech['Ticker'].unique()
        else df_tech['Ticker'].iloc[0]
    )
    df_tech = df_tech[df_tech['Ticker'] == selected_ticker].copy().reset_index(drop=True)

close = df_tech['Close']

# Moving Averages: simple (rolling mean) and exponential
for window in (20, 50, 200):
    df_tech[f'SMA_{window}'] = close.rolling(window=window).mean()
for span in (12, 26):
    df_tech[f'EMA_{span}'] = close.ewm(span=span, adjust=False).mean()

# MACD (Moving Average Convergence Divergence): fast EMA minus slow EMA,
# a 9-span EMA of that line as the signal, and their difference as histogram
macd_line = df_tech['EMA_12'] - df_tech['EMA_26']
signal_line = macd_line.ewm(span=9, adjust=False).mean()
df_tech['MACD'] = macd_line
df_tech['MACD_Signal'] = signal_line
df_tech['MACD_Histogram'] = macd_line - signal_line

# RSI (Relative Strength Index)
def calculate_rsi(prices, period=14):
    """Return the Relative Strength Index of *prices* as a Series.

    Average gain and average loss are simple rolling means over ``period``
    price changes (not Wilder's exponential smoothing).

    Args:
        prices: pandas Series of prices in chronological order.
        period: number of consecutive price changes averaged per value.

    Returns:
        Series aligned with ``prices``. The first ``period`` entries are NaN
        because a full window of ``period`` price changes is not available
        until then. A window with gains and no losses evaluates to 100; a
        window with no movement at all yields NaN (0 / 0).
    """
    delta = prices.diff()
    # clip() preserves NaN, so the undefined first change (and any change next
    # to a missing price) keeps its window NaN instead of being counted as a
    # fake "no change" observation.
    gain = delta.clip(lower=0).rolling(window=period).mean()
    loss = delta.clip(upper=0).abs().rolling(window=period).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

# 14-period RSI of the closing price
df_tech['RSI'] = calculate_rsi(df_tech['Close'], period=14)

# Bollinger Bands: 20-row mean of Close, plus/minus two rolling standard deviations
close_window = df_tech['Close'].rolling(window=20)
df_tech['BB_Middle'] = close_window.mean()
bb_std = close_window.std()
df_tech['BB_Upper'] = df_tech['BB_Middle'] + (bb_std * 2)
df_tech['BB_Lower'] = df_tech['BB_Middle'] - (bb_std * 2)
df_tech['BB_Width'] = df_tech['BB_Upper'] - df_tech['BB_Lower']
# Position of Close inside the band: 0 at the lower band, 1 at the upper band
df_tech['BB_Position'] = (df_tech['Close'] - df_tech['BB_Lower']) / df_tech['BB_Width']

# ATR (Average True Range) - Volatility indicator
# True range = largest of: High-Low, |High - previous Close|, |Low - previous Close|
previous_close = df_tech['Close'].shift()
df_tech['High_Low'] = df_tech['High'] - df_tech['Low']
df_tech['High_Close'] = (df_tech['High'] - previous_close).abs()
df_tech['Low_Close'] = (df_tech['Low'] - previous_close).abs()
df_tech['True_Range'] = df_tech[['High_Low', 'High_Close', 'Low_Close']].max(axis=1)
df_tech['ATR_14'] = df_tech['True_Range'].rolling(window=14).mean()

# Price-based indicators
df_tech['Price_Change'] = df_tech['Close'].diff()
df_tech['Price_Change_Pct'] = df_tech['Close'].pct_change() * 100
df_tech['High_Low_Pct'] = ((df_tech['High'] - df_tech['Low']) / df_tech['Close']) * 100

# Volume indicators (only when the data has a Volume column)
if 'Volume' in df_tech.columns:
    volume_mean_20 = df_tech['Volume'].rolling(window=20).mean()
    df_tech['Volume_SMA_20'] = volume_mean_20
    df_tech['Volume_Ratio'] = df_tech['Volume'] / volume_mean_20

print("‚úÖ Technical indicators calculated!")
print(f"\nNew features added:")
# Everything present in df_tech but not in the raw frame was added above
tech_features = [name for name in df_tech.columns if name not in df.columns]
print(f"   - {len(tech_features)} technical indicators")
print(f"   - Features: {', '.join(tech_features[:10])}...")

In [None]:
# Visualize Technical Indicators: four stacked panels saved as one figure
fig, axes = plt.subplots(4, 1, figsize=(16, 14))

# Label the figure with the series that is actually in df_tech. The indicator
# cell falls back to another ticker when XU100.IS is absent, so a hard-coded
# "BIST-100" title would mislabel the saved chart in that case.
if 'Ticker' in df_tech.columns and len(df_tech) > 0:
    plotted_ticker = df_tech['Ticker'].iloc[0]
    series_label = 'BIST-100' if plotted_ticker == 'XU100.IS' else str(plotted_ticker)
else:
    # No ticker information available: keep the original label
    series_label = 'BIST-100'

# 1. Price with Moving Averages
ax1 = axes[0]
ax1.plot(df_tech['Date'], df_tech['Close'], label='Close Price', linewidth=2, color='#2E86AB')
ax1.plot(df_tech['Date'], df_tech['SMA_20'], label='SMA 20', linewidth=1.5, alpha=0.7, color='#F18F01')
ax1.plot(df_tech['Date'], df_tech['SMA_50'], label='SMA 50', linewidth=1.5, alpha=0.7, color='#A23B72')
ax1.plot(df_tech['Date'], df_tech['SMA_200'], label='SMA 200', linewidth=1.5, alpha=0.7, color='#06A77D')
ax1.fill_between(df_tech['Date'], df_tech['BB_Upper'], df_tech['BB_Lower'], 
                 alpha=0.2, label='Bollinger Bands', color='gray')
ax1.set_title(f'{series_label}: Price with Moving Averages & Bollinger Bands', fontsize=14, fontweight='bold')
ax1.set_ylabel('Price (TRY)', fontsize=12)
ax1.legend(loc='best', fontsize=10)
ax1.grid(True, alpha=0.3)

# 2. MACD line, signal line and their difference as bars
ax2 = axes[1]
ax2.plot(df_tech['Date'], df_tech['MACD'], label='MACD', linewidth=1.5, color='#2E86AB')
ax2.plot(df_tech['Date'], df_tech['MACD_Signal'], label='Signal Line', linewidth=1.5, color='#F18F01')
ax2.bar(df_tech['Date'], df_tech['MACD_Histogram'], label='Histogram', alpha=0.6, color='#A23B72')
ax2.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
ax2.set_title('MACD (Moving Average Convergence Divergence)', fontsize=14, fontweight='bold')
ax2.set_ylabel('MACD', fontsize=12)
ax2.legend(loc='best', fontsize=10)
ax2.grid(True, alpha=0.3)

# 3. RSI with the 70 / 30 reference levels shaded
ax3 = axes[2]
ax3.plot(df_tech['Date'], df_tech['RSI'], label='RSI', linewidth=1.5, color='#2E86AB')
ax3.axhline(y=70, color='red', linestyle='--', linewidth=1, label='Overbought (70)')
ax3.axhline(y=30, color='green', linestyle='--', linewidth=1, label='Oversold (30)')
ax3.fill_between(df_tech['Date'], 70, 100, alpha=0.2, color='red')
ax3.fill_between(df_tech['Date'], 0, 30, alpha=0.2, color='green')
ax3.set_title('RSI (Relative Strength Index)', fontsize=14, fontweight='bold')
ax3.set_ylabel('RSI', fontsize=12)
ax3.set_ylim(0, 100)
ax3.legend(loc='best', fontsize=10)
ax3.grid(True, alpha=0.3)

# 4. ATR (Volatility)
ax4 = axes[3]
ax4.plot(df_tech['Date'], df_tech['ATR_14'], label='ATR (14-day)', linewidth=1.5, color='#A23B72')
ax4.set_title('ATR - Average True Range (Volatility Indicator)', fontsize=14, fontweight='bold')
ax4.set_xlabel('Date', fontsize=12)
ax4.set_ylabel('ATR', fontsize=12)
ax4.legend(loc='best', fontsize=10)
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(reports_dir / 'technical_indicators.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Technical indicators visualization saved!")

## 8. Summary & Key Insights

Documenting findings from the exploratory data analysis.

In [None]:
# Summary of key findings, recomputed from df so this cell can run on its own
print("="*60)
print("EDA SUMMARY & KEY INSIGHTS")
print("="*60)

print(f"\n1. DATA STRUCTURE:")
print(f"   - Total records: {len(df):,}")
print(f"   - Date range: {df['Date'].min().date()} to {df['Date'].max().date()}")
print(f"   - Number of trading days: {df['Date'].nunique():,}")
if 'Ticker' in df.columns:
    print(f"   - Number of tickers: {df['Ticker'].nunique()}")

print(f"\n2. MISSING VALUES:")
missing_total = df.isnull().sum().sum()
if missing_total == 0:
    print(f"   ‚úÖ No missing values found")
else:
    print(f"   ‚ö†Ô∏è  Total missing values: {missing_total}")

# Calculate returns for summary.
# Deliberately best-effort: any failure (for example a missing 'Close'
# column) is reported in the summary instead of aborting the cell.
try:
    temp_df = df.copy()
    if 'Ticker' in temp_df.columns:
        # Per-ticker returns so a change never spans two different tickers
        temp_df['Daily_Return'] = temp_df.groupby('Ticker')['Close'].pct_change()
    else:
        temp_df['Daily_Return'] = temp_df['Close'].pct_change()
    
    returns_data = temp_df['Daily_Return'].dropna()
    if len(returns_data) > 0:
        print(f"\n3. RETURNS CHARACTERISTICS:")
        print(f"   - Mean daily return: {returns_data.mean()*100:.4f}%")
        print(f"   - Volatility (std dev): {returns_data.std()*100:.4f}%")
        print(f"   - Skewness: {returns_data.skew():.4f}")
        print(f"   - Kurtosis: {returns_data.kurtosis():.4f}")
except Exception as e:
    print(f"\n3. RETURNS CHARACTERISTICS:")
    print(f"   ‚ö†Ô∏è  Could not calculate returns: {str(e)}")

print(f"\n4. PRICE RANGES:")
if 'Close' in df.columns:
    print(f"   - Min Close Price: {df['Close'].min():.2f} TRY")
    print(f"   - Max Close Price: {df['Close'].max():.2f} TRY")
    print(f"   - Mean Close Price: {df['Close'].mean():.2f} TRY")

if 'Volume' in df.columns:
    print(f"\n5. VOLUME STATISTICS:")
    print(f"   - Mean Volume: {df['Volume'].mean():,.0f}")
    print(f"   - Max Volume: {df['Volume'].max():,.0f}")
    print(f"   - Min Volume: {df['Volume'].min():,.0f}")

print(f"\n6. DATA QUALITY:")
# Only claim the data is clean when the missing-value check above found
# nothing; otherwise the summary would contradict its own section 2.
if missing_total == 0:
    print(f"   ‚úÖ Data appears to be clean and ready for preprocessing")
else:
    print(f"   ‚ö†Ô∏è  {missing_total} missing values found - handle them during preprocessing")
print(f"   üìä All visualizations saved to: {reports_dir}")

print("\n" + "="*60)
print("‚úÖ EDA COMPLETE!")
print("="*60)
print("\nüìã Next Steps:")
print("   1. Review the visualizations and insights above")
print("   2. Proceed to data preprocessing (if needed)")
print("   3. Consider feature engineering for ML models")
print("\nüí° All plots have been saved to the reports/ folder")

In [None]:
# Pairwise correlation of the price/volume columns that are present
price_features = ['Open', 'High', 'Low', 'Close', 'Volume']
available_features = [col for col in price_features if col in df.columns]

if len(available_features) <= 1:
    # A correlation needs at least two columns
    print("‚ö†Ô∏è  Not enough features for correlation analysis")
else:
    corr_matrix = df[available_features].corr()

    # Annotated heatmap on a fixed -1..1 colour scale centred at zero
    heatmap_options = dict(
        annot=True, fmt='.3f', cmap='coolwarm', center=0,
        square=True, linewidths=1, cbar_kws={"shrink": 0.8},
        vmin=-1, vmax=1,
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Correlation Matrix of Price Features', fontsize=16, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig(reports_dir / 'correlation_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("\n" + "="*60)
    print("CORRELATION MATRIX")
    print("="*60)
    display(corr_matrix)