# Exploratory Data Analysis (EDA)

This notebook contains the exploratory data analysis for the credit risk model project.

## 1. Data Loading and Basic Statistics

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from src.data.data_loader import DataLoader
from src.data.data_cleaner import DataCleaner

# Configure plotting.
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old 'seaborn' alias, so use whichever name this
# matplotlib installation actually ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
plt.rcParams['figure.figsize'] = (12, 8)  # default size for figures that don't set their own

# Load the raw data through the project's DataLoader.
# NOTE: the path is relative, so it resolves against the kernel's working directory.
data_loader = DataLoader('data/raw/transactions.csv')
df = data_loader.load_data()

# Clean the raw data with the project's DataCleaner; the result is kept under
# a separate name so the raw `df` stays available for comparison.
data_cleaner = DataCleaner()
df_clean = data_cleaner.clean_data(df)

## 2. Data Overview

In [None]:
# Display basic information about the cleaned data
print(f'Number of rows: {len(df_clean)}')
print(f'Number of columns: {len(df_clean.columns)}')

# Display data types.
# The leading/trailing '\n' escapes put a blank line around the heading; a
# literal line break inside a single-quoted string is a SyntaxError.
print('\nData types:\n')
print(df_clean.dtypes)

# Display summary statistics (rich HTML rendering via IPython's display)
print('\nSummary statistics:\n')
display(df_clean.describe())

## 3. Distribution Analysis

In [None]:
# Plot distribution of key numerical features
numerical_features = ['Amount', 'Value', 'TransactionHour', 'TransactionDay', 'TransactionMonth']

# Size the grid from the feature list instead of hard-coding 2x3, so adding or
# removing a feature cannot overflow the grid. Ceiling division gives the row count.
n_cols = 3
n_rows = -(-len(numerical_features) // n_cols)

# squeeze=False always returns a 2-D array of axes, so flatten() is safe even
# for a single row or a single panel.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, numerical_features):
    sns.histplot(df_clean[feature], kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    ax.grid(True, alpha=0.3)

# Hide grid slots that have no feature (e.g. the 6th panel for 5 features);
# otherwise they render as empty axes.
for unused_ax in axes[len(numerical_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Calculate correlation matrix.
# numeric_only=True restricts the computation to numeric columns; since
# pandas 2.0 the default (False) raises if the frame contains non-numeric
# columns such as strings.
corr_matrix = df_clean.corr(numeric_only=True)

# Plot correlation heatmap; center=0 puts the neutral colour of the diverging
# 'coolwarm' map at zero correlation.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

## 5. Missing Values Analysis

In [None]:
# Calculate the percentage of missing values per column, keeping only the
# columns that actually have some.
missing_values = df_clean.isnull().sum() / len(df_clean) * 100
missing_values = missing_values[missing_values > 0]

if missing_values.empty:
    # Nothing to plot: a bar plot of an empty Series fails, and an empty
    # chart would carry no information anyway.
    print('No missing values found in the cleaned data.')
else:
    # Plot missing values
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_values.plot(kind='bar', ax=ax)
    ax.set_title('Percentage of Missing Values')
    ax.set_ylabel('Percentage')
    ax.grid(True, alpha=0.3)
    plt.show()

## 6. Key Insights

### Top 5 Insights

1. **Transaction Amount Distribution:**
   - Highly skewed distribution with many small transactions
   - Potential need for log transformation or binning

2. **Time Patterns:**
   - Clear patterns in transaction timing
   - Potential seasonality in transaction volumes

3. **Correlation Structure:**
   - Strong relationships between monetary metrics
   - Important to consider multicollinearity in modeling

4. **Missing Data:**
   - Certain features have significant missing values
   - Need to implement appropriate imputation strategies

5. **Customer Behavior:**
   - Varied transaction frequencies and amounts
   - Potential to identify different customer segments