# Exploratory Data Analysis (EDA)
## Loan Approval Prediction Dataset

This notebook performs comprehensive EDA on the loan dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: seaborn's 'whitegrid' theme and a
# 12x6 default figure size for every matplotlib figure created later
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Build the working DataFrame with the project's sample-data generator
from main import generate_sample_data

# presumably the number of rows to generate -- confirm against
# main.generate_sample_data
n_samples = 1000
df = generate_sample_data(n_samples)

# Report the shape, then leave head() as the final expression so the
# notebook renders a preview of the first rows
print(f"Dataset Shape: {df.shape}")
df.head()

## 2. Basic Information

In [None]:
# Structural overview printed by DataFrame.info()
print("Dataset Info:")
df.info()

# Summary statistics from DataFrame.describe(); kept as the cell's final
# expression so the notebook renders the resulting table
print("\nBasic Statistics:")
basic_stats = df.describe()
basic_stats

## 3. Missing Values Analysis

In [None]:
# Missing values: count and percentage of null entries per column
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_df = pd.DataFrame({'Missing': missing, 'Percentage': missing_pct})
# Keep only the columns that actually contain nulls, worst first
missing_df = missing_df[missing_df['Missing'] > 0].sort_values('Missing', ascending=False)

print("Missing Values:")
if missing_df.empty:
    # State the result explicitly rather than printing an
    # "Empty DataFrame" dump, and skip the (empty) chart
    print("No missing values found.")
else:
    print(missing_df)

    # Visualize the share of missing entries per affected column
    plt.figure(figsize=(10, 5))
    sns.barplot(x=missing_df.index, y=missing_df['Percentage'])
    plt.title('Missing Values Percentage')
    plt.ylabel('Percentage')
    plt.xticks(rotation=45)
    plt.show()

## 4. Target Variable Distribution

In [None]:
# Class balance of the target column, as raw counts and as percentages
status_counts = df['Loan_Status'].value_counts()
status_pct = df['Loan_Status'].value_counts(normalize=True) * 100

print("Loan Status Distribution:")
print(status_counts)
print("\nPercentage:")
print(status_pct)

# Side-by-side view: bar chart of counts (left), pie chart of shares (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
count_ax, pie_ax = axes

# Left panel: number of rows per class
sns.countplot(data=df, x='Loan_Status', ax=count_ax)
count_ax.set_title('Loan Status Distribution')
count_ax.set_xlabel('Loan Status')
count_ax.set_ylabel('Count')

# Right panel: class proportions; the y-label is blanked because the pie
# plot would otherwise label the axis with the series name
status_counts.plot.pie(autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('Loan Status Proportion')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Categorical (object-typed) feature columns, excluding the target.
# Filtering instead of list.remove() avoids a ValueError when
# 'Loan_Status' is not an object-typed column.
cat_cols = [
    col for col in df.select_dtypes(include=['object']).columns
    if col != 'Loan_Status'
]

if cat_cols:
    # Size the grid from the number of columns rather than hard-coding 3x2,
    # so any number of categorical features fits (4 inches of height per row
    # reproduces the original 15x12 figure for six features)
    n_grid_cols = 2
    n_grid_rows = (len(cat_cols) + n_grid_cols - 1) // n_grid_cols
    fig, axes = plt.subplots(
        n_grid_rows, n_grid_cols,
        figsize=(15, 4 * n_grid_rows),
        squeeze=False,  # always a 2-D array, even for a single row
    )
    axes = axes.ravel()

    # One count plot per feature, split by loan status
    for ax, col in zip(axes, cat_cols):
        sns.countplot(data=df, x=col, hue='Loan_Status', ax=ax)
        ax.set_title(f'{col} vs Loan Status')
        ax.tick_params(axis='x', rotation=45)

    # Hide any leftover panel when the feature count is odd
    for ax in axes[len(cat_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()
else:
    print("No categorical feature columns to plot.")

## 6. Numerical Features Analysis

In [None]:
# Numeric feature columns, in DataFrame column order
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Histograms for the first four numeric columns, one per panel of a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, num_cols[:4]):
    df[col].hist(bins=30, ax=ax, edgecolor='black')
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 7. Correlation Analysis

In [None]:
# Encode categorical (object-typed) columns as numeric codes so they can
# take part in the correlation matrix. Work on a copy so `df` is untouched.
# NOTE: the codes follow the category order chosen by pd.Categorical, so
# correlations for columns with more than two categories depend on that
# arbitrary ordering and should be read with care.
df_encoded = df.copy()
for col in df_encoded.select_dtypes(include=['object']).columns:
    # astype() returns a fresh writable float array
    codes = pd.Categorical(df_encoded[col]).codes.astype('float64')
    # pd.Categorical marks missing values with code -1; turn them back into
    # NaN so corr() leaves them out instead of treating "missing" as a
    # real, lowest-valued category
    codes[codes < 0] = np.nan
    df_encoded[col] = codes

# Correlation matrix
plt.figure(figsize=(12, 10))
sns.heatmap(df_encoded.corr(), annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()

## 8. Key Insights

### Summary:
1. **Target Imbalance**: The dataset shows class imbalance, so resampling (e.g. SMOTE) is needed before modeling
2. **Credit History**: Strong predictor of loan approval
3. **Income**: Higher income correlates with approval
4. **Missing Values**: Minimal missing data (~5%)
5. **Feature Relationships**: Some features are correlated with each other (see the heatmap in Section 7)

### Next Steps:
- Handle imbalanced data
- Feature engineering
- Model training with multiple algorithms