In [None]:
# Credit Risk Dataset - Exploratory Data Analysis
# =============================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# 1. Load and Initial Inspection
# ==============================

# The path uses a forward slash on purpose: the previous 'data\c...' spelling
# contained the invalid escape sequence '\c' (a SyntaxWarning on recent Python
# versions) and a backslash is only a path separator on Windows. pandas
# accepts '/' on every platform.
df = pd.read_csv('data/credit_risk_dataset.csv')
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
display(df.head())

print("\nDataset Info:")
df.info()

print("\nBasic Statistics:")
display(df.describe())

# Per-column null counts; only columns that actually have nulls are printed.
print("\nMissing Values:")
missing_data = df.isnull().sum()
print(missing_data[missing_data > 0])

# 2. Target Variable Analysis
# ==========================

# Class counts are computed once and reused for the printout and both panels.
status_counts = df['loan_status'].value_counts()

print("Target Variable - loan_status:")
print(status_counts)
# The mean of loan_status is reported as the default rate (the axis label
# below describes the column as 0=No Default, 1=Default).
print(f"Default Rate: {df['loan_status'].mean():.2%}")

status_fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: absolute counts per class.
status_counts.plot(kind='bar', color=['skyblue', 'salmon'], ax=bar_ax)
bar_ax.set_title('Loan Status Distribution')
bar_ax.set_xlabel('Loan Status (0=No Default, 1=Default)')
bar_ax.set_ylabel('Count')
plt.setp(bar_ax.get_xticklabels(), rotation=0)

# Right panel: the same counts as shares of the whole.
status_counts.plot(kind='pie', autopct='%1.1f%%',
                   colors=['lightblue', 'lightcoral'], ax=pie_ax)
pie_ax.set_title('Loan Status Proportion')

status_fig.tight_layout()
plt.show()

# 3. Numerical Features Analysis
# ==============================

numerical_features = ['person_age', 'person_income', 'person_emp_length',
                      'loan_amnt', 'loan_int_rate', 'loan_percent_income',
                      'cb_person_cred_hist_length']

print("Numerical Features Distribution:")

# Filter to the columns that are really present *before* laying out the grid.
# Checking inside the plotting loop would still consume a grid slot for a
# missing column and leave a blank panel in the middle of the figure.
plot_features = [col for col in numerical_features if col in df.columns]

# Size the grid from the number of panels instead of assuming 3x3, so the
# layout also works when the feature list grows or shrinks.
grid_cols = 3
grid_rows = max(1, -(-len(plot_features) // grid_cols))  # ceiling division
grid_size = (15, 4 * grid_rows)

# Distribution plots: one histogram per numerical feature.
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=grid_size)
axes = axes.ravel()

for ax, col in zip(axes, plot_features):
    df[col].hist(bins=30, ax=ax, alpha=0.7, color='skyblue')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Remove the grid cells that did not receive a plot.
for ax in axes[len(plot_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

# Boxplots by loan status: one panel per numerical feature.
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=grid_size)
axes = axes.ravel()

for ax, col in zip(axes, plot_features):
    df.boxplot(column=col, by='loan_status', ax=ax, grid=False)
    # Override the per-axes title that boxplot(by=...) sets.
    ax.set_title(f'{col} by Loan Status')
    ax.set_xlabel('Loan Status')

# Remove the grid cells that did not receive a plot.
for ax in axes[len(plot_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.suptitle('')  # Remove automatic title
plt.show()

# 4. Categorical Features Analysis
# ================================

categorical_features = [
    'person_home_ownership',
    'loan_intent',
    'loan_grade',
    'cb_person_default_on_file',
]

print("Categorical Features Analysis:")

for col in categorical_features:
    # Frequency of each category, reused for the printout and the left panel.
    category_counts = df[col].value_counts()
    print(f"\n{col}:")
    print(category_counts)

    # One figure per feature: counts on the left, default rate on the right.
    cat_fig, (counts_ax, rates_ax) = plt.subplots(1, 2, figsize=(12, 5))

    category_counts.plot(kind='bar', color='lightseagreen', ax=counts_ax)
    counts_ax.set_title(f'{col} Distribution')
    plt.setp(counts_ax.get_xticklabels(), rotation=45)

    # Mean loan_status per category, highest first.
    default_rates = df.groupby(col)['loan_status'].mean()
    default_rates = default_rates.sort_values(ascending=False)
    default_rates.plot(kind='bar', color='coral', ax=rates_ax)
    rates_ax.set_title(f'Default Rate by {col}')
    plt.setp(rates_ax.get_xticklabels(), rotation=45)
    rates_ax.set_ylabel('Default Rate')

    cat_fig.tight_layout()
    plt.show()

# 5. Correlation Analysis
# =======================

# Restrict the frame to the numerical features plus the target, then compute
# the pairwise correlation matrix.
numerical_df = df[numerical_features + ['loan_status']]
correlation_matrix = numerical_df.corr()

# Boolean mask covering the upper triangle (diagonal included), so the heatmap
# draws each pair only once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

corr_fig, corr_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Correlation Matrix of Numerical Features')
corr_fig.tight_layout()
plt.show()

# 6. Feature-Target Relationships
# ==============================

print("Feature-Target Relationships:")

# Scatter of income against loan amount; each point is coloured by its
# loan_status value.
income_fig, income_ax = plt.subplots(figsize=(10, 6))
scatter = income_ax.scatter(
    df['person_income'],
    df['loan_amnt'],
    c=df['loan_status'],
    alpha=0.6,
    cmap='viridis',
)
income_fig.colorbar(scatter, ax=income_ax,
                    label='Loan Status (0=No Default, 1=Default)')
income_ax.set_xlabel('Person Income')
income_ax.set_ylabel('Loan Amount')
income_ax.set_title('Income vs Loan Amount by Loan Status')
plt.show()

# Box plot of loan_percent_income, split by loan_status.
ratio_fig, ratio_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='loan_status', y='loan_percent_income', data=df, ax=ratio_ax)
ratio_ax.set_title('Loan Percent Income by Loan Status')
ratio_ax.set_xlabel('Loan Status (0=No Default, 1=Default)')
ratio_ax.set_ylabel('Loan Percent Income')
plt.show()

# 7. Advanced Analysis
# ===================

# Age groups analysis
# The last bin is open-ended (np.inf). With a hard upper edge of 100, any age
# above 100 fell outside every bin, became NaN in age_group and was silently
# excluded from the grouped default rate below.
df['age_group'] = pd.cut(df['person_age'], bins=[0, 25, 35, 45, 55, np.inf],
                         labels=['18-25', '26-35', '36-45', '46-55', '55+'])

plt.figure(figsize=(10, 6))
# observed=False is spelled out so every age bin stays on the axis even if it
# is empty, independent of the pandas default for categorical groupers.
default_by_age = df.groupby('age_group', observed=False)['loan_status'].mean()
default_by_age.plot(kind='bar', color='lightcoral')
plt.title('Default Rate by Age Group')
plt.ylabel('Default Rate')
plt.xlabel('Age Group')
plt.xticks(rotation=45)
plt.show()

# Income groups analysis
# qcut with q=5 splits person_income into five quantile-based buckets.
df['income_group'] = pd.qcut(df['person_income'], q=5,
                             labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

plt.figure(figsize=(10, 6))
default_by_income = df.groupby('income_group', observed=False)['loan_status'].mean()
default_by_income.plot(kind='bar', color='lightseagreen')
plt.title('Default Rate by Income Group')
plt.ylabel('Default Rate')
plt.xlabel('Income Group')
plt.xticks(rotation=45)
plt.show()

# 8. Multivariate Analysis
# ========================

# Pairwise plots for a handful of key columns, with points coloured by
# loan_status and KDE curves on the diagonal.
key_features = [
    'person_age',
    'person_income',
    'loan_amnt',
    'loan_int_rate',
    'loan_status',
]
pairplot_frame = df[key_features]
pairplot_options = dict(hue='loan_status', palette='viridis', diag_kind='kde')
sns.pairplot(pairplot_frame, **pairplot_options)
# y=1.02 places the title slightly above the top edge of the figure.
plt.suptitle('Pairplot of Key Features by Loan Status', y=1.02)
plt.show()

# 9. Missing Values Analysis
# ==========================

print("Detailed Missing Values Analysis:")

# The age_group / income_group helper columns may still be attached to df at
# this point. Any NaN they contain comes from binning, not from the source
# data, so they are left out of the report (errors='ignore' keeps this safe
# when the columns are not there).
source_columns = df.drop(columns=['age_group', 'income_group'], errors='ignore')

# Null count per column, computed once and reused for the percentage.
missing_counts = source_columns.isnull().sum()
missing_summary = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing Percentage': missing_counts / len(df) * 100,
})
# Keep only the columns that actually have missing values.
missing_summary = missing_summary[missing_summary['Missing Count'] > 0]
display(missing_summary)

# 10. Key Insights Summary
# ========================

# Clean up temporary columns *before* summarising. Dropping them only at the
# end counted age_group / income_group as two extra features and added any NaN
# produced by the binning to the missing-value total.
df = df.drop(['age_group', 'income_group'], axis=1, errors='ignore')

print("="*50)
print("KEY INSIGHTS SUMMARY")
print("="*50)

print(f"1. Dataset size: {df.shape[0]:,} records, {df.shape[1]} features")
print(f"2. Overall default rate: {df['loan_status'].mean():.2%}")
print(f"3. Missing values: {df.isnull().sum().sum()} total missing values")

# Top risk factors: for each categorical column, the category with the highest
# mean loan_status.
print("\n4. Top Risk Factors:")
risk_dimensions = (
    ('home ownership', 'person_home_ownership'),
    ('loan intent', 'loan_intent'),
    ('loan grade', 'loan_grade'),
)
for description, column in risk_dimensions:
    riskiest = df.groupby(column)['loan_status'].mean().idxmax()
    print(f"   - Highest default rate by {description}: {riskiest}")

# Correlation with target
corr_with_target = numerical_df.corr()['loan_status'].sort_values(ascending=False)

# Rank features by the *magnitude* of their correlation. Slicing the
# descending sort only surfaced positive correlations and relied on the target
# sitting in first place; here the target is removed by label and strong
# negative relationships are reported as well.
feature_corr = corr_with_target.drop('loan_status')
strongest = feature_corr.abs().sort_values(ascending=False).index[:3]

print("\n5. Top correlations with loan_status:")
for feature in strongest:
    print(f"   - {feature}: {feature_corr[feature]:.3f}")

print("\n6. Data Quality Recommendations:")
print("   - Handle missing values in person_emp_length and loan_int_rate")
print("   - Consider feature engineering for age and income groups")
print("   - Monitor high-risk categories (certain home ownership types, loan intents)")