# Loan Default Prediction - Exploratory Data Analysis

This notebook performs an exploratory data analysis (EDA) of the Loan Default dataset, examining features, distributions, relationships, and preparing insights for model development.

## 1. Import Libraries

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

# Statistical libraries
from scipy import stats
from scipy.stats import chi2_contingency

# Display settings: show every column, cap printed rows at 100, two-decimal floats
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(option_name, option_value)

# Suppress all warnings for the rest of the session
warnings.filterwarnings('ignore')

# Set plot style and size
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'axes.labelsize': 12,
})
sns.set_palette('viridis')

## 2. Load and Examine the Dataset

### 2.1 Load the Dataset and Display Basic Information

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# Candidate dataset locations, checked in order. The two names differ only in
# case, which matters on case-sensitive filesystems.
data_path = '../data/Loan_Default.csv'
alternative_path = '../data/loan_default.csv'

# Defined up front so later cells can safely test `df is not None`
# even when the file is missing or cannot be parsed.
df = None

# First candidate that exists on disk, or None when neither does
actual_path = next(
    (path for path in (data_path, alternative_path) if os.path.exists(path)),
    None,
)
file_exists = actual_path is not None

if actual_path:
    print(f"✅ Dataset found at: {os.path.abspath(actual_path)}")

    # Get file size in MB
    file_size_bytes = os.path.getsize(actual_path)
    file_size_mb = file_size_bytes / (1024 * 1024)
    print(f"File size: {file_size_mb:.2f} MB")

    # Try encodings from strictest to most permissive. latin1 (= ISO-8859-1)
    # can decode any byte sequence, so it must come last; anything listed
    # after it would never be tried.
    encodings_to_try = ['utf-8', 'cp1252', 'latin1']

    for encoding in encodings_to_try:
        try:
            df = pd.read_csv(actual_path, encoding=encoding)
            print(f"Successfully loaded dataset with encoding: {encoding}")
            break
        except UnicodeDecodeError:
            print(f"Failed to load with encoding: {encoding}")
        except Exception as e:
            # Non-encoding failures (parse errors, permissions, ...) will not be
            # fixed by trying another codec, so stop here.
            print(f"Error loading dataset: {str(e)}")
            break

    if df is not None:
        # Display basic information
        print(f"\nDataset shape: {df.shape[0]} rows and {df.shape[1]} columns")
        print("\nFirst 5 rows:")
        display(df.head())

        print("\nColumn list:")
        for i, col in enumerate(df.columns, 1):
            print(f"{i}. {col}")
    else:
        print("❌ Failed to load the dataset with any encoding.")
else:
    print("❌ Dataset not found.")
    print("To download the dataset manually from Kaggle:")
    print("1. Visit https://www.kaggle.com/datasets/yasserh/loan-default-dataset")
    print("2. Click 'Download' button")
    print("3. Save the file to the 'data' folder as 'Loan_Default.csv'")

    print("\nTo download using Kaggle CLI:")
    print("1. Install Kaggle CLI: pip install kaggle")
    print("2. Set up your Kaggle API token in ~/.kaggle/kaggle.json")
    print("3. Run the following command:")
    print("   kaggle datasets download -d yasserh/loan-default-dataset --path ../data")
    print("4. Unzip the downloaded file:")
    print("   Expand-Archive -Path ../data/loan-default-dataset.zip -DestinationPath ../data")
    print("5. Verify the file exists")

In [None]:
if df is not None:
    # Print a two-part text report: quick overview, then file-level facts
    divider = "=" * 50
    print("\n" + divider)
    print("DATASET SUMMARY")
    print(divider)

    # How many columns there are of each dtype
    type_counts = df.dtypes.value_counts()
    print("Data types in dataset:")
    for kind, n_columns in type_counts.items():
        print(f"- {kind}: {n_columns} columns")

    # In-memory footprint, including the contents of object columns
    memory_usage = df.memory_usage(deep=True).sum() / (1024 * 1024)
    print(f"\nMemory usage: {memory_usage:.2f} MB")

    # Columns that contain at least one null
    missing_values = df.isnull().sum()
    missing_cols = missing_values[missing_values > 0]
    if len(missing_cols) == 0:
        print("\nNo missing values found in the dataset.")
    else:
        row_total = len(df)
        print(f"\nColumns with missing values: {len(missing_cols)}")
        for name, n_missing in missing_cols.items():
            print(f"- {name}: {n_missing} missing values ({n_missing/row_total*100:.2f}%)")

    # File and shape facts gathered by the loading cell
    n_rows, n_cols = df.shape
    print("\n" + divider)
    print("COMPLETE SUMMARY REPORT")
    print(divider)
    print(f"File path: {os.path.abspath(actual_path)}")
    print(f"File size: {file_size_mb:.2f} MB")
    print(f"Number of rows: {n_rows}")
    print(f"Number of columns: {n_cols}")
    print(f"Column list: {', '.join(df.columns)}")

In [None]:
# Preview the dataset.
# The frame loaded earlier in the notebook is reused when available, so the
# encoding fallback and alternative path handled there are not thrown away by a
# second default-encoding read. The file is read here only if `df` is missing.
data_path = '../data/Loan_Default.csv'

try:
    if globals().get('df') is None:
        df = pd.read_csv(data_path)
    print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")

    # Display first 10 rows
    print("\nFirst 10 rows of the dataset:")
    display(df.head(10))
except Exception as e:
    # Best effort: report the problem instead of stopping the notebook
    print(f"Error loading the dataset: {str(e)}")

In [None]:
# Build a one-row-per-column table of dtype, null count and null percentage
print("Data types and null counts:")
row_total = len(df)
buffer = [
    {
        'Column': name,
        'Data Type': df[name].dtype,
        'Null Count': n_null,
        'Null %': f"{n_null / row_total * 100:.2f}%"
    }
    for name, n_null in ((name, df[name].isnull().sum()) for name in df.columns)
]

dtypes_df = pd.DataFrame(buffer)
display(dtypes_df)

### 2.2 Target Variable Analysis

In [None]:
# Identify the target variable. For this dataset it is the 'Status' column;
# two other common names are checked first in case the file uses them.
target_candidates = ['default', 'loan_default', 'Status']
target_col = next((name for name in target_candidates if name in df.columns), None)

if target_col:
    print(f"Target variable identified: '{target_col}'")

    # Class counts (most frequent first) and the matching percentages
    target_counts = df[target_col].value_counts()
    target_percents = df[target_col].value_counts(normalize=True) * 100

    # Create a DataFrame for better display
    target_df = pd.DataFrame({
        'Count': target_counts,
        'Percentage': target_percents
    })

    print("\nClass distribution:")
    display(target_df)

    # Visualize distribution.
    # `order` pins the bars to the same (frequency) order as `target_counts`;
    # without it countplot uses category order and the text labels below
    # could end up above the wrong bar.
    plt.figure(figsize=(10, 6))
    ax = sns.countplot(x=target_col, data=df, order=target_counts.index.tolist(),
                       palette=['#66b3ff', '#ff9999'])
    plt.title(f'Distribution of {target_col}', fontsize=16)
    plt.xlabel(target_col, fontsize=12)
    plt.ylabel('Count', fontsize=12)

    # Add count and percentage labels above bars (offset = 1% of the row count)
    label_offset = len(df) * 0.01
    for i, (label, count) in enumerate(target_counts.items()):
        plt.text(i, count + label_offset,
                 f"{count}\n({target_percents.loc[label]:.1f}%)",
                 ha='center', fontsize=12)

    plt.tight_layout()
    plt.show()
else:
    print("Target variable not found in dataset.")

### 2.3 Summary Statistics for Numerical Features

In [None]:
# Identifier columns and (when known) the target are not features
exclude_cols = ['ID', 'id']
if target_col:
    exclude_cols.append(target_col)

# Numerical feature columns: int64/float64 columns minus the exclusions
numerical_cols = [
    name
    for name in df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if name not in exclude_cols
]

if not numerical_cols:
    print("No numerical columns found in the dataset.")
else:
    numeric_frame = df[numerical_cols]
    column_order = ['count', 'mean', 'median', 'std', 'min', '25%', '50%', '75%', 'max']

    # describe() per feature (one row each), plus an explicit median column
    # placed next to the mean for readability
    summary_stats = (
        numeric_frame.describe().T
        .assign(median=numeric_frame.median())
        [column_order]
    )

    print("Summary statistics for numerical features:")
    display(summary_stats)

### 2.4 Detect Suspicious Placeholder Values

In [None]:
# Sentinel numbers that are commonly used to encode "missing"
suspicious_values = [-1, -99, -999, -9999, 9999, 99999, 999999]

# Maps column name -> list of (sentinel, percentage of rows) findings
suspicious_columns = {}

for col in numerical_cols:
    column = df[col]
    for sentinel in suspicious_values:
        # Share of rows equal to the sentinel (0 when it never occurs)
        share = (column == sentinel).mean() * 100

        # Only consider it suspicious if it appears with some frequency (>0.5%)
        if share > 0.5:
            suspicious_columns.setdefault(col, []).append((sentinel, share))

# Display results
if not suspicious_columns:
    print("No suspicious placeholder values detected in the numerical columns.")
else:
    print("Suspicious placeholder values detected:")
    for name, findings in suspicious_columns.items():
        print(f"\n{name}:")
        for sentinel, share in findings:
            print(f"  - Value {sentinel} appears {share:.2f}% of the time")

### 2.5 Distribution of Key Numerical Features

In [None]:
# Names (matched case-insensitively) of the features we most want to inspect
key_feature_names = ['income', 'loan_amount', 'credit_score', 'Credit_Score', 'loan_amt', 'interest_rate']
wanted_lower = {name.lower() for name in key_feature_names}

# Numerical columns whose lower-cased name is one of the wanted names
key_features = [col for col in numerical_cols if col.lower() in wanted_lower]

if key_features or not numerical_cols:
    print(f"Found key numerical features: {key_features}")
else:
    # Nothing matched: fall back to the first few numerical columns
    key_features = numerical_cols[:3]
    print(f"No exact matches for key features found. Using first 3 numerical columns: {key_features}")

# Histogram + KDE per key feature, laid out three per row
if not key_features:
    print("No numerical features found to plot.")
else:
    n_rows = (len(key_features) + 2) // 3
    fig = plt.figure(figsize=(18, 5 * n_rows))
    for position, col in enumerate(key_features, 1):
        ax = fig.add_subplot(n_rows, 3, position)
        sns.histplot(df[col].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}', fontsize=14)
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)
    fig.tight_layout()
    plt.show()

### 2.6 Outlier Detection in Key Features

In [None]:
def _iqr_outlier_row(name, series):
    """Return the IQR fences and outlier count/percentage for one column as a dict."""
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Values strictly outside the 1.5*IQR fences (nulls never compare as outside)
    outliers_count = series[(series < lower_bound) | (series > upper_bound)].count()
    outliers_percentage = outliers_count / series.count() * 100
    return {
        'Feature': name,
        'Q1': Q1,
        'Q3': Q3,
        'IQR': IQR,
        'Lower Bound': lower_bound,
        'Upper Bound': upper_bound,
        'Outliers Count': outliers_count,
        'Outliers %': f"{outliers_percentage:.2f}%"
    }


if not key_features:
    print("No numerical features found for outlier detection.")
else:
    # One boxplot per key feature, three per row
    n_rows = (len(key_features) + 2) // 3
    fig = plt.figure(figsize=(18, 5 * n_rows))
    for position, col in enumerate(key_features, 1):
        ax = fig.add_subplot(n_rows, 3, position)
        sns.boxplot(y=df[col].dropna(), ax=ax)
        ax.set_title(f'Boxplot of {col} (Outlier Detection)', fontsize=14)
        ax.set_ylabel(col, fontsize=12)
    fig.tight_layout()
    plt.show()

    # Tabulate the IQR statistics behind the plots
    outlier_stats = [_iqr_outlier_row(col, df[col]) for col in key_features]

    print("Outlier Statistics:")
    display(pd.DataFrame(outlier_stats))

### 2.7 Correlation Analysis

In [None]:
# Correlation heatmap plus a ranked list of the strongest feature pairs
if len(numerical_cols) <= 1:
    print("Not enough numerical columns for correlation analysis.")
else:
    # Numerical features, plus the target when it is an int64/float64 column
    cols_for_corr = list(numerical_cols)
    if target_col and target_col in df.columns and df[target_col].dtype in ['int64', 'float64']:
        cols_for_corr.append(target_col)

    corr_matrix = df[cols_for_corr].corr()

    # Heatmap of the lower triangle only (upper triangle and diagonal masked)
    fig, ax = plt.subplots(figsize=(14, 10))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', fmt='.2f',
                linewidths=0.5, vmin=-1, vmax=1, ax=ax)
    ax.set_title('Correlation Matrix of Numerical Features', fontsize=16)
    fig.tight_layout()
    plt.show()

    # Every unordered pair once: (first, second, |r|, r)
    labels = corr_matrix.columns
    corr_pairs = [
        (labels[i], labels[j], abs(corr_matrix.iloc[i, j]), corr_matrix.iloc[i, j])
        for i in range(len(labels))
        for j in range(i + 1, len(labels))
    ]

    # Strongest relationships first, regardless of sign
    corr_pairs = sorted(corr_pairs, key=lambda pair: pair[2], reverse=True)

    print("Top 10 Correlated Feature Pairs:")
    for rank, (first, second, _, signed_corr) in enumerate(corr_pairs[:10], 1):
        print(f"{rank}. {first} — {second}: {signed_corr:.4f}")

### 2.8 Categorical Features Analysis

In [None]:
# Get categorical columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

if categorical_cols:
    # Number of distinct non-null values per categorical column (computed once)
    cardinality = {col: df[col].nunique() for col in categorical_cols}

    cat_cardinality = []
    for col in categorical_cols:
        unique_values = cardinality[col]
        example_values = ', '.join(df[col].dropna().unique()[:3].astype(str))
        # Only mark the list as truncated when values were actually left out
        if unique_values > 3:
            example_values += '...'
        cat_cardinality.append({
            'Column': col,
            'Cardinality': unique_values,
            'Example Values': example_values
        })

    # Display cardinality information
    cat_card_df = pd.DataFrame(cat_cardinality).sort_values('Cardinality', ascending=False)
    print("Categorical columns and their cardinality (number of unique values):")
    display(cat_card_df)

    # Plot distribution for categorical columns with low cardinality
    low_card_cols = [col for col in categorical_cols if cardinality[col] <= 10]

    if low_card_cols:
        for col in low_card_cols[:5]:  # Limit to 5 columns to avoid too many plots
            plt.figure(figsize=(10, 6))
            value_counts = df[col].value_counts().sort_values(ascending=False)

            sns.barplot(x=value_counts.index, y=value_counts.values)
            plt.title(f'Distribution of {col}', fontsize=14)
            plt.xlabel(col, fontsize=12)
            plt.ylabel('Count', fontsize=12)
            plt.xticks(rotation=45, ha='right')

            # Add count and percentage labels; percentages are relative to all
            # rows, including rows where this column is null
            total = len(df)
            for i, v in enumerate(value_counts):
                plt.text(i, v + total*0.01, f"{v} ({v/total*100:.1f}%)", ha='center')

            plt.tight_layout()
            plt.show()
else:
    print("No categorical columns found in the dataset.")

### 2.9 Save EDA Outputs

In [None]:
# Ensure results directory exists (exist_ok avoids a check-then-create race)
import os
results_dir = os.path.join('..', 'results')
os.makedirs(results_dir, exist_ok=True)

# 1. Save missing values information to CSV
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percent})
missing_csv_path = os.path.join(results_dir, 'missing_values.csv')
missing_df.to_csv(missing_csv_path)
print(f"Missing values data saved to {missing_csv_path}")

# 2. Save EDA report as HTML
eda_report_path = os.path.join(results_dir, 'eda_report.html')
try:
    # pandas-profiling was renamed to ydata-profiling; prefer the new package
    # and fall back to the legacy name for older environments.
    try:
        from ydata_profiling import ProfileReport
    except ImportError:
        from pandas_profiling import ProfileReport

    # Generate profile report
    profile = ProfileReport(df, title="Loan Default Dataset Profiling Report", minimal=True)

    # Save report to file
    profile.to_file(eda_report_path)
    print(f"EDA report saved to {eda_report_path}")

except ImportError:
    print("ydata-profiling (formerly pandas-profiling) not installed. Installing it would allow generation of comprehensive EDA reports.")
    print("Install with: pip install ydata-profiling")

    # Alternative: save a basic HTML summary built from pandas' own to_html()
    html_content = f"""
    <html>
    <head>
        <meta charset="utf-8">
        <title>Loan Default EDA Summary</title>
        <style>
            body {{ font-family: Arial, sans-serif; margin: 20px; }}
            table {{ border-collapse: collapse; width: 100%; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            tr:nth-child(even) {{ background-color: #f9f9f9; }}
        </style>
    </head>
    <body>
        <h1>Loan Default Dataset EDA Summary</h1>
        <p>Dataset shape: {df.shape[0]} rows and {df.shape[1]} columns</p>
        
        <h2>Data Types</h2>
        {df.dtypes.value_counts().to_frame().to_html()}
        
        <h2>Missing Values Summary</h2>
        {missing_df[missing_df['Missing Values'] > 0].sort_values('Missing Values', ascending=False).to_html()}
        
        <h2>Numerical Features Summary</h2>
        {df.describe().to_html()}
    </body>
    </html>
    """

    # Write as UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) can fail on, or mis-encode, non-ASCII column names and values.
    with open(eda_report_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"Simple EDA report saved to {eda_report_path}")

## EDA Summary (Sections 1–2)

This exploratory data analysis has:

1. Loaded and verified the Loan Default dataset
2. Analyzed the target variable distribution
3. Calculated summary statistics for numerical features
4. Detected suspicious placeholder values
5. Visualized distributions of key numerical features
6. Identified outliers in numerical features
7. Analyzed correlations between features
8. Examined categorical features and their cardinality
9. Saved EDA outputs to files

Key insights:
- The dataset has 148,670 rows and 34 columns
- 14 columns contain missing values (ranging from 0.03% to 26.66%)
- The target variable is "Status" with binary values (0/1)
- Several numerical features show significant outliers
- The dataset contains both numerical and categorical features

Next steps:
- Apply appropriate preprocessing based on these findings
- Handle missing values and outliers
- Encode categorical features
- Select features based on correlation analysis
- Apply feature engineering techniques
- Prepare data for model training

In [None]:
# Schema overview: prints each column's dtype and non-null count
df.info()

In [None]:
# Summary statistics from describe(), transposed so that each
# described column becomes one row
df.describe().T

## 3. Check for Missing Values

In [None]:
# Null count and null percentage per column
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100

# Keep only the columns that have nulls, worst first
missing_df = (
    pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percent})
    .loc[lambda frame: frame['Missing Values'] > 0]
    .sort_values('Missing Values', ascending=False)
)

print("Missing Values:")
if missing_df.empty:
    print("No missing values found in the dataset.")
else:
    print(missing_df)

In [None]:
# Bar chart of the missing-value percentage per affected column (skipped when none)
if len(missing_df) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(missing_df.index, missing_df['Percentage'], color='crimson')
    ax.set_title('Percentage of Missing Values by Column', fontsize=16)
    ax.set_xlabel('Columns', fontsize=12)
    ax.set_ylabel('Percentage', fontsize=12)
    # Slanted, right-aligned tick labels keep long column names readable
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    plt.show()

## 4. Target Variable Analysis

In [None]:
# Locate the target column. The template name 'loan_default' is tried first,
# then the alternatives 'default' and 'Status' (the name used by this dataset),
# so the cell no longer silently does nothing when 'loan_default' is absent.
target_col = next(
    (name for name in ('loan_default', 'default', 'Status') if name in df.columns),
    None,
)

if target_col is not None:
    target_counts = df[target_col].value_counts()
    target_percents = df[target_col].value_counts(normalize=True) * 100

    print(f"Target Variable: {target_col}")
    print("\nCounts:")
    print(target_counts)
    print("\nPercentages:")
    print(target_percents)

    # Visualize target distribution. `order` keeps the bars in the same
    # (frequency) order as `target_counts`, so each label sits on its own bar.
    plt.figure(figsize=(10, 6))
    sns.countplot(x=target_col, data=df, order=target_counts.index.tolist(),
                  palette=['#66b3ff', '#ff9999'])
    plt.title(f'Distribution of {target_col}', fontsize=16)
    plt.xlabel(target_col, fontsize=12)
    plt.ylabel('Count', fontsize=12)

    # Add percentages above bars; the offset scales with the data (1% of the
    # row count) instead of a fixed number of rows
    label_offset = len(df) * 0.01
    for i, (label, count) in enumerate(target_counts.items()):
        plt.text(i, count + label_offset, f'{target_percents.loc[label]:.1f}%',
                 ha='center', fontsize=12)

    plt.tight_layout()
    plt.show()
else:
    print("Target variable not found in dataset (looked for 'loan_default', 'default', 'Status').")

## 5. Univariate Analysis of Key Features

### 5.1 Numerical Features

In [None]:
# Get numerical columns
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

if 'loan_default' in numerical_cols:
    numerical_cols.remove('loan_default')  # Remove target if it's numerical

# Plot histograms for ALL numerical columns, 12 per figure (4 x 3 grid).
# Paging in a loop replaces two copy-pasted blocks that silently dropped
# every column after the 24th.
plots_per_figure = 12
if numerical_cols:
    for start in range(0, len(numerical_cols), plots_per_figure):
        page = numerical_cols[start:start + plots_per_figure]
        plt.figure(figsize=(20, 15))
        for i, col in enumerate(page, 1):
            plt.subplot(4, 3, i)
            sns.histplot(df[col], kde=True)
            plt.title(f'Distribution of {col}', fontsize=12)
            plt.xlabel(col)
            plt.ylabel('Frequency')
        plt.tight_layout()
        plt.show()
else:
    print("No numerical columns found in the dataset.")

In [None]:
# Create box plots for ALL numerical variables to detect outliers,
# 12 per figure (4 x 3 grid). Paging in a loop replaces two copy-pasted
# blocks that silently dropped every column after the 24th.
plots_per_figure = 12
if numerical_cols:
    for start in range(0, len(numerical_cols), plots_per_figure):
        page = numerical_cols[start:start + plots_per_figure]
        plt.figure(figsize=(20, 15))
        for i, col in enumerate(page, 1):
            plt.subplot(4, 3, i)
            sns.boxplot(y=df[col])
            plt.title(f'Boxplot of {col}', fontsize=12)
        plt.tight_layout()
        plt.show()

### 5.2 Categorical Features

In [None]:
# Columns stored as strings/objects or pandas categoricals
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

# One bar chart (plus a printed percentage table) per categorical column
if not categorical_cols:
    print("No categorical columns found in the dataset.")
else:
    for col in categorical_cols:
        value_counts = df[col].value_counts().sort_values(ascending=False)

        # Cap the chart at the 20 most frequent categories
        too_many = len(value_counts) > 20
        if too_many:
            value_counts = value_counts.head(20)
        chart_title = f'Top 20 Categories in {col}' if too_many else f'Distribution of {col}'

        plt.figure(figsize=(12, 6))
        plt.title(chart_title, fontsize=16)
        sns.barplot(x=value_counts.index, y=value_counts.values)
        plt.xlabel(col, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45, ha='right')

        # Write each bar's count just above it
        for position, count in enumerate(value_counts.values):
            plt.text(position, count + 5, str(count), ha='center')

        plt.tight_layout()
        plt.show()

        # Full (untruncated) percentage breakdown as text
        print(f"\nPercentage distribution of {col}:")
        print(df[col].value_counts(normalize=True).sort_values(ascending=False) * 100)
        print("-" * 50)

## 6. Bivariate Analysis (Relationship with Target)

### 6.1 Numerical Features vs. Target

In [None]:
# Locate the target column: the template name 'loan_default' first, then
# 'default' and 'Status' (the name used by this dataset). Hard-coding only
# 'loan_default' made this whole section a no-op on the Loan_Default data.
target = next(
    (name for name in ('loan_default', 'default', 'Status') if name in df.columns),
    None,
)

if target is not None:
    # Features to compare against the target: skip the target itself and the
    # row identifier, then keep the first 6 for brevity
    features_to_plot = [
        col for col in numerical_cols
        if col != target and col.lower() != 'id'
    ][:6]

    for col in features_to_plot:
        plt.figure(figsize=(12, 6))

        # Create violin plots
        sns.violinplot(x=target, y=col, data=df, palette=['#66b3ff', '#ff9999'])
        plt.title(f'Distribution of {col} by {target}', fontsize=16)
        plt.xlabel(target, fontsize=12)
        plt.ylabel(col, fontsize=12)
        plt.tight_layout()
        plt.show()

        # Print group statistics
        print(f"\nStatistics of {col} grouped by {target}:")
        print(df.groupby(target)[col].describe())
        print("-" * 50)
else:
    print("Target variable not found in dataset (looked for 'loan_default', 'default', 'Status').")

### 6.2 Categorical Features vs. Target

In [None]:
# Categorical features vs target: stacked percentage bars plus a chi-square
# test of independence for every categorical column.
# The target is looked up by name: the template name 'loan_default' first, then
# 'default' and 'Status' (the name used by this dataset).
target = next(
    (name for name in ('loan_default', 'default', 'Status') if name in df.columns),
    None,
)

if target is not None and categorical_cols:
    for col in categorical_cols:
        if col == target:
            continue  # cross-tabulating the target with itself is meaningless

        # Create a cross-tabulation of the categorical variable and target
        crosstab = pd.crosstab(df[col], df[target])

        if crosstab.empty:
            # No row has both values present; nothing to plot or test
            print(f"\nSkipping {col}: no rows with both {col} and {target} present.")
            print("-" * 50)
            continue

        # Calculate percentages
        crosstab_percent = pd.crosstab(df[col], df[target], normalize='index') * 100

        # Plot stacked bar chart on an explicit Axes. Calling plt.figure() and
        # then DataFrame.plot(figsize=...) would open a second figure and
        # leave an empty one behind for every column.
        fig, ax = plt.subplots(figsize=(12, 6))
        crosstab_percent.plot(kind='bar', stacked=True,
                              color=['#66b3ff', '#ff9999'],
                              ax=ax)

        ax.set_title(f'Percentage of {target} by {col}', fontsize=16)
        ax.set_xlabel(col, fontsize=12)
        ax.set_ylabel('Percentage', fontsize=12)
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        ax.legend(title=target)
        fig.tight_layout()
        plt.show()

        # Print crosstab and chi-square test results
        print(f"\nCross-tabulation of {col} vs {target}:")
        print(crosstab)
        print("\nPercentage within each category:")
        print(crosstab_percent)

        # Perform chi-square test to check if there's a significant relationship
        chi2, p, dof, expected = chi2_contingency(crosstab)
        print(f"\nChi-square test: chi2 = {chi2:.4f}, p-value = {p:.4f}")
        if p < 0.05:
            print(f"There is a significant relationship between {col} and {target} (p < 0.05)")
        else:
            print(f"No significant relationship between {col} and {target} (p >= 0.05)")

        print("-" * 50)
else:
    print("Skipping categorical-vs-target analysis: target or categorical columns not found.")

## 7. Correlation Analysis

In [None]:
# Correlation matrix of the numerical features, plus a ranking of the features
# by their correlation with the target.
# The target is looked up by name: the template name 'loan_default' first, then
# 'default' and 'Status' (the name used by this dataset).
target = next(
    (name for name in ('loan_default', 'default', 'Status') if name in df.columns),
    None,
)

if numerical_cols:
    # The target can only take part in a correlation if it is numeric;
    # a string/categorical target would make .corr() fail or be dropped.
    target_is_numeric = (
        target is not None
        and target in df.select_dtypes(include='number').columns
    )

    # Add the target once (it may already be part of numerical_cols)
    corr_cols = list(numerical_cols)
    if target_is_numeric and target not in corr_cols:
        corr_cols.append(target)

    corr_matrix = df[corr_cols].corr()

    # Plot the correlation matrix (lower triangle only)
    plt.figure(figsize=(16, 12))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', fmt='.2f',
                linewidths=0.5, vmin=-1, vmax=1)
    plt.title('Correlation Matrix of Numerical Features', fontsize=16)
    plt.tight_layout()
    plt.show()

    # If a numeric target exists, show features most correlated with it
    if target_is_numeric:
        target_corr = corr_matrix[target].drop(target).sort_values(ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(x=target_corr.values, y=target_corr.index, palette='viridis')
        plt.title(f'Features Correlation with Target ({target})', fontsize=16)
        plt.xlabel('Correlation Coefficient', fontsize=12)
        plt.ylabel('Features', fontsize=12)
        plt.axvline(x=0, color='r', linestyle='-', alpha=0.3)
        plt.tight_layout()
        plt.show()

        # Print top correlated features
        print(f"Top features positively correlated with {target}:")
        print(target_corr.head(10))

        print(f"\nTop features negatively correlated with {target}:")
        print(target_corr.tail(10).sort_values())
else:
    print("No numerical columns found for correlation analysis.")

## 8. Feature Engineering Ideas

In [None]:
# Feature-engineering examples suggested by the EDA.
# NOTE: every step below RUNS whenever its input columns are present and adds
# the new column to `df` in place; when the inputs are missing it only prints
# a message. Adjust the column names to match the dataset before relying on it.

# 1. Create a debt-to-income ratio feature if not already present
if 'debt_to_income' not in df.columns and all(col in df.columns for col in ['total_debt', 'annual_income']):
    print("Creating debt-to-income ratio feature...")
    # Treat zero income as missing so the ratio is NaN rather than +/-inf
    df['debt_to_income'] = df['total_debt'] / df['annual_income'].replace(0, np.nan)
    print("Feature created.")
else:
    print("Either debt_to_income already exists or required columns are not present.")

# 2. Create loan amount to income ratio if applicable columns exist
if all(col in df.columns for col in ['loan_amount', 'annual_income']):
    print("\nCreating loan-to-income ratio feature...")
    # Same zero-income guard as above
    df['loan_to_income'] = df['loan_amount'] / df['annual_income'].replace(0, np.nan)
    print("Feature created.")
else:
    print("\nCannot create loan-to-income ratio: required columns not found.")

# 3. Binning numerical features like credit score, income, etc.
if 'credit_score' in df.columns:
    print("\nCreating credit score bins...")
    # right=False makes the bins left-closed: [0, 580), [580, 670), [670, 740),
    # [740, 800), [800, inf). A score sitting exactly on an edge (e.g. 580)
    # therefore falls in the upper tier, as the usual score bands define it.
    df['credit_score_bin'] = pd.cut(df['credit_score'],
                                   bins=[0, 580, 670, 740, 800, float('inf')],
                                   labels=['Very Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
                                   right=False)
    print("Feature created.")
    print(df['credit_score_bin'].value_counts())
else:
    print("\nCannot create credit score bins: credit_score column not found.")

## 9. Key Findings and Next Steps

### Key Findings

Based on our exploratory data analysis, here are the key findings:

1. **Data Overview**: 
   - The dataset contains information about loans and their default status.
   - [Describe actual shape and features once examined]

2. **Target Variable**:
   - The target variable is `Status` (0 = no default, 1 = default), as identified in section 2.2.
   - [Note the class distribution once examined - is it imbalanced?]

3. **Important Features**:
   - [List key features that showed strong correlation with the target]
   - [Note any patterns observed in the analysis]

4. **Data Quality**:
   - [Summarize findings about missing values]
   - [Summarize findings about outliers]

5. **Relationships**:
   - [Summarize key relationships between features and target]
   - [Note any interesting patterns in categorical variables]

### Next Steps

1. **Data Preprocessing**:
   - Handle missing values through imputation or removal
   - Handle outliers through capping, removal, or transformation
   - Convert categorical variables to numerical through encoding techniques

2. **Feature Engineering**:
   - Create the suggested engineered features
   - Consider additional interaction features between correlated variables

3. **Model Development**:
   - Implement class balancing techniques if target is imbalanced
   - Split data into training and testing sets
   - Train various models (Logistic Regression, Random Forest, XGBoost, etc.)
   - Tune hyperparameters for optimal performance

4. **Model Evaluation**:
   - Evaluate models using appropriate metrics (AUC-ROC, F1-Score, Precision, Recall)
   - Perform cross-validation to ensure model robustness

5. **Model Interpretation**:
   - Use feature importance and SHAP values to explain model predictions
   - Create visualizations for model interpretation