# Final Submission: Loan Dataset Analysis and Modeling

In [None]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

## Step 1: Load and Inspect Data

### Description:
We first load the dataset and check its structure to identify missing values, categorical columns, and initial patterns.


In [None]:
# Load dataset
file_path = 'path_to_training_dataset.csv'  # Replace with your file path
train_data = pd.read_csv(file_path)

# Inspect dataset
print("Dataset Overview:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
train_data.info()
print(train_data.head())

## Step 2: Data Cleaning

### Description:
Remove unnecessary columns, handle missing values, and clean categorical data.

#### Actions:
- Drop irrelevant columns.
- Remove rows in which any categorical column has the value 'Unknown'.
- Impute missing numeric values with their median.


In [None]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw loan DataFrame.

    Steps, in order:
      1. Drop identifier / free-text / unused columns when they are present.
      2. Convert ``emp_length`` (e.g. ``"< 1 year"``, ``"3 years"``,
         ``"10+ years"``) to a number of years; unparseable values become NaN.
      3. Fill missing values in every numeric column with that column's median.
      4. Remove rows where any object/category column equals ``'Unknown'``.

    Args:
        df: Raw input frame. It is not modified.

    Returns:
        A new DataFrame; surviving rows keep their original index labels.
    """
    # Drop unnecessary columns (missing ones are silently skipped).
    columns_to_drop = ['id', 'member_id', 'desc', 'mths_since_last_major_derog', 'application_approved_flag']
    df = df.drop(columns=columns_to_drop, errors='ignore')

    def clean_emp_length(value):
        """Map one raw ``emp_length`` value to years, or NaN if unknown."""
        if pd.isnull(value):
            return np.nan
        if not isinstance(value, str):
            # The column may already hold numbers; substring tests such as
            # '"<" in value' would raise TypeError on them.
            try:
                return int(value)
            except (TypeError, ValueError, OverflowError):
                return np.nan
        if value == "n/a":
            return np.nan
        if "<" in value:
            return 0
        if "10+" in value:
            return 10
        try:
            return int(value.split()[0])
        except (ValueError, IndexError):
            # Empty string or a non-numeric leading token.
            return np.nan

    if 'emp_length' in df.columns:
        df['emp_length'] = df['emp_length'].apply(clean_emp_length)

    # Impute numeric columns of any width (int32, float32, ...) with their median.
    numeric_columns = df.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

    # Drop rows with 'Unknown' in any categorical column, using one combined mask
    # instead of re-filtering the frame once per column.
    categorical_columns = df.select_dtypes(include=['category', 'object']).columns
    keep = pd.Series(True, index=df.index)
    for col in categorical_columns:
        keep &= (df[col] != 'Unknown')

    # Explicit copy so later column assignments on the result do not trigger
    # chained-assignment warnings.
    return df.loc[keep].copy()

# Run the cleaning pipeline on the raw training frame
train_data_cleaned = preprocess_data(df=train_data)

## Step 3: Exploratory Data Analysis (EDA)

### Description:
Analyze data distributions and relationships. This helps identify trends, correlations, and potential predictors.


In [None]:
# Plot distributions for numeric features
numeric_columns = train_data_cleaned.select_dtypes(include=['float64', 'int64']).columns

# Size the subplot grid from the number of numeric columns: a fixed 4x4 grid
# raises ValueError as soon as there are more than 16 of them.
grid_cols = 4
grid_rows = max(1, int(np.ceil(len(numeric_columns) / grid_cols)))
plt.figure(figsize=(16, 3 * grid_rows))  # 3 inches per row, as in a 16x12 4-row figure
for i, col in enumerate(numeric_columns, 1):
    plt.subplot(grid_rows, grid_cols, i)
    sns.histplot(train_data_cleaned[col], kde=True, bins=30)
    plt.title(f"Distribution of {col}")
plt.tight_layout()
plt.show()

# Correlation heatmap
plt.figure(figsize=(12, 7))
# Restrict to the numeric columns: the cleaned frame can still contain
# object/category columns, and DataFrame.corr() raises on those in pandas >= 2.0.
correlation_matrix = train_data_cleaned[numeric_columns].corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

## Step 4: Feature Engineering

### Description:
Encode categorical features, handle multicollinearity, and address class imbalance.

#### Actions:
- Use `OneHotEncoder` for categorical data.
- Drop redundant features.
- Apply SMOTE to balance the dataset.


In [None]:
# Separate features and target.
# Both get a fresh RangeIndex: the cleaned frame keeps the labels of the rows
# that survived filtering, and the encoded frame built below is positional, so
# leaving y on the old labels would misalign it with X in any label-based
# pandas operation (concat, join, boolean masks).
X = train_data_cleaned.drop(columns=['bad_flag']).reset_index(drop=True)
y = train_data_cleaned['bad_flag'].reset_index(drop=True)

# Encode categorical features (first level of each feature is dropped)
categorical_columns = X.select_dtypes(include=['category', 'object']).columns
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categorical = pd.DataFrame(
    encoder.fit_transform(X[categorical_columns]),
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,
)

# Drop original categorical columns and merge with encoded data
X_numeric = X.drop(columns=categorical_columns)
X_encoded = pd.concat([X_numeric, encoded_categorical], axis=1)

# Handle multicollinearity by dropping highly correlated features
X_encoded = X_encoded.drop(columns=['tot_hi_cred_lim'], errors='ignore')

# Apply SMOTE
# NOTE: this oversamples the *entire* dataset. SMOTE's synthetic rows are
# interpolated from real ones, so model evaluation must only use rows that
# were held out before any resampling took place.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_encoded, y)

# Check class distribution
print("Class Distribution After SMOTE:")
print(y_balanced.value_counts())

## Step 5: Train-Test Split

### Description:
Split the dataset into training (80%) and validation (20%) sets for model development.


In [None]:
# Split the data
# Split the original (not oversampled) encoded data, stratified on the target so
# both parts keep the real class ratio. Splitting an already SMOTE-balanced
# dataset would put synthetic points, interpolated from training rows, into the
# validation set and inflate validation scores.
X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the training portion only; the validation set stays untouched.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Check dataset shapes
print(f"Training Data Shape: {X_train.shape}, Validation Data Shape: {X_val.shape}")