## Load Data and Basic Info

In [204]:
import pandas as pd

def load_data(file_path):
    """Read a CSV file into a DataFrame and print a short load report.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    frame = pd.read_csv(file_path)

    # Report what was loaded so the reader can sanity-check the file.
    report_lines = (
        "✅ Data loaded successfully!",
        f"Shape: {frame.shape}",
        f"Columns: {list(frame.columns)}",
    )
    for line in report_lines:
        print(line)

    return frame

def basic_info(df):
    """Print a plain-text overview of a DataFrame.

    The report contains the row and column counts, the deep memory footprint
    in MB, how many columns there are of each dtype, and the first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    n_rows, n_cols = df.shape
    # deep=True also counts the contents of object (string) columns.
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    print("\n===== BASIC DATASET INFORMATION =====")
    print(f"Rows: {n_rows:,}")
    print(f"Columns: {n_cols}")
    print(f"Memory Usage: {memory_mb:.2f} MB")

    sections = (
        ("\nColumn Data Types:", df.dtypes.value_counts()),
        ("\nSample Data Preview:", df.head()),
    )
    for heading, payload in sections:
        print(heading)
        print(payload)

# Load the loan-application CSV and print its summary.
# `data_path` is a relative path, so it is resolved against the notebook's
# working directory; `df` is the frame the later cells operate on.
data_path = "data/filtered_loan_application.csv"  # update if necessary
df = load_data(data_path)
basic_info(df)


✅ Data loaded successfully!
Shape: (48974, 21)
Columns: ['application_id', 'customer_id', 'application_date', 'loan_type', 'loan_tenure_months', 'interest_rate_offered', 'purpose_of_loan', 'employment_status', 'monthly_income', 'cibil_score', 'existing_emis_monthly', 'debt_to_income_ratio', 'property_ownership_status', 'residential_address', 'applicant_age', 'gender', 'number_of_dependents', 'loan_amount_requested', 'fraud_flag', 'fraud_type', 'loan_status']

===== BASIC DATASET INFORMATION =====
Rows: 48,974
Columns: 21
Memory Usage: 39.03 MB

Column Data Types:
object     10
int64       8
float64     3
Name: count, dtype: int64

Sample Data Preview:
                         application_id customer_id application_date  \
0  c8bf0bea-70e6-4870-9125-41b8210c527f  CUST109427       09/04/2023   
1  91224cec-3544-4bc7-ac15-a9792da54c02  CUST106146       23/09/2023   
2  4efcd02d-4a03-4ab7-9bd1-0ff430493d0c  CUST100674       22/05/2023   
3  a61337d4-ba04-4a68-b492-2cb8266e6ed7  CUST106466 

In [205]:
# Rich display of the first rows (the printed preview above is plain text).
df.head()

Unnamed: 0,application_id,customer_id,application_date,loan_type,loan_tenure_months,interest_rate_offered,purpose_of_loan,employment_status,monthly_income,cibil_score,...,debt_to_income_ratio,property_ownership_status,residential_address,applicant_age,gender,number_of_dependents,loan_amount_requested,fraud_flag,fraud_type,loan_status
0,c8bf0bea-70e6-4870-9125-41b8210c527f,CUST109427,09/04/2023,Business Loan,12,11.66,Medical Emergency,Retired,34700,714,...,0.0317,Rented,"94/31, Sehgal Zila, Vadodara-380521, Anantapur...",28,Female,3,604000,0,,Approved
1,91224cec-3544-4bc7-ac15-a9792da54c02,CUST106146,23/09/2023,Car Loan,240,13.62,Education,Unemployed,51600,667,...,0.0,Owned,"H.No. 00, Sheth Chowk, Ichalkaranji 006728, Im...",44,Other,3,100000,0,,Approved
2,4efcd02d-4a03-4ab7-9bd1-0ff430493d0c,CUST100674,22/05/2023,Education Loan,60,11.4,Medical Emergency,Self-Employed,14800,808,...,0.3108,Rented,"H.No. 81, Dutta Path, Kozhikode-340301, Tadepa...",56,Other,4,431000,0,,Approved
3,a61337d4-ba04-4a68-b492-2cb8266e6ed7,CUST106466,09/07/2024,Car Loan,120,10.36,Debt Consolidation,Self-Employed,28800,647,...,0.1389,Rented,"H.No. 022, Rege Road, Tiruvottiyur-927857, Aur...",27,Other,4,324000,0,,Declined
4,a8d1639e-170b-41b2-826a-55c7dae38d16,CUST112319,20/11/2023,Personal Loan,36,14.14,Business Expansion,Salaried,43900,624,...,0.0251,Rented,"85/24, Bali Zila, Sambalpur 922071, Tumkur, Ke...",50,Other,0,100000,0,,Declined


In [206]:
# Class balance of the target column before any preprocessing.
df["loan_status"].value_counts()

loan_status
Approved    40882
Declined     8092
Name: count, dtype: int64

## Handling Outliers

In [207]:
# Winsorize (cap) each numeric column at its 5th and 95th percentiles.
# (The earlier comment said 0.5th/99.5th, but the quantiles actually used are
# 0.05 and 0.95; the bounds are named here so the intent is explicit.)
LOWER_QUANTILE = 0.05
UPPER_QUANTILE = 0.95

numerical_cols = [
    'loan_tenure_months','monthly_income','interest_rate_offered',
    'cibil_score', 'existing_emis_monthly', 'applicant_age',
    'number_of_dependents', 'loan_amount_requested'
]

# NOTE(review): the caps are computed on the full dataset, i.e. before the
# train/test split further down, so test rows influence the bounds — confirm
# this is acceptable for the intended evaluation.
for col in numerical_cols:
    lower = df[col].quantile(LOWER_QUANTILE)
    upper = df[col].quantile(UPPER_QUANTILE)
    # Values outside [lower, upper] are replaced by the nearest bound; `df` is
    # updated in place because the following cells read from it.
    df[col] = df[col].clip(lower=lower, upper=upper)
    print(f"{col}: capped at [{lower:.0f}, {upper:.0f}]")


loan_tenure_months: capped at [12, 360]
monthly_income: capped at [10000, 91100]
interest_rate_offered: capped at [7, 14]
cibil_score: capped at [617, 781]
existing_emis_monthly: capped at [0, 6300]
applicant_age: capped at [23, 63]
number_of_dependents: capped at [0, 4]
loan_amount_requested: capped at [100000, 995000]


In [208]:
# Inspect the available column names.
df.columns

Index(['application_id', 'customer_id', 'application_date', 'loan_type',
       'loan_tenure_months', 'interest_rate_offered', 'purpose_of_loan',
       'employment_status', 'monthly_income', 'cibil_score',
       'existing_emis_monthly', 'debt_to_income_ratio',
       'property_ownership_status', 'residential_address', 'applicant_age',
       'gender', 'number_of_dependents', 'loan_amount_requested', 'fraud_flag',
       'fraud_type', 'loan_status'],
      dtype='object')

## Drop Unnecessary Columns

In [209]:
# Columns left out of the modelling frame: identifiers, the date and address
# fields, the fraud labels, and other attributes not used as features.
cols_to_drop = [
    'application_id',
    'customer_id',
    'application_date',
    'residential_address',
    'fraud_flag',
    'fraud_type',
    'purpose_of_loan',
    'interest_rate_offered',
    'loan_tenure_months',
    'gender',
    'number_of_dependents',
]

# errors='ignore' lets the cell run even when a listed column is absent.
df_clean = df.drop(cols_to_drop, axis=1, errors='ignore')
print("Columns after cleanup:", df_clean.columns.tolist())


Columns after cleanup: ['loan_type', 'employment_status', 'monthly_income', 'cibil_score', 'existing_emis_monthly', 'debt_to_income_ratio', 'property_ownership_status', 'applicant_age', 'loan_amount_requested', 'loan_status']


In [210]:
# Display the cleaned frame (the rendered output is truncated for long frames).
df_clean

Unnamed: 0,loan_type,employment_status,monthly_income,cibil_score,existing_emis_monthly,debt_to_income_ratio,property_ownership_status,applicant_age,loan_amount_requested,loan_status
0,Business Loan,Retired,34700,714,1100,0.0317,Rented,28,604000,Approved
1,Car Loan,Unemployed,51600,667,0,0.0000,Owned,44,100000,Approved
2,Education Loan,Self-Employed,14800,781,4600,0.3108,Rented,56,431000,Approved
3,Car Loan,Self-Employed,28800,647,4000,0.1389,Rented,27,324000,Declined
4,Personal Loan,Salaried,43900,624,1100,0.0251,Rented,50,100000,Declined
...,...,...,...,...,...,...,...,...,...,...
48969,Education Loan,Student,49200,736,2400,0.0488,Rented,55,381000,Approved
48970,Education Loan,Student,28600,738,4000,0.1399,Jointly Owned,61,736000,Approved
48971,Car Loan,Student,42000,781,600,0.0143,Jointly Owned,48,436000,Approved
48972,Business Loan,Business Owner,46700,759,6300,0.1734,Rented,62,827000,Approved


## Encode Categorical Features

In [211]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

def encode_categorical_fixed(df, categorical_cols, categories_dict):
    """Label-encode categorical columns against fixed, declared category lists.

    Each encoder is fitted on the categories listed in ``categories_dict``
    rather than on the data, so the integer codes do not depend on which
    values happen to be present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is copied; the caller's frame is not modified.
    categorical_cols : list of str
        Names of the columns to encode.
    categories_dict : dict
        Maps each column name in ``categorical_cols`` to the list of
        categories that column may contain.

    Returns
    -------
    df_encoded : pandas.DataFrame
        Copy of ``df`` with the listed columns replaced by integer codes.
    encoders : dict
        The fitted ``LabelEncoder`` for each encoded column, keyed by column.

    Raises
    ------
    ValueError
        If a column contains a value that is not in its declared categories.
    """
    # Copy the frame that was passed in. The previous version copied the
    # global `df_clean` here and silently ignored the `df` argument.
    df_encoded = df.copy()
    encoders = {}

    for col in categorical_cols:
        le = LabelEncoder()
        fixed_classes = np.array(categories_dict[col])
        le.fit(fixed_classes)

        # Fail loudly on unknown categories (including missing values)
        # instead of letting `transform` raise a less specific error.
        unique_vals = df_encoded[col].unique()
        unseen_vals = set(unique_vals) - set(fixed_classes)
        if unseen_vals:
            raise ValueError(f"Unexpected categories in column '{col}': {unseen_vals}")

        df_encoded[col] = le.transform(df_encoded[col])
        encoders[col] = le
        print(f"Encoded {col}: {list(le.classes_)}")

    return df_encoded, encoders

# Columns to encode and the categories each one is allowed to contain.
categorical_cols = ['loan_type', 'employment_status', 'property_ownership_status']

known_categories = {
    'loan_type': ['Business Loan', 'Car Loan', 'Education Loan', 'Home Loan', 'Personal Loan'],
    'employment_status': ['Business Owner', 'Retired', 'Salaried', 'Self-Employed', 'Student', 'Unemployed'],
    'property_ownership_status': ['Jointly Owned', 'Owned', 'Rented']
}

# Encode the cleaned frame (unused columns already dropped), not the raw `df`.
df_encoded, encoders = encode_categorical_fixed(df_clean, categorical_cols, known_categories)



Encoded loan_type: [np.str_('Business Loan'), np.str_('Car Loan'), np.str_('Education Loan'), np.str_('Home Loan'), np.str_('Personal Loan')]
Encoded employment_status: [np.str_('Business Owner'), np.str_('Retired'), np.str_('Salaried'), np.str_('Self-Employed'), np.str_('Student'), np.str_('Unemployed')]
Encoded property_ownership_status: [np.str_('Jointly Owned'), np.str_('Owned'), np.str_('Rented')]


In [212]:
# Check the integer codes now stored in the categorical columns.
df_encoded.head()

Unnamed: 0,loan_type,employment_status,monthly_income,cibil_score,existing_emis_monthly,debt_to_income_ratio,property_ownership_status,applicant_age,loan_amount_requested,loan_status
0,0,1,34700,714,1100,0.0317,2,28,604000,Approved
1,1,5,51600,667,0,0.0,1,44,100000,Approved
2,2,3,14800,781,4600,0.3108,2,56,431000,Approved
3,1,3,28800,647,4000,0.1389,2,27,324000,Declined
4,4,2,43900,624,1100,0.0251,2,50,100000,Declined


## Encode Target Variable

In [213]:
# Encode the target with an explicit mapping (Approved=1, Declined=0).
target_mapping = {'Approved': 1, 'Declined': 0}
status_raw = df_encoded['loan_status']

# Only map when the column is not already 0/1. A plain `.map` on an
# already-encoded column would turn every value into NaN, so re-running this
# cell used to destroy the target silently.
if not status_raw.isin(list(target_mapping.values())).all():
    # `.map` yields NaN for labels missing from the mapping; reject them
    # up front (this also catches missing values in the column).
    unexpected_labels = set(status_raw.unique()) - set(target_mapping)
    if unexpected_labels:
        raise ValueError(f"Unexpected loan_status values: {unexpected_labels}")
    df_encoded['loan_status'] = status_raw.map(target_mapping)


In [214]:
# Confirm that loan_status now holds the numeric target.
df_encoded.head()

Unnamed: 0,loan_type,employment_status,monthly_income,cibil_score,existing_emis_monthly,debt_to_income_ratio,property_ownership_status,applicant_age,loan_amount_requested,loan_status
0,0,1,34700,714,1100,0.0317,2,28,604000,1
1,1,5,51600,667,0,0.0,1,44,100000,1
2,2,3,14800,781,4600,0.3108,2,56,431000,1
3,1,3,28800,647,4000,0.1389,2,27,324000,0
4,4,2,43900,624,1100,0.0251,2,50,100000,0


## Prepare Feature and Target Matrices

In [215]:

# Predictor columns, listed explicitly so the model inputs and their order
# are fixed regardless of what else is in df_encoded.
features = [
    'loan_type',
    'employment_status',
    'monthly_income',
    'cibil_score',
    'existing_emis_monthly',
    'debt_to_income_ratio',
    'property_ownership_status',
    'applicant_age',
    'loan_amount_requested',
]

# Target vector and feature matrix.
y = df_encoded['loan_status']
X = df_encoded[features]

print("Features used:", features)
print("X shape:", X.shape)
print("y class balance:\n", y.value_counts())

Features used: ['loan_type', 'employment_status', 'monthly_income', 'cibil_score', 'existing_emis_monthly', 'debt_to_income_ratio', 'property_ownership_status', 'applicant_age', 'loan_amount_requested']
X shape: (48974, 9)
y class balance:
 loan_status
1    40882
0     8092
Name: count, dtype: int64


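## Scale Numerical Features

_TODO: no scaling step is implemented in the cells below, so the saved features are unscaled. Either add a scaler fitted on the training split only, or remove this heading._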

## Train/Test Split and Data Preparation for Model Training


In [216]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the
# Approved/Declined ratio the same in both parts; the fixed seed makes the
# split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
for heading, target in (
    ("Training class distribution:\n", y_train),
    ("Test class distribution:\n", y_test),
):
    print(heading, target.value_counts())


Train shape: (39179, 9) Test shape: (9795, 9)
Training class distribution:
 loan_status
1    32705
0     6474
Name: count, dtype: int64
Test class distribution:
 loan_status
1    8177
0    1618
Name: count, dtype: int64


## Save Processed Data

In [217]:
import os

# Make sure the output folder exists; exist_ok avoids an error on re-runs.
os.makedirs("data/processed", exist_ok=True)

# Each split is written to its own CSV without the index column.
processed_outputs = (
    (X_train, "data/processed/X_train.csv"),
    (X_test, "data/processed/X_test.csv"),
    (y_train, "data/processed/y_train.csv"),
    (y_test, "data/processed/y_test.csv"),
)
for dataset, csv_path in processed_outputs:
    dataset.to_csv(csv_path, index=False)

print("✅ Processed data files saved.")


✅ Processed data files saved.
