In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib

import os

## Load Dataset

In [2]:
# Path to the raw credit-risk CSV, given relative to the current working directory.
file_path = "../data/raw/credit_risk_dataset.csv"
# Load the raw data for inspection; prepare_data() reads the file again on its own.
df = pd.read_csv(file_path)

In [3]:
# Preview the first rows of the raw frame to check the column layout.
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


## Preparing the data

In [4]:
def prepare_data(file_path, output_dir="../data/processed", models_dir="../models"):
    """Clean, encode, scale, split and rebalance the credit-risk dataset.

    Steps, in order: median/mean imputation of two columns, winsorization of
    three numeric columns, one-hot encoding of the categorical columns,
    standard scaling of the numeric columns, a stratified 80/20 train/test
    split, and SMOTE oversampling of the training portion.

    Parameters
    ----------
    file_path : str
        Path of the raw CSV file passed to ``pd.read_csv``.
    output_dir : str, default "../data/processed"
        Directory that receives every CSV written by this function. It is
        created if it does not exist.
    models_dir : str, default "../models"
        Directory that receives the fitted encoder and scaler (joblib files).
        It is created if it does not exist.

    Returns
    -------
    tuple
        ``(df, X_train, X_test, y_train, y_test, X_train_bal, y_train_bal)``
        where ``df`` is the fully processed frame and the ``*_bal`` objects
        are the SMOTE-resampled training features and target.

    Side effects
    ------------
    Writes ``df_cleaned.csv``, ``X_train.csv``, ``X_test.csv``, ``y_train.csv``,
    ``y_test.csv``, ``X_train_bal.csv``, ``y_train_bal.csv`` and
    ``df_processed.csv`` to ``output_dir``; writes ``one_hot_encoder.joblib``
    and ``standard_scaler.joblib`` to ``models_dir``.
    """
    # Create both destination directories before the first write, so that the
    # early df_cleaned.csv and joblib dumps do not fail on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    # Load dataset
    df = pd.read_csv(file_path)

    # Handling Missing Values
    # Assign the filled column back rather than calling fillna(inplace=True)
    # on a df[...] selection: that is chained assignment, which warns in
    # pandas 2.x and leaves the frame unchanged under Copy-on-Write.
    df['person_emp_length'] = df['person_emp_length'].fillna(df['person_emp_length'].median())
    df['loan_int_rate'] = df['loan_int_rate'].fillna(df['loan_int_rate'].mean())

    # Outlier Handling
    def winsorize_column(df, column, lower=0.01, upper=0.99):
        """Cap `column` at its `lower` and `upper` quantiles and return `df`."""
        # Both bounds are computed before the column is modified.
        lower_bound = df[column].quantile(lower)
        upper_bound = df[column].quantile(upper)
        df[column] = np.where(df[column] < lower_bound, lower_bound, df[column])
        df[column] = np.where(df[column] > upper_bound, upper_bound, df[column])
        return df

    for col in ['person_income', 'loan_amnt', 'loan_percent_income']:
        df = winsorize_column(df, col)

    # Save the cleaned (imputed + winsorized, not yet encoded/scaled) data
    df.to_csv(os.path.join(output_dir, 'df_cleaned.csv'), index=False)

    # Encoding Categorical Variables
    categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
    ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
    encoded_array = ohe.fit_transform(df[categorical_cols])
    # Reuse df.index so the concat below aligns the encoded rows with the originals.
    encoded_df = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out(categorical_cols), index=df.index)
    df = df.drop(columns=categorical_cols)
    df = pd.concat([df, encoded_df], axis=1)

    # Save encoder
    joblib.dump(ohe, os.path.join(models_dir, 'one_hot_encoder.joblib'))

    # Feature Scaling
    numeric_cols = [
        'person_age', 'person_income', 'person_emp_length',
        'loan_amnt', 'loan_int_rate', 'loan_percent_income',
        'cb_person_cred_hist_length'
    ]
    # NOTE(review): the imputation values, winsorization bounds and scaler
    # statistics are all computed on the full dataset, i.e. before the
    # train/test split below, so the test rows influence them. Confirm this
    # is intended before relying on the test metrics.
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Save scaler for real-time inference
    joblib.dump(scaler, os.path.join(models_dir, 'standard_scaler.joblib'))

    # Defining Features and Target
    X = df.drop('loan_status', axis=1)
    y = df['loan_status']

    # Train-Test Split (stratified on the target so both parts keep its class ratio)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    # Handling Class Imbalance (only the training portion is resampled)
    smote = SMOTE(random_state=42)
    X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

    # Save the datasets
    X_train.to_csv(os.path.join(output_dir, 'X_train.csv'), index=False)
    X_test.to_csv(os.path.join(output_dir, 'X_test.csv'), index=False)
    y_train.to_csv(os.path.join(output_dir, 'y_train.csv'), index=False)
    y_test.to_csv(os.path.join(output_dir, 'y_test.csv'), index=False)
    X_train_bal.to_csv(os.path.join(output_dir, 'X_train_bal.csv'), index=False)
    y_train_bal.to_csv(os.path.join(output_dir, 'y_train_bal.csv'), index=False)
    df.to_csv(os.path.join(output_dir, 'df_processed.csv'), index=False)

    return df, X_train, X_test, y_train, y_test, X_train_bal, y_train_bal

In [5]:
# Run the preparation pipeline on the raw CSV. This rebinds `df` to the processed
# frame, replacing the raw frame loaded earlier in the notebook.
df, X_train, X_test, y_train, y_test, X_train_bal, y_train_bal = prepare_data(file_path=file_path)

In [6]:
# Display the processed frame returned by prepare_data().
df

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,-0.903374,-0.142161,28.926614,3.263893,1.625251,1,3.150782,-0.691554,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.060904,-1.323277,0.056763,-1.379575,0.041636,0,-0.665338,-0.938167,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.430783,-1.323277,-0.921876,-0.654033,0.603041,1,3.150782,-0.691554,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.745843,0.029974,-0.187897,3.263893,1.368887,1,3.150782,-0.938167,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.588313,-0.263981,0.790742,3.263893,1.057357,1,3.150782,-0.444942,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,4.610190,-0.301056,-0.921876,-0.605664,0.697149,0,-0.569935,5.966992,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
32577,4.137599,1.473265,-0.187897,1.300899,-1.142830,0,-0.188323,3.254251,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32578,5.870433,0.308039,-0.432557,3.263893,-0.007040,1,2.769170,5.473767,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
32579,4.452660,2.267738,0.056763,0.877666,0.151970,0,-0.665338,4.980541,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
