# Data Preprocessing

## Importing Packages

In [1]:
# Packages for EDA 
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np 

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from datasist.structdata import detect_outliers
# from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import category_encoders as ce
import re 

# Modeling and evaluation 
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report 
import joblib



## Reading the Cleaned Data

In [2]:
df = pd.read_csv("../data/cleaned_train_data.csv")
df.head()

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Monthly_Balance,Credit_Score,Credit-Builder Loan,Personal Loan,Debt Consolidation Loan,Student Loan,Payday Loan,Mortgage Loan,Auto Loan,Home Equity Loan
0,1.0,January,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,312.494089,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,February,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,284.629162,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2,1.0,March,33.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,331.209863,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
3,1.0,April,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,223.45131,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
4,1.0,May,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,...,341.489231,Good,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


## Handling Catogerical Data

In [3]:
df.select_dtypes(include="object").head()

Unnamed: 0,Month,Occupation,Payment_of_Min_Amount,Payment_Behaviour,Credit_Score
0,January,Scientist,No,High_spent_Small_value_payments,Good
1,February,Scientist,No,Low_spent_Large_value_payments,Good
2,March,Scientist,No,Low_spent_Medium_value_payments,Good
3,April,Scientist,No,Low_spent_Small_value_payments,Good
4,May,Scientist,No,High_spent_Medium_value_payments,Good


In [4]:
df['Credit_Score'].value_counts()

Standard    53174
Poor        28998
Good        17828
Name: Credit_Score, dtype: int64

In [5]:
credit_score_mapping = {
    "Poor":0,
    "Standard":1,
    "Good":2
}

In [6]:
df['Credit_Score'] = df['Credit_Score'].map(credit_score_mapping)

In [7]:
del df['Customer_ID']

In [8]:
df = pd.get_dummies(df, drop_first = True)
df.head()

Unnamed: 0,Age,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,...,Occupation_Musician,Occupation_Scientist,Occupation_Teacher,Occupation_Writer,Payment_of_Min_Amount_Yes,Payment_Behaviour_High_spent_Medium_value_payments,Payment_Behaviour_High_spent_Small_value_payments,Payment_Behaviour_Low_spent_Large_value_payments,Payment_Behaviour_Low_spent_Medium_value_payments,Payment_Behaviour_Low_spent_Small_value_payments
0,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,11.27,...,0,1,0,0,0,0,1,0,0,0
1,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,-1.0,7.0,11.27,...,0,1,0,0,0,0,0,1,0,0
2,33.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,6.27,...,0,1,0,0,0,0,0,0,1,0
3,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,5.0,4.0,6.27,...,0,1,0,0,0,0,0,0,0,1
4,23.0,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,7.0,11.27,...,0,1,0,0,0,1,0,0,0,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 54 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   Age                                                 100000 non-null  float64
 1   Annual_Income                                       100000 non-null  float64
 2   Monthly_Inhand_Salary                               100000 non-null  float64
 3   Num_Bank_Accounts                                   100000 non-null  float64
 4   Num_Credit_Card                                     100000 non-null  float64
 5   Interest_Rate                                       100000 non-null  float64
 6   Num_of_Loan                                         100000 non-null  float64
 7   Delay_from_due_date                                 100000 non-null  float64
 8   Num_of_Delayed_Payment                              100000 non-nu

In [None]:
# df.to_csv("Preprocessed_Data.csv")

In [None]:
# df = pd.read_csv("../input/credit-score-data-preprocessed/Preprocessed_Data.csv", low_memory = False)

### Data Spliting 
- Try Resampling 

In [10]:
# define dataset
X, y = df.drop("Credit_Score",axis=1).values , df["Credit_Score"]

In [11]:
X[0], y[0]

(array([2.30000000e+01, 1.91141200e+04, 1.82484333e+03, 3.00000000e+00,
        4.00000000e+00, 3.00000000e+00, 4.00000000e+00, 3.00000000e+00,
        7.00000000e+00, 1.12700000e+01, 4.00000000e+00, 2.00000000e+00,
        8.09980000e+02, 2.68226196e+01, 2.65000000e+02, 4.95749492e+01,
        8.04152954e+01, 3.12494089e+02, 1.00000000e+00, 1.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00]),
 2)

## Apply oversampling
- Data is Not 100% balanced, Let's try improving it. 

In [12]:
y.value_counts(normalize=True) * 100

1    53.174
0    28.998
2    17.828
Name: Credit_Score, dtype: float64

In [13]:
from imblearn.over_sampling import SMOTE
over_sampler = SMOTE(sampling_strategy='auto')
X_os, y_os = over_sampler.fit_resample(X, y)

In [15]:
y_os.value_counts(normalize=True) * 100

2    33.333333
1    33.333333
0    33.333333
Name: Credit_Score, dtype: float64

In [19]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_os, y_os, test_size=0.3, random_state=42,stratify=y_os)

### Handling Numerical 
- Using Power transformer to avoid Data Skewness

In [20]:
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train)

In [21]:
X_train = power_transformer.transform(X_train)
X_test = power_transformer.transform(X_test)

# Modeling and Evaluation


#### Model Building

In [22]:
bagging = BaggingClassifier(n_jobs=-1)
extraTrees = ExtraTreesClassifier(max_depth=10, n_jobs=-1)
randomForest = RandomForestClassifier(n_jobs=-1)
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier(n_jobs=-1)

model = StackingClassifier([
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
], n_jobs=-1)

#### Model fitting

In [23]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model Evaluation

In [24]:
print("Train Score: ",model.score(X_train, y_train))

Train Score:  0.9992746160390453


In [25]:
print("Test Score: ",model.score(X_test, y_test))

Test Score:  0.8496771632154125


In [26]:
y_pred = model.predict(X_test)

In [27]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.84      0.87      0.86     15397
           1       0.81      0.79      0.80     16289
           2       0.90      0.89      0.89     16171

    accuracy                           0.85     47857
   macro avg       0.85      0.85      0.85     47857
weighted avg       0.85      0.85      0.85     47857



## Model Saving

In [29]:
# joblib.dump(model,'../models/model.h5')
# joblib.dump(power_transformer,'../models/power_transformer.h5')