In [1]:
# Import dependencies
import time

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    make_scorer,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    KFold,
    cross_val_score,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## Preprocessing

In [2]:
# Read the combined dataset CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/combined_data.csv')

In [3]:
# View columns to determine which to remove
# (the bare expression on the last line is displayed as the cell output)
df.columns

Index(['ID', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'DAYS_BIRTH',
       'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE',
       'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'VINTAGE',
       'FINAL_MONTH', 'NUM_OF_MONTHS', 'NUM_OF_DLQ_MONTHS', 'DLQ_PERCENT',
       'RISK_FLAG'],
      dtype='object')

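### Choose ONE of the three cells below (run only one: each cell drops columns from `df` in place, so running a second one, e.g. via "Run All", raises a `KeyError` for columns that are already gone):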

In [None]:
# Version 1 feature set: discard the ID, the phone / e-mail flags and the
# DLQ_PERCENT and NUM_OF_MONTHS columns, which leaves NUM_OF_DLQ_MONTHS,
# VINTAGE and FINAL_MONTH as the engineered variables.
df = df.drop(
    labels=['ID', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'DLQ_PERCENT', 'NUM_OF_MONTHS'],
    axis='columns',
)

In [4]:
# Version 2 feature set: discard the ID, the phone / e-mail flags and the
# DLQ_PERCENT, VINTAGE and FINAL_MONTH columns, which leaves NUM_OF_DLQ_MONTHS
# and NUM_OF_MONTHS as the engineered variables.
df = df.drop(
    labels=['ID', 'FLAG_MOBIL', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL',
            'DLQ_PERCENT', 'VINTAGE', 'FINAL_MONTH'],
    axis='columns',
)

In [None]:
# Version 3 feature set: keep only the engineered variables, the two application
# fields AMT_INCOME_TOTAL and DAYS_EMPLOYED, and the RISK_FLAG target column;
# every other column is dropped.
df = df.drop(columns=[
    col for col in df.columns
    if col not in ('NUM_OF_MONTHS', 'VINTAGE', 'NUM_OF_DLQ_MONTHS', 'DLQ_PERCENT',
                   'AMT_INCOME_TOTAL', 'DAYS_EMPLOYED', 'RISK_FLAG')
])

### Resume Preprocessing:

In [5]:
# Replace missing OCCUPATION_TYPE entries with the literal string 'NULL'
# (only this column is filled; NaNs in other columns are left as they are)
df = df.fillna(value={'OCCUPATION_TYPE': 'NULL'})

In [6]:
# Collect the names of all object-dtype columns; these are the ones to one-hot encode
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [7]:
# Create a OneHotEncoder instance that returns a dense array
enc = OneHotEncoder(sparse_output=False)

# Fit the encoder on the categorical columns and wrap the result in a DataFrame.
# The frame reuses df's index so that the axis=1 concat in the following cell
# lines rows up by label even if df no longer has a default RangeIndex; the
# column names come from the encoder's generated feature names.
encode_df = pd.DataFrame(
    enc.fit_transform(df[categorical]),
    columns=enc.get_feature_names_out(categorical),
    index=df.index,
)

In [8]:
# Append the encoded columns with pd.concat, then keep only the non-object
# columns (this discards the original string columns that were just encoded)
preprocessed_df = (
    pd.concat([df, encode_df], axis=1)
    .select_dtypes(exclude=['object'])
)

In [9]:
# Separate the feature matrix (X) from the RISK_FLAG target (y)
X = preprocessed_df.drop(labels='RISK_FLAG', axis=1)
y = preprocessed_df.loc[:, 'RISK_FLAG']

In [10]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=47,
)

### Oversampling the minority class via `SMOTE`

In [11]:
# Initialize SMOTE to oversample the minority class in the training data
smote = SMOTE(random_state=47)

# Apply to training data only; the test split is left untouched.
# NOTE(review): resampling runs on the unscaled features (scaling happens in a
# later cell). SMOTE is neighbour-based, so confirm this ordering is intended.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [12]:
# Print the class counts before and after SMOTE to compare the distributions
for labels in (y_train, y_train_resampled):
    print(labels.value_counts())

RISK_FLAG
0.0    21163
1.0      711
Name: count, dtype: int64
RISK_FLAG
0.0    21163
1.0    21163
Name: count, dtype: int64


In [13]:
# Standardize features: fit the scaler on the SMOTE-resampled training set only,
# then apply those statistics to both the training and the test features
scaler = StandardScaler()
X_scaler = scaler.fit(X_train_resampled)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train_resampled, X_test)
)

## LogisticRegression Model

### Using X_train, X_test data

In [14]:
# Logistic regression classifier: L-BFGS solver, up to 1000 iterations, fixed seed
lr_model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=47,
)

# Show the estimator's repr as the cell output
lr_model

In [32]:
# Fit the logistic regression on the scaled, resampled training data and time it
start_time = time.perf_counter()
lr_model.fit(X_train_scaled, y_train_resampled)  # fit() returns the estimator itself
end_time = time.perf_counter()

total = end_time - start_time
print(f'Execution time: {total} seconds')

Execution time: 0.09487966692540795 seconds


In [16]:
# Making predictions using the testing data
# (X_test_scaled was transformed with the scaler fitted on the resampled training data)
lr_predictions = lr_model.predict(X_test_scaled)

In [17]:
# Confusion matrix for the logistic regression, shown as a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes
cm1 = confusion_matrix(y_test, lr_predictions)

pd.DataFrame(
    data=cm1,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14072,51
Actual 1,2,458


In [18]:
# Per-class precision / recall / F1 for the logistic regression predictions
lr_class_report = classification_report(y_true=y_test, y_pred=lr_predictions)
print(lr_class_report)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     14123
         1.0       0.90      1.00      0.95       460

    accuracy                           1.00     14583
   macro avg       0.95      1.00      0.97     14583
weighted avg       1.00      1.00      1.00     14583



* Version 1+2: P 0.90, R 1.00, F1 0.95
* Version 3: P 0.87, R 1.00, F1 0.93

Version 3 has the fewest features and yields lower precision and a lower F1 score.

### Cross-Validation using `KFold`

In [19]:
# Define pipeline (cross_val_score, KFold, Pipeline, recall_score and make_scorer
# are imported in the first cell together with the other dependencies)
pipeline = Pipeline([
    ('scaler', StandardScaler()), # Standardize the training data for each fold
    ('classifier', lr_model)]) # use our LogisticRegression model

# Scorer that reports recall for the negative class (label 0), i.e. the
# specificity / true-negative rate. (Sensitivity would be the recall of the
# positive class, which is NOT what this scorer measures.)
negative = make_scorer(recall_score, pos_label=0)

In [20]:
# Initialize KFold into 10 splits
# (rows are shuffled with a fixed seed so the folds are reproducible)
# NOTE(review): plain KFold does not stratify by class; given the class imbalance
# printed earlier, consider whether StratifiedKFold is more appropriate.
kf = KFold(n_splits=10, shuffle=True, random_state=47)

In [21]:
# Cross-validate the pipeline on the full X / y (not the SMOTE-resampled split)
# and report the total execution time
start_time = time.perf_counter()
lr_cv_scores = cross_val_score(estimator=pipeline, X=X, y=y, scoring=negative, cv=kf)
end_time = time.perf_counter()

total = end_time - start_time
print(f'Execution time: {total} seconds')

Execution time: 0.6310374580789357 seconds


In [31]:
# Mean of the per-fold negative-class recall (specificity) scores
print(f"Average Specificity Score {lr_cv_scores.mean()}")

Average Specificity Score 0.9990924599047716


## RandomForest Model

### Optimizing RandomForest using `GridSearchCV` - SKIP THIS PART

In [None]:
# Initialize RandomForestClassifier
# (only the seed is set here; this instance is the estimator handed to GridSearchCV below)
rf_model = RandomForestClassifier(random_state=47)

In [None]:
# Candidate values for the GridSearchCV sweep
n_estimators = [100, 200, 500]   # number of trees; more trees cost more compute time
min_samples_split = [2, 5, 10]   # Higher values may reduce overfitting
min_samples_leaf = [1, 2, 4]     # Higher values may reduce overfitting

# Map each RandomForest parameter name to its list of candidates
param_grid = dict(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Initialize GridSearchCV (imported in the first cell with the other dependencies).
# No `scoring` or `cv` argument is passed, so scikit-learn's defaults apply.
grid_search = GridSearchCV(
    estimator = rf_model,
    param_grid = param_grid)

In [None]:
# Run the grid search over the RandomForest parameters on the scaled,
# resampled training data and report how long it took
start_time = time.perf_counter()
grid_search.fit(X_train_scaled, y_train_resampled)
end_time = time.perf_counter()

total = end_time - start_time
print(f'Execution time: {total} seconds')

In [None]:
# Print results of GridSearchCV: the best parameter combination found by the search
print(grid_search.best_params_)

Per the `GridSearchCV` results, the best results for Versions 1 and 2 were achieved with the following parameters:
* `n_estimators`: 200
* `min_samples_leaf`: 4
* `min_samples_split`: 2

Version 3:
* `n_estimators`: 100
* `min_samples_leaf`: 1
* `min_samples_split`: 2

### Using `X_train`, `X_test` data

In [23]:
# Rebuild the classifier with the tuned parameters reported above for Version 1 + 2
rf_model = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=2,
    min_samples_leaf=4,
    random_state=47,
)

In [24]:
# Fit the random forest on the scaled, resampled training data and time it
start_time = time.perf_counter()
rf_model.fit(X_train_scaled, y_train_resampled)  # fit() returns the estimator itself
end_time = time.perf_counter()

total = end_time - start_time
print(f'Execution time: {total} seconds')

Execution time: 4.503977166954428 seconds


In [25]:
# Make predictions using the testing data
# (X_test_scaled was transformed with the scaler fitted on the resampled training data)
rf_predictions = rf_model.predict(X_test_scaled)

In [26]:
# Confusion matrix for the random forest, shown as a labelled DataFrame:
# rows are the actual classes, columns are the predicted classes
cm2 = confusion_matrix(y_test, rf_predictions)

pd.DataFrame(
    data=cm2,
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,14074,49
Actual 1,96,364


In [27]:
# Per-class precision / recall / F1 for the random forest predictions
rf_class_report = classification_report(y_true=y_test, y_pred=rf_predictions)

print(rf_class_report)

              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99     14123
         1.0       0.88      0.79      0.83       460

    accuracy                           0.99     14583
   macro avg       0.94      0.89      0.91     14583
weighted avg       0.99      0.99      0.99     14583



* Version 1: P 0.80, R 0.74, F1 0.77
* Version 2: P 0.88, R 0.79, F1 0.83
* Version 3: 

In [28]:
# Display the importance of the features per the RF model, sorted from most to least important
# (reverse=True puts the largest importance value first)
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5570930731948235, 'NUM_OF_DLQ_MONTHS'),
 (0.040438775128271785, 'FLAG_OWN_REALTY_Y'),
 (0.038786639532852316, 'FLAG_OWN_REALTY_N'),
 (0.032625962548943746, 'CODE_GENDER_M'),
 (0.03240361951306646, 'CODE_GENDER_F'),
 (0.026230697861894227, 'FLAG_OWN_CAR_Y'),
 (0.026204555622133124, 'NUM_OF_MONTHS'),
 (0.026046417011789627, 'FLAG_OWN_CAR_N'),
 (0.02524410468041464, 'NAME_FAMILY_STATUS_Married'),
 (0.01936285307105088, 'CNT_FAM_MEMBERS'),
 (0.01582480993949111, 'NAME_INCOME_TYPE_Working'),
 (0.014888847946185856, 'NAME_EDUCATION_TYPE_Secondary / secondary special'),
 (0.014395278027332009, 'AMT_INCOME_TOTAL'),
 (0.01425330250745193, 'NAME_EDUCATION_TYPE_Higher education'),
 (0.012847946841192517, 'DAYS_BIRTH'),
 (0.012671979319248513, 'NAME_INCOME_TYPE_Commercial associate'),
 (0.011290475421974703, 'OCCUPATION_TYPE_Laborers'),
 (0.010944480007857547, 'NAME_FAMILY_STATUS_Single / not married'),
 (0.010159838293822773, 'DAYS_EMPLOYED'),
 (0.007112837546124727, 'NAME_FAMILY_STATUS_Civil

### Cross Validation

In [None]:
# Cross-validation pipeline for the RF model
pipeline2 = Pipeline(steps=[
    ('scaler', StandardScaler()),   # standardize the training data for each fold
    ('classifier', rf_model),       # use our RF model
])

# Initialize KFold into 10 shuffled splits with a fixed seed
kf2 = KFold(n_splits=10, shuffle=True, random_state=47)

In [None]:
# Cross-validate the RF pipeline on the full X / y (not the SMOTE-resampled split)
# and report the total execution time
start_time = time.perf_counter()
rf_cv_scores = cross_val_score(estimator=pipeline2, X=X, y=y, scoring=negative, cv=kf2)
end_time = time.perf_counter()

total = end_time - start_time
print(f'Execution time: {total} seconds')

In [None]:
# Show the per-fold scores returned by cross_val_score for the RF pipeline
rf_cv_scores

## XGBoost

coming soon