In [53]:
import os
import json
import zipfile
import gzip
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    precision_score, recall_score, f1_score, balanced_accuracy_score
)
from sklearn.metrics import confusion_matrix


In [54]:
def load_data_train(
    zip_path='../files/input/train_data.csv.zip',
    member='train_default_of_credit_card_clients.csv',
):
    """Read the training CSV stored inside a zip archive into a DataFrame.

    Parameters
    ----------
    zip_path : str or path-like
        Location of the zip archive. Defaults to the path this notebook
        reads the training data from.
    member : str
        Name of the CSV file inside the archive.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If ``zip_path`` does not exist.
    KeyError
        If ``member`` is not present in the archive.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Read straight from the archive; nothing is extracted to disk.
        with archive.open(member) as csv_file:
            return pd.read_csv(csv_file)

def clean_data(data):
    """Return a cleaned copy of the raw credit-card frame.

    Steps, in order:
      1. rename ``'default payment next month'`` to ``'default'``;
      2. drop the ``ID`` column;
      3. drop every row that contains a missing value;
      4. cap ``EDUCATION`` at 4 (any value above 4 becomes 4).

    The input frame is not modified.
    """
    return (
        data
        .rename(columns={'default payment next month': 'default'})
        .drop(columns='ID')
        .dropna()
        # Values above 4 are folded into level 4.
        .assign(EDUCATION=lambda frame: frame['EDUCATION'].clip(upper=4))
    )


# Load the raw training set and clean it in one step, then preview it.
data_train = clean_data(load_data_train())
data_train.head()


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,310000,1,3,1,32,0,0,0,0,0,...,84373,57779,14163,8295,6000,4000,3000,1000,2000,0
1,10000,2,3,1,49,-1,-1,-2,-1,2,...,1690,1138,930,0,0,2828,0,182,0,1
2,50000,1,2,1,28,-1,-1,-1,0,-1,...,45975,1300,43987,0,46257,2200,1300,43987,1386,0
3,80000,2,3,1,52,2,2,3,3,3,...,40748,39816,40607,3700,1600,1600,0,1600,1600,1
4,270000,1,1,2,34,1,2,0,0,2,...,22448,15490,17343,0,4000,2000,0,2000,2000,0


In [55]:
def load_data_test(
    zip_path='../files/input/test_data.csv.zip',
    member='test_default_of_credit_card_clients.csv',
):
    """Read the test CSV stored inside a zip archive into a DataFrame.

    Parameters
    ----------
    zip_path : str or path-like
        Location of the zip archive. Defaults to the path this notebook
        reads the test data from.
    member : str
        Name of the CSV file inside the archive.

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If ``zip_path`` does not exist.
    KeyError
        If ``member`` is not present in the archive.
    """
    with zipfile.ZipFile(zip_path, 'r') as archive:
        # Read straight from the archive; nothing is extracted to disk.
        with archive.open(member) as csv_file:
            return pd.read_csv(csv_file)

def clean_data(data):
    """Rename the target column, drop ``ID`` and incomplete rows, cap EDUCATION at 4.

    The caller's frame is left untouched; a new frame is returned.
    """
    # NOTE(review): this is a verbatim re-definition of the clean_data from
    # the training-data cell; it shadows that one. Consider keeping a single
    # definition.
    renamed = data.rename(columns={'default payment next month': 'default'})
    complete_rows = renamed.drop(columns='ID').dropna()
    education = complete_rows['EDUCATION']
    # Any EDUCATION value above 4 is replaced by 4.
    return complete_rows.assign(EDUCATION=education.mask(education > 4, 4))

# Load the raw test set and clean it in one step, then preview it.
data_test = clean_data(load_data_test())
data_test.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default
0,120000,2,2,2,26,-1,2,0,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
1,20000,1,3,2,35,-2,-2,-2,-2,-1,...,0,13007,13912,0,0,0,13007,1122,0,0
2,200000,2,3,2,34,0,0,2,0,0,...,2513,1828,3731,2306,12,50,300,3738,66,0
3,250000,1,1,2,29,0,0,0,0,0,...,59696,56875,55512,3000,3000,3000,3000,3000,3000,0
4,50000,2,3,3,23,1,2,0,0,0,...,28771,29531,30211,0,1500,1100,1200,1300,1100,0


In [56]:
def split_data(data):
    """Split a frame into its feature columns and the ``default`` target.

    Returns
    -------
    tuple
        ``(features, target)`` where ``features`` is ``data`` without the
        ``default`` column and ``target`` is that column as a Series.
    """
    target_column = 'default'
    features = data.drop(columns=target_column)
    target = data[target_column]
    return features, target

# Apply the same feature/target split to both cleaned datasets.
(x_train, y_train), (x_test, y_test) = map(split_data, (data_train, data_test))

In [57]:
def create_pipeline():
    """Build the preprocessing + random-forest pipeline (unfitted).

    The pipeline has two steps:

    * ``'preprocessor'`` - a ``ColumnTransformer`` that one-hot encodes
      ``SEX``, ``EDUCATION`` and ``MARRIAGE`` and passes every other column
      through unchanged.
    * ``'classifier'`` - a ``RandomForestClassifier`` seeded with
      ``random_state=42``.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    categorical = ["SEX", "EDUCATION", "MARRIAGE"]
    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore': a category that was absent from the
            # data the encoder was fitted on (e.g. missing from a CV fold, or
            # only present in the test set) is encoded as all zeros instead
            # of raising at transform time.
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ],
        remainder='passthrough',
    )
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42)),
    ])

    return model_pipeline

# Unfitted pipeline; it is fitted later inside the grid search.
model_pipeline = create_pipeline()

In [58]:
def optimize_hyperparameters(model, x, y):
    """Run an exhaustive grid search over random-forest settings.

    Parameters
    ----------
    model : estimator
        Pipeline whose final step is named ``'classifier'`` (the grid keys
        use the ``classifier__`` prefix).
    x, y
        Training features and target passed to ``GridSearchCV.fit``.

    Returns
    -------
    GridSearchCV
        The fitted search object. Because ``refit=True`` the best candidate
        is retrained on all of ``x``/``y`` and is used by ``predict``.
    """
    # 3 * 3 * 3 * 3 * 2 = 162 candidate combinations.
    forest_grid = {
        'classifier__n_estimators': [200, 300, 500],
        'classifier__max_depth': [None, 10, 20],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 2, 4],
        'classifier__max_features': ['sqrt', 'log2'],
    }

    search = GridSearchCV(
        estimator=model,
        param_grid=forest_grid,
        scoring="balanced_accuracy",
        cv=10,
        refit=True,
        n_jobs=-1,
        verbose=2,
    )

    # fit() returns the search object itself.
    return search.fit(x, y)

# Long-running cell: 162 candidates x 10 folds = 1620 pipeline fits.
grid_search = optimize_hyperparameters(model_pipeline, x_train, y_train)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


In [61]:
def save_model(model, path='../files/models/model.pkl.gz'):
    """Pickle ``model`` into a gzip-compressed file.

    Parameters
    ----------
    model : object
        Any picklable object (here, the fitted grid search).
    path : str
        Destination file. Its parent directory is created if it does not
        exist. Defaults to ``'../files/models/model.pkl.gz'``.

    Returns
    -------
    None
    """
    directory = os.path.dirname(path)
    # A bare file name has no directory part; makedirs('') would raise.
    if directory:
        os.makedirs(directory, exist_ok=True)
    with gzip.open(path, 'wb') as file:
        pickle.dump(model, file)

# Persist the whole fitted GridSearchCV object, not only the best estimator.
save_model(grid_search)

In [62]:
# Predict on both splits with the fitted grid search (train first, then test).
y_train_pred, y_test_pred = (
    grid_search.predict(features) for features in (x_train, x_test)
)

def metrics_calc(y_true, y_pred, dataset):
    """Build the score summary record for one dataset.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels, passed unchanged to each scorer.
    dataset : str
        Label stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns
    -------
    dict
        Keys, in order: ``type``, ``dataset``, ``precision``,
        ``balanced_accuracy``, ``recall``, ``f1_score``.
    """
    # Insertion order here fixes the key order of the resulting record.
    scorers = {
        'precision': precision_score,
        'balanced_accuracy': balanced_accuracy_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }
    record = {'type': 'metrics', 'dataset': dataset}
    for score_name, scorer in scorers.items():
        record[score_name] = scorer(y_true, y_pred)
    return record

def matrix_calc(y_true, y_pred, dataset):
    """Build the confusion-matrix record for one dataset.

    Parameters
    ----------
    y_true, y_pred
        True and predicted labels; the classes are expected to be 0 and 1.
    dataset : str
        Label stored under the ``'dataset'`` key (e.g. ``'train'``).

    Returns
    -------
    dict
        ``{'type': 'cm_matrix', 'dataset': ..., 'true_0': {...},
        'true_1': {...}}`` with plain ``int`` counts so the record is
        JSON-serializable.
    """
    # Pin the label order so the matrix is always 2x2. Without ``labels`` a
    # split in which only one class appears yields a 1x1 matrix and the
    # indexing below raises IndexError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return {
        'type': 'cm_matrix',
        'dataset': dataset,
        'true_0': {"predicted_0": int(cm[0, 0]), "predicted_1": int(cm[0, 1])},
        'true_1': {"predicted_0": int(cm[1, 0]), "predicted_1": int(cm[1, 1])}
    }
# Score summaries first, then confusion matrices, each for train and test.
metrics = [
    metrics_calc(y_train, y_train_pred, 'train'),
    metrics_calc(y_test, y_test_pred, 'test'),
    matrix_calc(y_train, y_train_pred, 'train'),
    matrix_calc(y_test, y_test_pred, 'test')
]
# Create the output folder if needed, the same way save_model does for the
# models folder; otherwise open() fails on a fresh checkout.
os.makedirs("../files/output", exist_ok=True)
with open("../files/output/metrics.json", "w") as f:
    for metric in metrics:
        # One JSON object per line.
        f.write(json.dumps(metric) + "\n")