In [None]:
import os

%matplotlib inline
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    plot_confusion_matrix,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

# Credit Card Payment Default


## The Problem
***

**In class we worked with the Default of Credit Card Clients dataset to estimate whether a person will default on their credit card bills. With the same goal in mind, I will try to improve the F1 score of models I have previously created by using different techniques. Models with higher F1 scores will be better at avoiding false negative cases. In this dataset, a false negative occurs when the model predicts that someone will not default on a payment when they actually will. We want to catch these cases because they are more detrimental to the credit card company: each one is a potential loss of money.**

<br><br>

## Data Preprocessing

***

In [None]:
# Load the raw dataset from a CSV file; the path is relative to the
# notebook's working directory.
cc_df = pd.read_csv("data/UCI_Credit_Card.csv")

In [None]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible. ID is dropped from the predictors along with the target.
y = cc_df["default.payment.next.month"]
X = cc_df.drop(columns=["default.payment.next.month", "ID"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=123, test_size=0.1
)

<br><br>

In [None]:
# Summary statistics of the raw dataset (count, mean, spread, quartiles).
cc_df.describe()

**Looking through the features of the dataset, the ID feature is unlikely to help predict the outcome. The data seems complete: there are no missing values, so no imputation will be needed. It would be interesting to see how big a factor marriage, age and sex are in determining whether someone will default on their payment.**

<br><br>

In [None]:
# Split the rows by target value: `nopay` holds target == 0, `pay` holds target == 1.
nopay = cc_df.loc[cc_df["default.payment.next.month"] == 0]
pay = cc_df.loc[cc_df["default.payment.next.month"] == 1]

# The (rows, columns) shape of each subset exposes the class balance.
print(f"pay: {pay.shape}")
print(f"nopay: {nopay.shape}")

**From the output above, we can see a class imbalance in the target: 23364 clients did not default and 6636 did.**
<br><br>

In [None]:
# Plot each predictor's distribution split by target class. ID and the target
# column are skipped: neither is a predictor, and a histogram of the target
# split by the target carries no information.
features = [
    col for col in cc_df.columns if col not in ("ID", "default.payment.next.month")
]

for feature in features:
    # One explicit figure per feature so nothing leaks between iterations.
    fig, ax = plt.subplots()
    ax.hist(nopay[feature], alpha=1, bins=50, label="0")
    # Drawn second and semi-transparent so it does not hide the larger class.
    ax.hist(pay[feature], alpha=0.5, bins=50, label="1")
    ax.legend(loc="best")
    ax.set(
        xlabel=feature,
        ylabel="count",
        title=f"Histogram of {feature} by target class",
    )
    plt.show()
    # Release the figure; dozens of open figures otherwise accumulate in memory.
    plt.close(fig)

**LIMIT_BAL and AGE seem like relevant features for the given prediction task.**

In [None]:
# Scatter AGE against PAY_AMT1, coloring each point by its LIMIT_BAL value.
cc_df.plot.scatter(
    x="AGE",
    y="PAY_AMT1",
    c="LIMIT_BAL",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    figsize=(10, 10),
)

**This might suggest that middle-aged people (30-50) have a higher LIMIT_BAL, which in turn leads to higher payments.**


## METRICS

**It is more detrimental to the credit card company to miss people who are going to default; that is, we are most concerned about false negatives. Because of the class imbalance, we will use F1, precision and recall scores instead of accuracy. Failing to detect people who will default increases the cost to the company, so we should try to optimize recall rather than precision.**

## Preprocessing and Transformations

***

In [None]:
# Column groups, one per preprocessing treatment.

# Categorical columns. The PAY_* names jump from PAY_0 straight to PAY_2.
categorical_feats = [f"PAY_{k}" for k in (0, 2, 3, 4, 5, 6)] + [
    "MARRIAGE",
    "EDUCATION",
]

# Numeric columns: credit limit, age, then six bill and six payment amounts.
numeric_feats = (
    ["LIMIT_BAL", "AGE"]
    + [f"BILL_AMT{k}" for k in range(1, 7)]
    + [f"PAY_AMT{k}" for k in range(1, 7)]
)

# Columns treated as binary.
binary_feats = ["SEX"]

In [None]:
# Per-group transformer pipelines.
# NOTE(review): none of these three names is referenced by `preprocessor`
# below, which constructs its own estimators with the same settings.
numeric_transformer = make_pipeline(StandardScaler())


categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=False),
)

binary_transformer = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype=int),
)


# Column-wise preprocessing: standardize the numeric columns and one-hot encode
# the binary and categorical ones. Steps are named automatically, and a later
# cell looks up "onehotencoder-2" (the categorical encoder) by that generated
# name, so the order of the tuples below must not change.
# NOTE(review): `sparse=` was renamed `sparse_output=` in newer scikit-learn
# releases — confirm against the installed version.
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_feats), 
    (OneHotEncoder(drop="if_binary", dtype=int), binary_feats),    
    (OneHotEncoder(handle_unknown="ignore", sparse=False), categorical_feats),
)

# Fit the preprocessor on the training split only; `transformed` holds the
# encoded training matrix.
transformed = preprocessor.fit_transform(X_train, y_train)

## Baseline Model

***

In [None]:
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarize cross-validation results as "mean (+/- std)" strings.

    Parameters
    ----------
    model :
        scikit-learn estimator or pipeline
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data
    **kwargs :
        forwarded unchanged to ``cross_validate`` (for example ``cv``,
        ``scoring`` or ``return_train_score``)

    Returns
    ----------
        pandas Series indexed by the keys of the ``cross_validate`` result;
        each value is the mean and standard deviation across folds, formatted
        like ``"0.123 (+/- 0.045)"``
    """
    # One row per fold, one column per timing/score key.
    scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = scores.mean()
    std_scores = scores.std()

    # Both Series share the same column order, so pair the values positionally.
    # Integer ``series[i]`` on a string-labelled Series is deprecated in pandas.
    out_col = [
        "%0.3f (+/- %0.3f)" % (mean, std)
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=out_col, index=mean_scores.index)

In [None]:
# Metrics reported for every cross-validated model.
scoring = ["f1", "recall", "accuracy"]

In [None]:
# Baseline to beat: a dummy classifier using the "prior" strategy, run through
# the same preprocessing pipeline and 5-fold cross-validation.
dummy = DummyClassifier(strategy="prior")
pipe = make_pipeline(preprocessor, dummy)

results_dict = {
    "dummy": mean_std_cross_val_scores(
        pipe, X_train, y_train, scoring=scoring, return_train_score=True, cv=5
    )
}
pd.DataFrame(results_dict)

## Linear Models

***

In [None]:
# First attempt: class-balanced logistic regression fitted directly on the raw
# columns (no preprocessing pipeline).
lr = LogisticRegression(class_weight="balanced", max_iter=1000)
scores = cross_validate(
    lr, X_train, y_train, scoring=scoring, return_train_score=True
)
pd.DataFrame(scores).describe()

In [None]:
# Tune the regularization strength C of a class-balanced logistic regression,
# selecting candidates on F1.
pipe = make_pipeline(
    preprocessor,
    LogisticRegression(class_weight="balanced", max_iter=1000),
)

# Candidate values: 2**-4, 2**-3, ..., 2**3.
param_grid = {"logisticregression__C": 2.0 ** np.arange(-4, 4)}

search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="f1",
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)
search.fit(X_train, y_train);

In [None]:
# Keep only the columns needed to compare candidates, best test score first.
grid_results_df = (
    pd.DataFrame(search.cv_results_)
    .loc[
        :,
        [
            "mean_test_score",
            "mean_train_score",
            "param_logisticregression__C",
            "rank_test_score",
        ],
    ]
    .sort_values(by="mean_test_score", ascending=False)
)
grid_results_df

In [None]:
# Summary statistics taken across the grid candidates (one row per C value).
grid_results_df.describe()


**We use F1 scores for evaluation in order to counteract the class imbalance. From the results above, the mean F1 score across the grid is 0.532150 with a standard deviation of 0.000679. Compared with the dummy classifier (F1 = 0) and the first attempt (F1 = 0.419531), the optimized logistic regression model is significantly better. The table above shows that C = 0.0625 gives the best F1 score, 0.533144.**

## Different Classifiers

***

In [None]:
from catboost import CatBoostClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# One pipeline per candidate model. Every pipeline shares the same preprocessor
# and uses seed 123 so results are repeatable.
# NOTE(review): unlike the earlier logistic-regression cells, this
# LogisticRegression is not given class_weight="balanced" — confirm intended.
pipe_lr = make_pipeline(
    preprocessor,
    LogisticRegression(random_state=123, max_iter=2000),
)
pipe_dt = make_pipeline(
    preprocessor,
    DecisionTreeClassifier(random_state=123),
)
pipe_rf = make_pipeline(
    preprocessor,
    RandomForestClassifier(random_state=123),
)
pipe_xgb = make_pipeline(
    preprocessor,
    XGBClassifier(eval_metric="logloss", verbosity=0, random_state=123),
)
pipe_lgbm = make_pipeline(
    preprocessor,
    LGBMClassifier(random_state=123),
)
pipe_catboost = make_pipeline(
    preprocessor,
    CatBoostClassifier(random_state=123, verbose=0),
)

# Display name -> pipeline, in the order the results table should list them.
classifiers = dict(
    zip(
        [
            "logistic regression",
            "decision tree",
            "random forest",
            "XGBoost",
            "LightGBM",
            "CatBoost",
        ],
        [pipe_lr, pipe_dt, pipe_rf, pipe_xgb, pipe_lgbm, pipe_catboost],
    )
)

In [None]:
# Collects one cross-validation summary per classifier name.
results = {}
import warnings

# Hide deprecation and user warnings from here on.
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)

In [None]:
# Cross-validate every candidate pipeline and store its "mean (+/- std)"
# summary under the classifier's display name.
for name, model in classifiers.items():
    results[name] = mean_std_cross_val_scores(
        model, X_train, y_train, scoring=scoring, return_train_score=True
    )

In [None]:
# Transposed so each classifier is a row and each timing/score key a column.
pd.DataFrame(results).T

## Hyperparameter Optimization

***

In [None]:
# Tune tree depth and leaf count for LightGBM, selecting candidates on F1.
lgbm_pipe = make_pipeline(preprocessor, LGBMClassifier(random_state=123))

# LightGBM requires 1 < num_leaves <= 131072, so candidates are kept well
# inside that range. Both lists include the library defaults (max_depth=-1,
# i.e. no depth limit, and num_leaves=31) so the search always evaluates the
# untuned configuration alongside the alternatives.
param_grid = {
    "lgbmclassifier__max_depth": [-1, 4, 8, 12, 16],
    "lgbmclassifier__num_leaves": [7, 15, 31, 63, 127],
}

lgbm_search = GridSearchCV(
    lgbm_pipe,
    param_grid,
    cv=5,
    n_jobs=-1,
    return_train_score=True,
    scoring="f1",
)
lgbm_search.fit(X_train, y_train);

In [None]:
# Parameter combination with the best mean cross-validated F1.
lgbm_search.best_params_

In [None]:
# Mean cross-validated F1 of that best combination.
lgbm_search.best_score_

**The unoptimized model (F1 = 0.480) performed better than the optimized model (F1 = 0.47148175331599296); this may be due to randomness or luck. However, LightGBM in general performed worse than the logistic regression model (F1 = 0.533144). This suggests that logistic regression may be the best model we have for predicting our target.**

## Interpretation of Feature Importances
***

In [None]:
# Refit the class-balanced logistic regression with the C selected by the grid
# search, read from the search object rather than hard-coded so the two stay in
# sync when the notebook is re-run.
best_C = search.best_params_["logisticregression__C"]
best_pipe = make_pipeline(
    preprocessor,
    LogisticRegression(max_iter=1000, class_weight="balanced", C=best_C),
)
best_pipe.fit(X_train, y_train)

# coef_ is flattened to a 1-D array: one coefficient per transformed column.
coeffs = best_pipe.named_steps["logisticregression"].coef_.flatten()

In [None]:
# Inspect the fitted sub-transformers and their auto-generated names; the next
# cell looks one of them up by name.
preprocessor.named_transformers_

In [None]:
# Names of the transformed columns, in the order the preprocessor emits them:
# numeric, then binary, then the expanded one-hot names of the categoricals.
new_columns = [
    *numeric_feats,
    *binary_feats,
    *preprocessor.named_transformers_["onehotencoder-2"].get_feature_names(
        categorical_feats
    ),
]

In [None]:
# `new_columns` was built in the previous cell and is reused here instead of
# being recomputed. Guard against the names and coefficients drifting apart.
assert len(new_columns) == len(coeffs), (
    "feature names and coefficients are misaligned"
)

# One row per transformed feature, largest (most positive) coefficient first.
features = pd.DataFrame(coeffs, index=new_columns, columns=["Coefficient"])
features.sort_values(by="Coefficient", ascending=False)

In [None]:
# mglearn is used only for this one plot.
import mglearn
# Plot the coefficients against their feature names, limited to 10 top features.
# NOTE(review): presumably shows the 10 most positive and 10 most negative
# coefficients — confirm against mglearn.
mglearn.tools.visualize_coefficients(coeffs, new_columns, n_top_features=10)

**The top 3 features with biggest coefficients: Pay_0_2, Pay_0_3, Education_2.**

**The bottom 3 features with smallest coefficients: Pay_0_0, Pay_0_2, Education_5.**

**These features are the biggest factors in determining whether someone will default on their payment. Pay_0_0 represents whether someone paid their entire balance for that month on time. That makes sense, since people who pay their statement on time are unlikely to default. Pay_0_2 represents whether someone has delayed payment for two months. This suggests that people are most likely to default if they have already delayed their payment by two months. Overall, the coefficients of the top and bottom features make sense.**

In [None]:
# Score the tuned logistic-regression search on the held-out test set; the
# search was configured with scoring="f1", so this is an F1 score.
search.score(X_test, y_test)

**1. Final test F1 score: 0.5391891891891891; mean cross-validation F1 score: 0.532150**

**2. My initial assumption was that marriage and age would play a role in whether someone would default on a payment. However, the model seems to suggest otherwise. The final test score is quite low, and I would not deploy this model in real life.**

**3. Ways to improve performance/interpretability:**

- **Research and collect additional features that should play a role in whether payments are defaulted.**
- **Find the best hyperparameters for all the models used and then compare their F1 scores.**
- **Try removing features whose coefficients are small in magnitude.**