In [1]:
import pandas as pd
import numpy as np
# Preparation and preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
# Pipeline
from sklearn.pipeline import Pipeline
# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

# Performance evaluation
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, make_scorer
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score
from scipy.stats import ks_2samp


In [2]:
df1 = pd.read_csv("datasets/australian_credit.csv")
df2 = pd.read_csv("datasets/GMSC/cs-training.csv")
df3 = pd.read_csv("datasets/german_credit.csv")
df4 = pd.read_csv("datasets/UCI_Credit_Card.csv")

datasets = [df1, df2, df3, df4]

# split each dataset into X and y
Xs = []
ys = []

for dataset in datasets:
    Xs.append(dataset.iloc[:, :-1])
    ys.append(dataset.iloc[:, -1])

ys[2] = ys[2].replace({1: 0, 2: 1})

Xs[3].iloc[:, 2] = Xs[3].iloc[:, 2].replace({1: 0, 2: 1})
to_drop = Xs[3].columns[[0, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]]
Xs[3] = Xs[3].drop(to_drop, axis = 1)

Xs[1].loc[65695, "age"] = Xs[1]["age"].median() # Replace age 0

# Fix late payments vars
Xs[1].loc[:,'NumberOfTime30-59DaysPastDueNotWorse'] = Xs[1].loc[:,'NumberOfTime30-59DaysPastDueNotWorse'].replace(
    {96: Xs[1]["NumberOfTime30-59DaysPastDueNotWorse"].median(),
        98: Xs[1]["NumberOfTime30-59DaysPastDueNotWorse"].median()})
Xs[1].loc[:,"NumberOfTime60-89DaysPastDueNotWorse"] = Xs[1].loc[:,"NumberOfTime60-89DaysPastDueNotWorse"].replace(
    {96: Xs[1]["NumberOfTime60-89DaysPastDueNotWorse"].median(),
     98: Xs[1]["NumberOfTime60-89DaysPastDueNotWorse"].median()})
Xs[1].loc[:,"NumberOfTimes90DaysLate"] = Xs[1].loc[:,"NumberOfTimes90DaysLate"].replace(
    {96: Xs[1]["NumberOfTimes90DaysLate"].median(),
     98: Xs[1]["NumberOfTimes90DaysLate"].median()})

# Looking for categorical and binary variables
categorical_columns = [[], [], [], []]
binary_columns = [[], [], [], []]
for i, X in enumerate(Xs):
    for j in range(X.shape[1]):
        x = X.iloc[:, j].unique()
        if len(x) < 13 and len(x) > 2:
            categorical_columns[i].append(j)
        elif len(x) == 2:
            binary_columns[i].append(j)

# Find numeric features
not_numeric_columns = [[], [], [], []]
for i in range(4):
    not_numeric_columns[i] = categorical_columns[i] + binary_columns[i]
numeric_columns_index = [[], [], [], []]
for i, X in enumerate(Xs):
    numeric_columns_index[i] = [x for x in range(
        X.shape[1]) if x not in not_numeric_columns[i]]


Xs[2].iloc[:, binary_columns[2]] = pd.get_dummies(
    Xs[2].iloc[:, binary_columns[2]], drop_first=True, dtype=int).iloc[:, [1, 0, 2, 3]]

Xs[2].iloc[:, 17] = Xs[2].iloc[:, 17].replace({1: 0, 2: 1})

In [None]:
# Transforming the data
ct0 = ColumnTransformer([
    ("standardised", StandardScaler(), ['A2']),
    ("robust", RobustScaler(), ['A3', 'A5', 'A7', 'A10', 'A13', 'A14']),
    ("categorical", OneHotEncoder(
        handle_unknown='ignore'), ['A4', 'A6', 'A12'])
])

impute_and_scale = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", RobustScaler())
])

ct1 = ColumnTransformer([
    ("impute_and_scale", impute_and_scale, ['MonthlyIncome', 'NumberOfDependents']),
    ("standardised", StandardScaler(), ["age"]),
    ("robust", RobustScaler(), ['RevolvingUtilizationOfUnsecuredLines',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'NumberOfOpenCreditLinesAndLoans',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'])
])

to_dense_transformer = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True) # Convert sparse matrix to dense
ct2 = ColumnTransformer([
    ("standardised", StandardScaler(), ["Age"]),
    ("robust", RobustScaler(), ['Duration', 'Credit_amount']),
    ("categorical", OneHotEncoder(
        handle_unknown='ignore'), categorical_columns[2])
])

ct3 = ColumnTransformer([
    ("standardised", StandardScaler(), ["AGE"]),
    ("robust", RobustScaler(), ['LIMIT_BAL', 'BILL_AMT1', 'PAY_AMT1',
                                'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                                'PAY_AMT6']),
    ("categorical", OneHotEncoder(
        handle_unknown='ignore'), ['EDUCATION', 'MARRIAGE', 'PAY_0'])
])

In [4]:
skf_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=43)

In [None]:
# SVM pipelines
Pipeline0_svm = Pipeline(
    steps=[
        ("preprocessor", ct0),
        ("classifier", SVC(C=10, max_iter=10000, probability=True, random_state=7)),
    ]
)
Pipeline1_svm = Pipeline(
    steps=[
        ("preprocessor", ct1),
        ("classifier", LinearSVC(C=1, max_iter=10000, random_state=7)),
    ]
)
Pipeline2_svm = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("classifier", SVC(C=10, max_iter=10000, probability=True, random_state=7)),
    ]
)
Pipeline3_svm = Pipeline(
    steps=[
        ("preprocessor", ct3),
        ("classifier", LinearSVC(C=0.1, max_iter=10000, random_state=7)),
    ]
)
# dt pipelines
Pipeline0_dt = Pipeline(
    steps=[
        ("preprocessor", ct0),
        (
            "classifier",
            DecisionTreeClassifier(max_depth=10, max_features="sqrt", min_samples_leaf=6, random_state=7),
        ),
    ]
)
Pipeline1_dt = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            DecisionTreeClassifier(max_features="sqrt", min_samples_leaf=4, random_state=7),
        ),
    ]
)
Pipeline2_dt = Pipeline(
    steps=[
        ("preprocessor", ct2),
        (
            "classifier",
            DecisionTreeClassifier(max_depth=5, max_features="sqrt", min_samples_leaf=4, random_state=7),
        ),
    ]
)
Pipeline3_dt = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            DecisionTreeClassifier(max_depth=10, max_features="sqrt", random_state=7),
        ),
    ]
)

# Knn pipelines
Pipeline0_knn = Pipeline(
    steps=[
        ("preprocessor", ct0),
        ("classifier", KNeighborsClassifier(n_neighbors=15, weights="distance")),
    ]
)
Pipeline1_knn = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            KNeighborsClassifier(leaf_size=40, n_neighbors=3, weights="distance"),
        ),
    ]
)
Pipeline2_knn = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("classifier", KNeighborsClassifier(weights="distance")),
    ]
)
Pipeline3_knn = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            KNeighborsClassifier(leaf_size=40, n_neighbors=10, weights="distance"),
        ),
    ]
)

# RF pipelines
Pipeline0_rf = Pipeline(
    steps=[
        ("preprocessor", ct0),
        (
            "classifier",
            RandomForestClassifier(max_depth=20, max_features="log2", min_samples_split=10, random_state=7),
        ),
    ]
)
Pipeline1_rf = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            RandomForestClassifier(max_depth=20, max_features="log2", min_samples_split=5, random_state=7),
        ),
    ]
)
Pipeline2_rf = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("classifier", RandomForestClassifier(random_state=7)),
    ]
)
Pipeline3_rf = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            RandomForestClassifier(max_depth=30, min_samples_split=5, random_state=7),
        ),
    ]
)

# LR pipelines
Pipeline0_lr = Pipeline(
    steps=[
        ("preprocessor", ct0),
        ("classifier", LogisticRegression(C=1, solver="liblinear")),
    ]
)
Pipeline1_lr = Pipeline(
    steps=[
        ("preprocessor", ct1),
        ("classifier", LogisticRegression(C=1, solver="newton-cg")),
    ]
)
Pipeline2_lr = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("classifier", LogisticRegression(C=1, solver="newton-cg")),
    ]
)
Pipeline3_lr = Pipeline(
    steps=[
        ("preprocessor", ct3),
        ("classifier", LogisticRegression(C=1, solver="newton-cg")),
    ]
)

# MLP pipelines
Pipeline0_mlp = Pipeline(
    steps=[
        ("preprocessor", ct0),
        (
            "classifier",
            MLPClassifier(
                early_stopping=True,
                hidden_layer_sizes=(200, 100),
                max_iter=2000,
                random_state=7,
            ),
        ),
    ]
)
Pipeline1_mlp = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            MLPClassifier(
                activation="logistic",
                early_stopping=True,
                hidden_layer_sizes=(200,),
                max_iter=2000,
                random_state=7,
            ),
        ),
    ]
)
Pipeline2_mlp = Pipeline(
    steps=[
        ("preprocessor", ct2),
        (
            "classifier",
            MLPClassifier(
                activation="tanh",
                hidden_layer_sizes=(200, 100),
                max_iter=2000,
                random_state=7,
                solver="lbfgs",
            ),
        ),
    ]
)
Pipeline3_mlp = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            MLPClassifier(early_stopping=True, max_iter=2000, random_state=7),
        ),
    ]
)

# lda pipelines
Pipeline0_lda = Pipeline(steps=[("preprocessor", ct0), ("classifier", LinearDiscriminantAnalysis())])
Pipeline1_lda = Pipeline(steps=[("preprocessor", ct1), ("classifier", LinearDiscriminantAnalysis())])
Pipeline2_lda = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("to_dense", to_dense_transformer),
        ("classifier", LinearDiscriminantAnalysis()),
    ]
)
Pipeline3_lda = Pipeline(steps=[("preprocessor", ct3), ("classifier", LinearDiscriminantAnalysis())])


In [9]:
# Final training and tests

def ks_statistic(y_true, y_pred_prob):
    y_true = y_true.astype(int)
    return ks_2samp(y_pred_prob[y_true == 1], y_pred_prob[y_true == 0]).statistic


ks_scorer = make_scorer(ks_statistic, response_method='predict_proba')


metrics = {'accuracy': 'accuracy', 'precision': 'precision',
           'recall': 'recall', 'ks': ks_scorer, "kappa": make_scorer(cohen_kappa_score), 'roc_auc': 'roc_auc'}

def cv_pipelines(pipelines, X, y, metrics, cv):
    results = []
    result_means = []
    for pipeline in pipelines:
        result = cross_validate(
            pipeline, X, y, scoring=metrics, cv=cv, return_train_score=True, n_jobs=-1)
        results.append(pd.DataFrame(result))
        result_means.append(np.mean(pd.DataFrame(result), axis=0))
    return (results, result_means)
    
pipelines = {
0 : [Pipeline0_lr, Pipeline0_rf, Pipeline0_lda, Pipeline0_knn,
                Pipeline0_mlp, Pipeline0_dt, Pipeline0_svm],
1 : [Pipeline1_lr, Pipeline1_rf, Pipeline1_lda, Pipeline1_knn,
                  Pipeline1_mlp, Pipeline1_dt],
2 : [Pipeline2_lr, Pipeline2_rf, Pipeline2_lda, Pipeline2_knn,
                           Pipeline2_mlp, Pipeline2_dt, Pipeline2_svm],
3 : [Pipeline3_lr, Pipeline3_rf, Pipeline3_lda, Pipeline3_knn,
                             Pipeline3_mlp, Pipeline3_dt]
}

result0, result_mean0 = cv_pipelines(pipelines[0], Xs[0], ys[0], metrics, cv=skf_outer)
result1, result_mean1 = cv_pipelines(pipelines[1], Xs[1], ys[1], metrics, cv=skf_outer)
result2, result_mean2 = cv_pipelines(pipelines[2], Xs[2], ys[2], metrics, cv=skf_outer)
result3, result_mean3 = cv_pipelines(pipelines[3], Xs[3], ys[3], metrics, cv=skf_outer)

pd.concat([pd.DataFrame(result_mean0),
           pd.DataFrame(result_mean1),
           pd.DataFrame(result_mean2),
           pd.DataFrame(result_mean3)]).to_csv('consolidated_results.csv')


In [10]:
def ks_statistic(y_true, y_pred_prob):
    y_true = y_true.astype(int)
    return ks_2samp(y_pred_prob[y_true == 1], y_pred_prob[y_true == 0]).statistic


ks_scorer = make_scorer(ks_statistic, response_method='decision_function')

metrics = {'accuracy': 'accuracy', 'precision': 'precision',
           'recall': 'recall', 'ks': ks_scorer, "kappa": make_scorer(cohen_kappa_score), 'roc_auc': 'roc_auc'}

svm1_res = cross_validate(Pipeline1_svm, Xs[1], ys[1], scoring=metrics, cv=skf_outer, return_train_score=True, n_jobs=-1)
svm3_res = cross_validate(Pipeline3_svm, Xs[3], ys[3], scoring=metrics, cv=skf_outer, return_train_score=True, n_jobs=-1)


In [11]:
pd.concat([pd.DataFrame(svm1_res).mean(axis=0), pd.DataFrame(svm3_res).mean(axis=0)]).to_excel("svm.xlsx")

In [12]:
from scipy.stats import friedmanchisquare
ac_ranks = [3.55,1.45,5.55,5.45,2.7,6.7,2.6]
gmsc_ranks = [5.4,2.4,4.2,4.85,3.1,2.95,3.85]
gc_ranks = [2.7,2.8,2.9,5.45,4.35,7,2.8]
tc_ranks = [3.2,2.05,3.6,6.7,3.25,6.3,2.9]

res = friedmanchisquare(ac_ranks, gmsc_ranks, gc_ranks, tc_ranks)
if res.pvalue < 0.05:
    print(f"The X2 statistic is {res.statistic:.2f} with a p value = {res.pvalue:.2f}, we reject H0")
else:
    print(f"The X2 statistic is {res.statistic:.2f} with a p value = {res.pvalue:.2f}, we fail to reject H0")

The X2 statistic is 0.48 with a p value = 0.92, we fail to reject H0
