In [1]:
import pandas as pd
import numpy as np
# Preparation and preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
# Pipeline
from sklearn.pipeline import Pipeline
# Models
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC

# Performance evaluation
from sklearn.model_selection import cross_val_score, StratifiedKFold, cross_validate, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, make_scorer
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score
from scipy.stats import ks_2samp


In [3]:
df1 = pd.read_csv("datasets/australian_credit.csv")
df2 = pd.read_csv("datasets/GMSC/cs-training.csv")
df3 = pd.read_csv("datasets/german_credit.csv")
df4 = pd.read_csv("datasets/UCI_Credit_Card.csv")

datasets = [df1, df2, df3, df4]

# split each dataset into X and y
Xs = []
ys = []

for dataset in datasets:
    Xs.append(dataset.iloc[:, :-1])
    ys.append(dataset.iloc[:, -1])

ys[2] = ys[2].replace({1: 0, 2: 1})

Xs[3].iloc[:, 2] = Xs[3].iloc[:, 2].replace({1: 0, 2: 1})
to_drop = Xs[3].columns[[0, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]]
Xs[3] = Xs[3].drop(to_drop, axis = 1)

Xs[1].loc[65695, "age"] = Xs[1]["age"].median() # Replace age 0

# Fix late payments vars
Xs[1].loc[:,'NumberOfTime30-59DaysPastDueNotWorse'] = Xs[1].loc[:,'NumberOfTime30-59DaysPastDueNotWorse'].replace(
    {96: Xs[1]["NumberOfTime30-59DaysPastDueNotWorse"].median(),
        98: Xs[1]["NumberOfTime30-59DaysPastDueNotWorse"].median()})
Xs[1].loc[:,"NumberOfTime60-89DaysPastDueNotWorse"] = Xs[1].loc[:,"NumberOfTime60-89DaysPastDueNotWorse"].replace(
    {96: Xs[1]["NumberOfTime60-89DaysPastDueNotWorse"].median(),
     98: Xs[1]["NumberOfTime60-89DaysPastDueNotWorse"].median()})
Xs[1].loc[:,"NumberOfTimes90DaysLate"] = Xs[1].loc[:,"NumberOfTimes90DaysLate"].replace(
    {96: Xs[1]["NumberOfTimes90DaysLate"].median(),
     98: Xs[1]["NumberOfTimes90DaysLate"].median()})

# Looking for categorical and binary variables
categorical_columns = [[], [], [], []]
binary_columns = [[], [], [], []]
for i, X in enumerate(Xs):
    for j in range(X.shape[1]):
        x = X.iloc[:, j].unique()
        if len(x) < 13 and len(x) > 2:
            categorical_columns[i].append(j)
        elif len(x) == 2:
            binary_columns[i].append(j)

# Find numeric features
not_numeric_columns = [[], [], [], []]
for i in range(4):
    not_numeric_columns[i] = categorical_columns[i] + binary_columns[i]
numeric_columns_index = [[], [], [], []]
for i, X in enumerate(Xs):
    numeric_columns_index[i] = [x for x in range(
        X.shape[1]) if x not in not_numeric_columns[i]]


Xs[2].iloc[:, binary_columns[2]] = pd.get_dummies(
    Xs[2].iloc[:, binary_columns[2]], drop_first=True, dtype=int).iloc[:, [1, 0, 2, 3]]

Xs[2].iloc[:, 17] = Xs[2].iloc[:, 17].replace({1: 0, 2: 1})

In [4]:
# Transforming the data
ct0 = ColumnTransformer([
    ("standardised", StandardScaler(), ['A2']),
    ("robust", RobustScaler(), ['A3', 'A5', 'A7', 'A10', 'A13', 'A14']),
    ("categorical", OneHotEncoder(
        handle_unknown='ignore'), ['A4', 'A6', 'A12'])
])

impute_and_scale = Pipeline([
    ("imputer", SimpleImputer(strategy='median')),
    ("scaler", RobustScaler())
])

ct1 = ColumnTransformer([
    ("impute_and_scale", impute_and_scale, ['MonthlyIncome', 'NumberOfDependents']),
    ("standardised", StandardScaler(), ["age"]),
    ("robust", RobustScaler(), ['RevolvingUtilizationOfUnsecuredLines',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'NumberOfOpenCreditLinesAndLoans',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfTimes90DaysLate'])
])

to_dense_transformer = FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)
ct2 = ColumnTransformer([
    ("standardised", StandardScaler(), ["Age"]),
    ("robust", RobustScaler(), ['Duration', 'Credit_amount']),
    ("categorical", OneHotEncoder(
        handle_unknown='ignore'), categorical_columns[2])
])

ct3 = ColumnTransformer([
    ("standardised", StandardScaler(), ["AGE"]),
    ("robust", RobustScaler(), ['LIMIT_BAL', 'BILL_AMT1', 'PAY_AMT1',
                                'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
                                'PAY_AMT6']),
    ("categorical", OneHotEncoder(
        handle_unknown='ignore'), ['EDUCATION', 'MARRIAGE', 'PAY_0'])
])

Set stratified kfold partitions for testing and validation

In [7]:
skf_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
skf_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=43)

For mlp do a random grid search

In [5]:
# MLP tuning
param_grid = {
    'classifier__hidden_layer_sizes': [(100,), (200,), (200, 100, ), (100, 50, )],
    'classifier__activation': ['logistic', 'tanh', 'relu'],
    'classifier__max_iter': [2000, 2500]
}

In [8]:
Pipeline1_mlp = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                        MLPClassifier(random_state=7, early_stopping=True))])
gmsc_mlp_hp = RandomizedSearchCV(Pipeline1_mlp, param_grid, cv=skf_inner, scoring="f1", n_jobs=-1).fit(Xs[1], ys[1])
print(gmsc_mlp_hp.best_estimator_)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('impute_and_scale',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   RobustScaler())]),
                                                  ['MonthlyIncome',
                                                   'NumberOfDependents']),
                                                 ('standardised',
                                                  StandardScaler(), ['age']),
                                                 ('robust', RobustScaler(),
                                                  ['RevolvingUtilizationOfUnsecuredLines',
                                                   'NumberOfTime30-59DaysPastDueNotWorse',
         

In [9]:
Pipeline0_mlp = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                        MLPClassifier(random_state=7, early_stopping=True))])
ac_mlp_hp = GridSearchCV(Pipeline0_mlp, param_grid, cv=skf_inner, scoring="f1", n_jobs=-1).fit(Xs[0], ys[0])
print(ac_mlp_hp.best_estimator_)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['A2']),
                                                 ('robust', RobustScaler(),
                                                  ['A3', 'A5', 'A7', 'A10',
                                                   'A13', 'A14']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['A4', 'A6', 'A12'])])),
                ('classifier',
                 MLPClassifier(early_stopping=True,
                               hidden_layer_sizes=(200, 100), max_iter=2000,
                               random_state=7))])


In [10]:
Pipeline2_mlp = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                        MLPClassifier(random_state=7, early_stopping=True))])
gc_mlp_hp = GridSearchCV(Pipeline2_mlp, param_grid, cv=skf_inner, scoring="f1", n_jobs=-1).fit(Xs[2], ys[2])
print(gc_mlp_hp.best_estimator_)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['Age']),
                                                 ('robust', RobustScaler(),
                                                  ['Duration',
                                                   'Credit_amount']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  [0, 2, 3, 5, 6, 7, 9, 10, 11,
                                                   13, 14, 15, 16])])),
                ('classifier',
                 MLPClassifier(activation='tanh', early_stopping=True,
                               hidden_layer_sizes=(200, 100), max_iter=2000,
                               random_state=7))])


In [11]:
Pipeline3_mlp = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                        MLPClassifier(random_state=7, early_stopping=True))])
tc_mlp_hp = RandomizedSearchCV(Pipeline3_mlp, param_grid, cv=skf_inner, scoring="f1", n_jobs=-1).fit(Xs[3], ys[3])
print(tc_mlp_hp.best_estimator_)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['AGE']),
                                                 ('robust', RobustScaler(),
                                                  ['LIMIT_BAL', 'BILL_AMT1',
                                                   'PAY_AMT1', 'PAY_AMT2',
                                                   'PAY_AMT3', 'PAY_AMT4',
                                                   'PAY_AMT5', 'PAY_AMT6']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['EDUCATION', 'MARRIAGE',
                                                   'PAY_0'])])),
                ('classifier',
                 MLPClassifier(early_stopping=True, max_iter=2000,
                               random_state=7))])

Using the best parameters from the randomized search, configure the MLPs

In [None]:
Pipeline0_mlp = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                         MLPClassifier(early_stopping=True,
                                                                        hidden_layer_sizes=(200, 100), max_iter=2000,
                                                                        random_state=7))])
Pipeline1_mlp = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                        MLPClassifier(activation='logistic', early_stopping=True,
                                                                        hidden_layer_sizes=(200,), max_iter=2000,
                                                                        random_state=7))])
Pipeline2_mlp = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                        MLPClassifier(activation='tanh', hidden_layer_sizes=(200, 100),
                                                                        max_iter=2000, random_state=7,
                                                                        solver='lbfgs'))])
Pipeline3_mlp = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                        MLPClassifier(early_stopping=True, max_iter=2000,
                                                                        random_state=7))])

Tuning LogisticRegression

In [12]:
Pipeline0_lr = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                       LogisticRegression())])
Pipeline1_lr = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                       LogisticRegression())])
Pipeline2_lr = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                       LogisticRegression())])
Pipeline3_lr = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                       LogisticRegression())])

In [14]:
# Logistic Regression tuning
param_grid = {
    'classifier__C': [1, 5, 10],
    'classifier__solver': ['liblinear', 'newton-cg']
}

lr_pipelines = [Pipeline0_lr, Pipeline1_lr, Pipeline2_lr, Pipeline3_lr]
for i, X in enumerate(Xs):
    pipeline = lr_pipelines[i]
    
    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, scoring="f1", cv=skf_inner, n_jobs=-1)
    best_model = grid_search.fit(X, ys[i]).best_estimator_
    
    # Print the best estimator for the current dataset
    print(f"Best model for dataset {i}: {best_model}")


Best model for dataset 0: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['A2']),
                                                 ('robust', RobustScaler(),
                                                  ['A3', 'A5', 'A7', 'A10',
                                                   'A13', 'A14']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['A4', 'A6', 'A12'])])),
                ('classifier', LogisticRegression(C=1, solver='liblinear'))])
Best model for dataset 1: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('impute_and_scale',
                                                  Pipeline(steps=[('imputer',
                                                                

Results

In [None]:
Pipeline0_lr = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                       LogisticRegression(C=1, solver='liblinear'))])
Pipeline1_lr = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                       LogisticRegression(C=1, solver='newton-cg'))])
Pipeline2_lr = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                       LogisticRegression(C=1, solver='newton-cg'))])
Pipeline3_lr = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                       LogisticRegression(C=1, solver='newton-cg'))])

Tuning for Random Forest

In [16]:
Pipeline0_rf = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                        RandomForestClassifier(random_state=7))])
Pipeline1_rf = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                       RandomForestClassifier(random_state=7))])
Pipeline2_rf = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                       RandomForestClassifier(random_state=7))])
Pipeline3_rf = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                       RandomForestClassifier(random_state=7))])

In [17]:
# Random Forest tuning

param_grid = {
    'classifier__max_depth': [None, 10, 20, 30],       
    'classifier__min_samples_split': [2, 5, 10],        
    'classifier__min_samples_leaf': [1, 2, 4], 
    'classifier__max_features': ['sqrt', 'log2'],          
}

rf_pipelines = [Pipeline0_rf, Pipeline1_rf, Pipeline2_rf, Pipeline3_rf]

for i, X in enumerate(Xs):
    pipeline = rf_pipelines[i]
    
    # Perform grid search
    random_search = RandomizedSearchCV(pipeline, param_grid, scoring="f1", cv=skf_inner, verbose=1, n_jobs=-1)
    best_model = random_search.fit(X, ys[i]).best_estimator_
    
    # Print the best estimator for the current dataset
    print(f"Best model for dataset {i}: {best_model}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best model for dataset 0: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['A2']),
                                                 ('robust', RobustScaler(),
                                                  ['A3', 'A5', 'A7', 'A10',
                                                   'A13', 'A14']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['A4', 'A6', 'A12'])])),
                ('classifier',
                 RandomForestClassifier(max_depth=20, max_features='log2',
                                        min_samples_split=10,
                                        random_state=7))])
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
Pipeline0_rf = Pipeline(
    steps=[
        ("preprocessor", ct0),
        (
            "classifier",
            RandomForestClassifier(
                max_depth=20, max_features="log2", min_samples_split=10, random_state=7
            ),
        ),
    ]
)
Pipeline1_rf = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            RandomForestClassifier(
                max_depth=20, max_features="log2", min_samples_split=5, random_state=7
            ),
        ),
    ]
)
Pipeline2_rf = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("classifier", RandomForestClassifier(random_state=7)),
    ]
)
Pipeline3_rf = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            RandomForestClassifier(max_depth=30, min_samples_split=5, random_state=7),
        ),
    ]
)

Tuning for KNN

In [18]:
Pipeline0_knn = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                        KNeighborsClassifier())])
Pipeline1_knn = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                        KNeighborsClassifier())])
Pipeline2_knn = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                        KNeighborsClassifier())])
Pipeline3_knn = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                        KNeighborsClassifier())])

In [19]:
# knn tuning

param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 10, 15],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__leaf_size': [20, 30, 40],
}

knn_pipelines = [Pipeline0_knn, Pipeline1_knn, Pipeline2_knn, Pipeline3_knn]

for i, X in enumerate(Xs):
    pipeline = knn_pipelines[i]
    
    # Perform grid search
    random_search = RandomizedSearchCV(pipeline, param_grid, scoring="f1", cv=skf_inner, verbose=1, n_jobs=-1)
    best_model = random_search.fit(X, ys[i]).best_estimator_
    
    # Print the best estimator for the current dataset
    print(f"Best model for dataset {i}: {best_model}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best model for dataset 0: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['A2']),
                                                 ('robust', RobustScaler(),
                                                  ['A3', 'A5', 'A7', 'A10',
                                                   'A13', 'A14']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['A4', 'A6', 'A12'])])),
                ('classifier',
                 KNeighborsClassifier(n_neighbors=15, weights='distance'))])
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best model for dataset 1: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('impute_an

In [None]:
Pipeline0_knn = Pipeline(
    steps=[
        ("preprocessor", ct0),
        ("classifier", KNeighborsClassifier(n_neighbors=15, weights="distance")),
    ]
)
Pipeline1_knn = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            KNeighborsClassifier(leaf_size=40, n_neighbors=3, weights="distance"),
        ),
    ]
)
Pipeline2_knn = Pipeline(
    steps=[
        ("preprocessor", ct2),
        ("classifier", KNeighborsClassifier(weights="distance")),
    ]
)
Pipeline3_knn = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            KNeighborsClassifier(leaf_size=40, n_neighbors=10, weights="distance"),
        ),
    ]
)

Decision Trees

In [20]:
Pipeline0_dt = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                       DecisionTreeClassifier(random_state=7))])
Pipeline1_dt = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                       DecisionTreeClassifier(random_state=7))])
Pipeline2_dt = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                       DecisionTreeClassifier(random_state=7))])
Pipeline3_dt = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                       DecisionTreeClassifier(random_state=7))])

In [22]:
# dt tuning

param_grid = {
    'classifier__max_depth': [None, 5, 10, 15], 
    'classifier__min_samples_leaf': [1, 2, 4, 6],
    'classifier__max_features': ['sqrt', 'log2']
}

dt_pipelines = [Pipeline0_dt, Pipeline1_dt, Pipeline2_dt, Pipeline3_dt]

for i in range(4):
    grid_search = GridSearchCV(dt_pipelines[i], param_grid, scoring="f1", cv=skf_inner, verbose=1, n_jobs=-1)
    best_estim = grid_search.fit(Xs[i], ys[i]).best_estimator_
    print(best_estim)
    dt_pipelines[i] = best_estim
Pipeline0_dt, Pipeline1_dt, Pipeline2_dt, Pipeline3_dt = dt_pipelines

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['A2']),
                                                 ('robust', RobustScaler(),
                                                  ['A3', 'A5', 'A7', 'A10',
                                                   'A13', 'A14']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['A4', 'A6', 'A12'])])),
                ('classifier',
                 DecisionTreeClassifier(max_depth=10, max_features='sqrt',
                                        min_samples_leaf=6, random_state=7))])
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Pipeline(steps=[('preprocessor',
                 ColumnTransforme

In [None]:
Pipeline0_dt = Pipeline(
    steps=[
        ("preprocessor", ct0),
        (
            "classifier",
            DecisionTreeClassifier(
                max_depth=10, max_features="sqrt", min_samples_leaf=6, random_state=7
            ),
        ),
    ]
)
Pipeline1_dt = Pipeline(
    steps=[
        ("preprocessor", ct1),
        (
            "classifier",
            DecisionTreeClassifier(
                max_features="sqrt", min_samples_leaf=4, random_state=7
            ),
        ),
    ]
)
Pipeline2_dt = Pipeline(
    steps=[
        ("preprocessor", ct2),
        (
            "classifier",
            DecisionTreeClassifier(
                max_depth=5, max_features="sqrt", min_samples_leaf=4, random_state=7
            ),
        ),
    ]
)
Pipeline3_dt = Pipeline(
    steps=[
        ("preprocessor", ct3),
        (
            "classifier",
            DecisionTreeClassifier(max_depth=10, max_features="sqrt", random_state=7),
        ),
    ]
)

In [23]:
# svm tuning

param_grid = {
    'classifier__C': [0.1, 1, 10, 100] # Reg param
}
Pipeline0_svm = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                        SVC(random_state=7, max_iter=10000, probability=True))])
Pipeline1_svm = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                        LinearSVC(random_state=7, max_iter=10000))])
Pipeline2_svm = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                        SVC(random_state=7, max_iter=10000, probability=True))])
Pipeline3_svm = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                        LinearSVC(random_state=7, max_iter=10000))])

svm_pipelines = [Pipeline0_svm, Pipeline1_svm, Pipeline2_svm, Pipeline3_svm]

for i in range(4):
    grid_search = GridSearchCV(svm_pipelines[i], param_grid, scoring="f1", cv=skf_inner, verbose=1, n_jobs=-1)
    best_estim = grid_search.fit(Xs[i], ys[i]).best_estimator_
    print(best_estim)
    svm_pipelines[i] = best_estim
Pipeline0_svm, Pipeline1_svm, Pipeline2_svm, Pipeline3_svm = svm_pipelines


Fitting 5 folds for each of 4 candidates, totalling 20 fits
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('standardised',
                                                  StandardScaler(), ['A2']),
                                                 ('robust', RobustScaler(),
                                                  ['A3', 'A5', 'A7', 'A10',
                                                   'A13', 'A14']),
                                                 ('categorical',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['A4', 'A6', 'A12'])])),
                ('classifier',
                 SVC(C=10, max_iter=10000, probability=True, random_state=7))])
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('impute_and_scale',
                                         

In [None]:
# SVM pipelines
Pipeline0_svm = Pipeline(steps=[("preprocessor", ct0), ("classifier",
                                                        SVC(C=10, max_iter=10000, probability=True, random_state=7))])
Pipeline1_svm = Pipeline(steps=[("preprocessor", ct1), ("classifier",
                                                        LinearSVC(C=1, max_iter=10000, random_state=7))])
Pipeline2_svm = Pipeline(steps=[("preprocessor", ct2), ("classifier",
                                                        SVC(C=10, max_iter=10000, probability=True, random_state=7))])
Pipeline3_svm = Pipeline(steps=[("preprocessor", ct3), ("classifier",
                                                        LinearSVC(C=0.1, max_iter=10000, random_state=7))])