### Data preparation

In [59]:
# Some required libraries 
import csv
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import random
import warnings

# Silence every warning raised by the libraries used below (this also hides
# convergence and deprecation warnings, so re-enable them when debugging).
warnings.filterwarnings('ignore')

# Seed both the standard-library RNG and NumPy's global RNG. Seeding only
# `random` is not enough for reproducibility: scikit-learn objects created
# with the default random_state=None draw from NumPy's global generator.
random.seed(21121)
np.random.seed(21121)

# A first function to load a dataset from a CSV file
def loadCsv(path):
    """Read a comma-separated file into a 2-D NumPy array of strings.

    Parameters
    ----------
    path : str
        Location of the CSV file. No header handling is done: every
        non-blank line becomes one row of the result.

    Returns
    -------
    data : numpy.ndarray
        Array of shape ``(n, d)`` holding the raw string fields.
    n : int
        Number of rows.
    d : int
        Number of columns.
    """
    # newline='' is what the csv module requires so that line breaks inside
    # quoted fields are preserved instead of being translated by the file
    # object.
    with open(path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        # Blank lines are yielded as empty lists; dropping them keeps the
        # rows rectangular (e.g. when the file ends with an empty line).
        rows = [row for row in reader if row]
    if not rows:
        # An empty file would otherwise give a 1-D array and break the
        # (n, d) unpacking below.
        data = np.empty((0, 0), dtype=str)
    else:
        data = np.array(rows)
    (n, d) = data.shape
    return data, n, d

# Encode Categorical variables
def oneHotEncodeColumns(data, columnsCategories):
    """One-hot encode the selected columns and return an all-float matrix.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array; it is indexed as ``data[:, columns]``.
    columnsCategories : sequence of int
        Indices of the columns to one-hot encode.

    Returns
    -------
    numpy.ndarray
        Float array made of the remaining columns (in their original order)
        followed by the one-hot encoded columns.
    """
    dataCategories = data[:, columnsCategories]
    # Use the encoder's default (sparse) output and densify it here. The
    # keyword that requested a dense result was renamed from `sparse` to
    # `sparse_output` across scikit-learn releases, so passing either one
    # breaks on some versions.
    dataEncoded = OneHotEncoder().fit_transform(dataCategories)
    if hasattr(dataEncoded, "toarray"):
        dataEncoded = dataEncoded.toarray()
    categorical = set(columnsCategories)
    columnsNumerical = [i for i in range(data.shape[1])
                        if i not in categorical]
    dataNumerical = data[:, columnsNumerical]
    return np.hstack((dataNumerical, dataEncoded)).astype(float)

# Another function to prepare the data
def data_recovery(dataset):
    """Load one benchmark dataset as a binary classification problem.

    Each branch reads a file from the ``datasets/`` directory, separates the
    label column from the features and maps the labels to 0/1 (1 being the
    class singled out for that dataset).

    Parameters
    ----------
    dataset : str
        One of: 'abalone8', 'abalone17', 'abalone20', 'autompg',
        'australian', 'balance', 'bankmarketing', 'bupa', 'german', 'glass',
        'hayes', 'heart', 'iono', 'libras', 'newthyroid', 'pageblocks',
        'pima', 'satimage', 'segmentation', 'sonar', 'spambase', 'splice',
        'vehicle', 'wdbc', 'wine', 'wine4', 'yeast3', 'yeast6'.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix.
    y : numpy.ndarray
        Integer labels in {0, 1}.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    if dataset in ['abalone8', 'abalone17', 'abalone20']:
        data = pd.read_csv("datasets/abalone.data", header=None)
        # Dummy-encode the non-numeric column(s); numeric columns keep their
        # integer labels, so column 8 can still be addressed below.
        data = pd.get_dummies(data, dtype=float)
        # The positive class is the number that follows 'abalone' in the
        # dataset name (8, 17 or 20).
        positive = int(dataset[len('abalone'):])
        y = np.array([1 if elt == positive else 0 for elt in data[8]])
        X = np.array(data.drop([8], axis=1))
    elif dataset == 'autompg':
        data = pd.read_csv("datasets/auto-mpg.data", header=None, sep=r'\s+')
        # '?' marks a missing value: drop the incomplete rows.
        data = data.replace('?', np.nan)
        data = data.dropna()
        # Column 8 is dropped before the float conversion.
        data = data.drop([8], axis=1)
        data = data.astype(float)
        y = np.array([1 if elt in [2, 3] else 0 for elt in data[7]])
        X = np.array(data.drop([7], axis=1))
    elif dataset == 'australian':
        data, n, d = loadCsv('datasets/australian.data')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1].astype(int)
        y[y != 1] = 0
    elif dataset == 'balance':
        data = pd.read_csv("datasets/balance-scale.data", header=None)
        y = np.array([1 if elt in ['L'] else 0 for elt in data[0]])
        X = np.array(data.drop([0], axis=1))
    elif dataset == 'bankmarketing':
        data, n, d = loadCsv('datasets/bankmarketing.csv')
        X = data[:, np.arange(0, d-1)]
        # These column indices are one-hot encoded; the others are cast to
        # float as they are.
        X = oneHotEncodeColumns(X, [1, 2, 3, 4, 6, 7, 8, 10, 15])
        y = data[:, d-1]
        y[y == "no"] = "0"
        y[y == "yes"] = "1"
        y = y.astype(int)
    elif dataset == 'bupa':
        data, n, d = loadCsv('datasets/bupa.dat')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1].astype(int)
        y[y != 1] = 0
    elif dataset == 'german':
        data = pd.read_csv("datasets/german.data-numeric", header=None,
                           sep=r'\s+')
        y = np.array([1 if elt == 2 else 0 for elt in data[24]])
        X = np.array(data.drop([24], axis=1))
    elif dataset == 'glass':
        # Column 0 becomes the index, so the remaining labels are 1..10.
        data = pd.read_csv("datasets/glass.data", header=None, index_col=0)
        y = np.array([1 if elt == 1 else 0 for elt in data[10]])
        X = np.array(data.drop([10], axis=1))
    elif dataset == 'hayes':
        data = pd.read_csv("datasets/hayes-roth.data", header=None)
        y = np.array([1 if elt in [3] else 0 for elt in data[5]])
        # Column 0 is excluded from the features along with the label.
        X = np.array(data.drop([0, 5], axis=1))
    elif dataset == 'heart':
        data, n, d = loadCsv('datasets/heart.data')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1]
        y = y.astype(int)
        # Order matters: clear the non-2 labels first, then map 2 -> 1.
        y[y != 2] = 0
        y[y == 2] = 1
    elif dataset == 'iono':
        data = pd.read_csv("datasets/ionosphere.data", header=None)
        y = np.array([1 if elt in ['b'] else 0 for elt in data[34]])
        X = np.array(data.drop([34], axis=1))
    elif dataset == 'libras':
        data = pd.read_csv("datasets/movement_libras.data", header=None)
        y = np.array([1 if elt in [1] else 0 for elt in data[90]])
        X = np.array(data.drop([90], axis=1))
    elif dataset == "newthyroid":
        data, n, d = loadCsv('datasets/newthyroid.dat')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1].astype(int)
        y[y < 2] = 0
        y[y >= 2] = 1
    elif dataset == 'pageblocks':
        data = pd.read_csv("datasets/page-blocks.data", header=None,
                           sep=r'\s+')
        y = np.array([1 if elt in [2, 3, 4, 5] else 0 for elt in data[10]])
        X = np.array(data.drop([10], axis=1))
    elif dataset == 'pima':
        data, n, d = loadCsv('datasets/pima-indians-diabetes.data')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1]
        y[y != '1'] = '0'
        y = y.astype(int)
    elif dataset == 'satimage':
        data, n, d = loadCsv('datasets/satimage.data')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1]
        y = y.astype(int)
        # Order matters: clear the non-4 labels first, then map 4 -> 1.
        y[y != 4] = 0
        y[y == 4] = 1
    elif dataset == 'segmentation':
        data, n, d = loadCsv('datasets/segmentation.data')
        # Here the label is the FIRST column.
        X = data[:, np.arange(1, d)].astype(float)
        y = data[:, 0]
        y[y == "WINDOW"] = '1'
        y[y != '1'] = '0'
        y = y.astype(int)
    elif dataset == "sonar":
        data, n, d = loadCsv('datasets/sonar.dat')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1]
        y[y != 'R'] = '0'
        y[y == 'R'] = '1'
        y = y.astype(int)
    elif dataset == "spambase":
        data, n, d = loadCsv('datasets/spambase.dat')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1].astype(int)
        y[y != 1] = 0
    elif dataset == "splice":
        data, n, d = loadCsv('datasets/splice.data')
        # Here the label is the FIRST column.
        X = data[:, np.arange(1, d)].astype(float)
        y = data[:, 0].astype(int)
        # Swap the labels using 2 as a temporary value: 1 -> 0 and -1 -> 1.
        y[y == 1] = 2
        y[y == -1] = 1
        y[y == 2] = 0
    elif dataset == 'vehicle':
        data, n, d = loadCsv('datasets/vehicle.data')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1]
        y[y != "van"] = '0'
        y[y == "van"] = '1'
        y = y.astype(int)
    elif dataset == 'wdbc':
        data, n, d = loadCsv('datasets/wdbc.dat')
        X = data[:, np.arange(d-1)].astype(float)
        y = data[:, d-1]
        y[y != 'M'] = '0'
        y[y == 'M'] = '1'
        y = y.astype(int)
    elif dataset == 'wine':
        data = pd.read_csv("datasets/wine.data", header=None)
        y = np.array([1 if elt == 1 else 0 for elt in data[0]])
        X = np.array(data.drop([0], axis=1))
    elif dataset == 'wine4':
        data = pd.read_csv("datasets/winequality-red.csv", sep=';')
        y = np.array([1 if elt in [4] else 0 for elt in data.quality])
        X = np.array(data.drop(["quality"], axis=1))
    elif dataset in ['yeast3', 'yeast6']:
        data = pd.read_csv("datasets/yeast.data", header=None, sep=r'\s+')
        # Column 0 is not used as a feature.
        data = data.drop([0], axis=1)
        positive = {'yeast3': 'ME3', 'yeast6': 'EXC'}[dataset]
        y = np.array([1 if elt == positive else 0 for elt in data[9]])
        X = np.array(data.drop([9], axis=1))
    else:
        # Without this branch an unknown name would only fail at the return
        # statement with an unhelpful UnboundLocalError.
        raise ValueError(f"Unknown dataset name: {dataset!r}")
    return X, y

### Data loading

In [60]:
import matplotlib.pyplot as plt

# Names understood by data_recovery(), in the order they are loaded.
dataset_names = [
    'abalone8', 'abalone17', 'abalone20', 'autompg', 'australian',
    'balance', 'bankmarketing', 'bupa', 'german', 'glass', 'hayes',
    'heart', 'iono', 'libras', 'newthyroid', 'pageblocks', 'pima',
    'satimage', 'segmentation', 'sonar', 'spambase', 'splice',
    'vehicle', 'wdbc', 'wine', 'wine4', 'yeast3', 'yeast6',
]

# Column-wise accumulator, turned into a DataFrame once every dataset is read.
datasets = {key: [] for key in ("name", "X", "y", "balanced")}

for dataset_name in dataset_names:
    X, y = data_recovery(dataset_name)
    if dataset_name == "bankmarketing":
        # bankmarketing is subsampled: only every 6th row is kept.
        X, y = X[::6], y[::6]
    # A dataset counts as balanced when the share of positive labels lies
    # strictly between 10% and 90%.
    positive_rate = np.mean(y)
    balanced = positive_rate > 0.1 and positive_rate < 0.9
    datasets["name"].append(dataset_name)
    datasets["X"].append(X)
    datasets["y"].append(y)
    datasets["balanced"].append(balanced)

datasets = pd.DataFrame(datasets)

# plot datasets imbalanced datasets Y distribution
# plt.figure(figsize=(20,10))
# plt.bar(datasets[datasets["balanced"] == False]["name"], datasets[datasets["balanced"] == False]["y"].apply(np.mean))
# plt.xticks(rotation=45)
# plt.ylabel("Y distribution")
# plt.title("Imbalanced datasets Y distribution")
# plt.show()

### Sampling strategy

In [None]:
# import undersampling and oversampling methods
from imblearn.under_sampling import TomekLinks, CondensedNearestNeighbour
from imblearn.over_sampling import BorderlineSMOTE, ADASYN

# Two under-samplers (they shrink the majority class) ...
tomekLink = TomekLinks(sampling_strategy='majority')
cnn = CondensedNearestNeighbour(sampling_strategy='majority')

# ... and two over-samplers (they synthesise minority-class points).
borderSMOTE = BorderlineSMOTE(sampling_strategy='minority')
adaSyn = ADASYN(sampling_strategy='minority')

samplers = {
    "name": ["tomekLink", "cnn", "borderSMOTE", "adaSyn"],
    "sampler": [tomekLink, cnn, borderSMOTE, adaSyn],
}

# Only the datasets flagged as imbalanced are resampled. The selection is
# taken once, before any resampled variant is added.
imbalanced_datasets = datasets[~datasets["balanced"].astype(bool)]

# Apply every sampler to every imbalanced dataset. Rows are collected first
# and concatenated once: this avoids the private DataFrame._append method
# and the full copy of `datasets` it performed on each iteration.
# NOTE(review): resampling happens before the train/test split done later,
# so test sets of the resampled variants contain resampled points - confirm
# this is intended.
resampled = {"name": [], "X": [], "y": [], "balanced": []}
for sampler_name, sampler in zip(samplers["name"], samplers["sampler"]):
    for dataset_name, X, y in zip(imbalanced_datasets["name"],
                                  imbalanced_datasets["X"],
                                  imbalanced_datasets["y"]):
        X_resampled, y_resampled = sampler.fit_resample(X, y)
        resampled["name"].append(dataset_name + "_" + sampler_name)
        resampled["X"].append(X_resampled)
        resampled["y"].append(y_resampled)
        # Resampled variants keep the "imbalanced" flag of their source.
        resampled["balanced"].append(False)

if resampled["name"]:
    datasets = pd.concat([datasets, pd.DataFrame(resampled)],
                         ignore_index=True)



### Data classification

In [None]:
# import the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# import model selection
from sklearn.model_selection import GridSearchCV, train_test_split

# import the metrics
from sklearn.metrics import accuracy_score, f1_score

# import time to measure the time
import time

# Hyper-parameter grids.
# 'linear' and 'poly' used to be fused into the single string
# 'linear, poly', which is not a valid kernel, so neither was ever searched.
# NOTE(review): linear/poly kernels on unscaled features can be slow to fit;
# consider scaling the features if the run time becomes a problem.
svm_params = {'C': [0.1, 1, 10],
              'kernel': ['rbf', 'linear', 'poly', 'sigmoid']}
# Two sub-grids so that only supported combinations are tried:
# 'elasticnet' needs the saga solver, and l1_ratio is only meaningful for
# 'elasticnet'.
clf_params = [
    {'penalty': ['l1', 'l2'], 'solver': ['liblinear', 'saga']},
    {'penalty': ['elasticnet'], 'solver': ['saga'],
     'l1_ratio': [0.5, 0.7, 0.9]},
]
decision_tree_params = {'criterion': ['gini', 'entropy'],
                        'max_depth': [2, 4, 6]}

svm_grid = GridSearchCV(SVC(), svm_params)
clf_grid = GridSearchCV(LogisticRegression(), clf_params)
decision_tree_grid = GridSearchCV(DecisionTreeClassifier(),
                                  decision_tree_params)

clfs = {
    "LogisticRegression": clf_grid,
    "SVM": svm_grid,
    "DecisionTree": decision_tree_grid,
}

# One row per dataset; the string columns are LaTeX snippets of the form
# "$<worst> \pm <best>$" across the three classifiers.
table = {
    "dataset": [],
    "classifier": [],
    "score": [],
    "time": [],
    "balanced": []
}

clf_names = list(clfs)

for d in datasets.to_dict(orient="records"):
    print(d["name"])
    table["dataset"].append(d["name"])
    table["balanced"].append(d["balanced"])
    # Stratify so that the rare class is present in both parts, and fix the
    # seed so that the split is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        d["X"], d["y"], test_size=0.3, stratify=d["y"], random_state=21121)
    scores = []
    times = []
    for key, clf in clfs.items():
        # The measured time covers the whole grid search plus prediction.
        start = time.perf_counter()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        end = time.perf_counter()
        if d["balanced"]:
            score = accuracy_score(y_test, y_pred)
        else:
            # F1 of the positive class. average='micro' on single-label
            # data is identical to accuracy, which defeats the purpose of
            # using F1 on imbalanced datasets.
            score = f1_score(y_test, y_pred)
        scores.append(round(score, 2))
        times.append(round(end - start, 2))
    worst, best = min(scores), max(scores)
    table["score"].append(f"${worst} \\pm {best}$")
    # index() returns the first classifier reaching the worst / best score.
    worst_clf = clf_names[scores.index(worst)]
    best_clf = clf_names[scores.index(best)]
    table["classifier"].append(f"${worst_clf} \\pm {best_clf}$")
    table["time"].append(f"${min(times)} \\pm {max(times)}$")

table = pd.DataFrame(table)
table.to_csv("classifiers_results.csv", sep=';', encoding='utf-8', index=False)

abalone8
abalone17
abalone20
autompg
australian
balance
bankmarketing
bupa
german
glass
hayes
heart
iono
libras
newthyroid
pageblocks
pima
satimage
segmentation
sonar
spambase
splice
vehicle
wdbc
wine
wine4
yeast3
yeast6
abalone17_tomeLink
abalone20_tomeLink
libras_tomeLink
satimage_tomeLink
wine4_tomeLink
yeast6_tomeLink
abalone17_cnn
abalone20_cnn
libras_cnn
satimage_cnn
wine4_cnn
yeast6_cnn
abalone17_borderSMOTE
abalone20_borderSMOTE
libras_borderSMOTE
satimage_borderSMOTE
wine4_borderSMOTE
yeast6_borderSMOTE
abalone17_adaSyn
abalone20_adaSyn
libras_adaSyn
satimage_adaSyn
wine4_adaSyn
yeast6_adaSyn


### Ensemble methods

In [None]:
# import the classifiers
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier

# import the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# import model selection
from sklearn.model_selection import GridSearchCV, train_test_split

# import the metrics
from sklearn.metrics import accuracy_score, f1_score

# import time to measure the time
import time



# Off-the-shelf ensembles, all with their default hyper-parameters.
bagging = BaggingClassifier()
random_forest = RandomForestClassifier()
adaboost = AdaBoostClassifier()
gradient_boosting = GradientBoostingClassifier()

# Stacking variants: a single base estimator feeding a final estimator.
tree_svm = StackingClassifier(
    estimators=[
        ('tree', DecisionTreeClassifier()),
    ],
    final_estimator=SVC(kernel='linear'),
)

tree_tree = StackingClassifier(
    estimators=[
        ('tree', DecisionTreeClassifier()),
    ],
    final_estimator=DecisionTreeClassifier(),
)

svm_svm = StackingClassifier(
    estimators=[
        ('svm', SVC(kernel='linear')),
    ],
    final_estimator=SVC(kernel='linear'),
)

clfs = {
    "Bagging": bagging,
    "RandomForest": random_forest,
    "AdaBoost": adaboost,
    "GradientBoosting": gradient_boosting,
    "TreeSVM": tree_svm,
    "TreeTree": tree_tree,
    "SvmSvm": svm_svm
}

# One row per (dataset, classifier) pair.
table = {
    "dataset": [],
    "classifiers": [],
    "score": [],
    "time": [],
    "balanced": []
}

# Rows of `datasets` used for this benchmark: bupa, heart, sonar, german and
# australian (the first row matching each name).
df = [
    datasets.loc[datasets["name"] == name].iloc[0]
    for name in ("bupa", "heart", "sonar", "german", "australian")
]

for d in df:
    print(d["name"])
    # Stratified, seeded split: same class proportions in both parts and the
    # same partition on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        d["X"], d["y"], test_size=0.3, stratify=d["y"], random_state=21121)
    for key, clf in clfs.items():
        table["dataset"].append(d["name"])
        table["balanced"].append(d["balanced"])
        table["classifiers"].append(key)
        if d["name"] == "australian" and key == "SvmSvm":
            # This pair is deliberately not run and is reported as "NA"
            # (presumably because the fit is too slow - confirm).
            table["score"].append("NA")
            table["time"].append("NA")
            continue
        start = time.perf_counter()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        end = time.perf_counter()
        if d["balanced"]:
            score = accuracy_score(y_test, y_pred)
        else:
            # F1 of the positive class; average='micro' on single-label data
            # would just reproduce the accuracy.
            score = f1_score(y_test, y_pred)
        table["score"].append(round(score, 2))
        table["time"].append(round(end - start, 2))

table = pd.DataFrame(table)
table.to_csv("ensemble_methods_results.csv", sep=';', encoding='utf-8', index=False)

bupa
heart
sonar
german
australian
