In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import os
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Global configuration: one seed for every RNG the notebook relies on, plus the
# ordered list of dataset names (a dataset's position in the list is its index).
seed = 1234
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)
Datasets = ['Matek', 'Acevedo']

In [3]:
# Load the feature matrix, the class labels and the per-sample dataset index,
# then build the stratified splitter that every experiment below iterates over.
x, y, dataset = (np.load(path) for path in ('./X.npy', './y.npy', './dataset.npy'))
kf = StratifiedKFold(shuffle=True, random_state=seed, n_splits=5)

In [4]:
# Preprocess for XGBoost: swap class ids 3 and 10 in `y`.
# (Presumably this keeps each dataset's label set contiguous from 0, which
# XGBClassifier expects — TODO confirm against the label map.)
# Both masks are taken BEFORE any write, so no temporary sentinel value is
# needed: the swap also works for unsigned label dtypes and cannot collide with
# a label that already equals -1.
# NOTE: `y` is modified in place and the swap is its own inverse — run this cell
# exactly once per kernel session (re-running it swaps the ids back).
is_class_10 = y == 10
is_class_3 = y == 3
y[is_class_10] = 3
y[is_class_3] = 10

In [5]:
# Partition features and labels by source dataset. Keys are the integer dataset
# indices (positions in `Datasets`), matching the values stored in `dataset`.
X, Y = {}, {}
for ds, _name in enumerate(Datasets):
    in_ds = dataset == ds  # boolean row mask for this dataset
    X[ds], Y[ds] = x[in_ds], y[in_ds]

In [6]:
# Random forest baseline.
# For each dataset: stratified K-fold CV on that dataset (in-domain accuracy on
# the held-out fold), and every fold's model is also scored on the WHOLE other
# dataset (cross-dataset accuracy).

# Size the per-fold accumulators from the splitter rather than a hard-coded 5:
# with fewer splits the unused zeros would silently drag down mean/std, with
# more splits the fold index would run out of bounds.
n_folds = kf.get_n_splits()
# result[train_name][test_name][fold] -> accuracy of that fold's model
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets} for train_name in Datasets}
# Confusion-matrix accumulators, indexed by the dataset being evaluated; they are
# summed over all folds of the model trained on the other dataset.
# NOTE(review): shapes are hard-coded (presumably the class counts of each
# dataset — confirm). confusion_matrix is called without `labels`, so the `+=`
# only works while the labels present in y_true/pred yield exactly this shape.
cm = [np.zeros((13, 13)), np.zeros((11, 11))]
print("RandomForestClassifier :")
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # NOTE(review): n_jobs=64 is a fixed worker count — adjust to the host if needed.
        rf = RandomForestClassifier(n_estimators=200, max_depth=16, n_jobs=64, random_state=seed)
        rf.fit(X[ds][train_index], Y[ds][train_index])

        # In-domain: held-out fold of the training dataset.
        pred = rf.predict(X[ds][test_index])
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy

        # Cross-dataset: every other dataset in full. The cached X/Y splits are
        # reused instead of re-masking x/y on each fold, which also keeps
        # features and labels coming from the same snapshot.
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = rf.predict(X[val_ds])
            cm[val_ds] += confusion_matrix(Y[val_ds], pred)
            accuracy = accuracy_score(Y[val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

# Report mean / (population) std of the per-fold accuracies for every train/test pair.
for ds in Datasets:
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))


RandomForestClassifier :
train on Matek :
test on Matek, acc mean : 0.890301, acc std : 0.004315
test on Acevedo, acc mean : 0.240707, acc std : 0.004479
train on Acevedo :
test on Matek, acc mean : 0.519426, acc std : 0.008169
test on Acevedo, acc mean : 0.704119, acc std : 0.01135


In [7]:
# XGBoost baseline.
# For each dataset: stratified K-fold CV on that dataset (in-domain accuracy on
# the held-out fold), and every fold's model is also scored on the WHOLE other
# dataset (cross-dataset accuracy).

# Size the per-fold accumulators from the splitter rather than a hard-coded 5:
# with fewer splits the unused zeros would silently drag down mean/std, with
# more splits the fold index would run out of bounds.
n_folds = kf.get_n_splits()
# result[train_name][test_name][fold] -> accuracy of that fold's model
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets} for train_name in Datasets}
print("XGBoost :")
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        xgboost = XGBClassifier(random_state=seed)
        xgboost.fit(X[ds][train_index], Y[ds][train_index])

        # In-domain: held-out fold of the training dataset.
        pred = xgboost.predict(X[ds][test_index])
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy

        # Cross-dataset: every other dataset in full. The cached X/Y splits are
        # reused instead of re-masking x/y on each fold, which also keeps
        # features and labels coming from the same snapshot.
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = xgboost.predict(X[val_ds])
            accuracy = accuracy_score(Y[val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

# Report mean / (population) std of the per-fold accuracies for every train/test pair.
for ds in Datasets:
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

XGBoost :
train on Matek :
test on Matek, acc mean : 0.914182, acc std : 0.00589
test on Acevedo, acc mean : 0.368452, acc std : 0.01036
train on Acevedo :
test on Matek, acc mean : 0.492231, acc std : 0.0103
test on Acevedo, acc mean : 0.745239, acc std : 0.007921


In [8]:
# SVM with polynomial kernel.
# For each dataset: stratified K-fold CV on that dataset (in-domain accuracy on
# the held-out fold), and every fold's model is also scored on the WHOLE other
# dataset (cross-dataset accuracy).

# Size the per-fold accumulators from the splitter rather than a hard-coded 5:
# with fewer splits the unused zeros would silently drag down mean/std, with
# more splits the fold index would run out of bounds.
n_folds = kf.get_n_splits()
# result[train_name][test_name][fold] -> accuracy of that fold's model
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets} for train_name in Datasets}
print('SVM(poly) :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and then reused for
        # every evaluation set, so no test statistics leak into training.
        scaler = StandardScaler()
        svc = SVC(kernel='poly', random_state=seed)
        svc.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-domain: held-out fold of the training dataset.
        pred = svc.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy

        # Cross-dataset: every other dataset in full. The cached X/Y splits are
        # reused instead of re-masking x/y on each fold, which also keeps
        # features and labels coming from the same snapshot.
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = svc.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

# Report mean / (population) std of the per-fold accuracies for every train/test pair.
for ds in Datasets:
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

SVM(poly) :
train on Matek :
test on Matek, acc mean : 0.909547, acc std : 0.004395
test on Acevedo, acc mean : 0.247693, acc std : 0.00651
train on Acevedo :
test on Matek, acc mean : 0.508947, acc std : 0.004166
test on Acevedo, acc mean : 0.730111, acc std : 0.004478


In [9]:
# SVM with RBF kernel.
# For each dataset: stratified K-fold CV on that dataset (in-domain accuracy on
# the held-out fold), and every fold's model is also scored on the WHOLE other
# dataset (cross-dataset accuracy).

# Size the per-fold accumulators from the splitter rather than a hard-coded 5:
# with fewer splits the unused zeros would silently drag down mean/std, with
# more splits the fold index would run out of bounds.
n_folds = kf.get_n_splits()
# result[train_name][test_name][fold] -> accuracy of that fold's model
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets} for train_name in Datasets}
print('SVM(rbf) :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and then reused for
        # every evaluation set, so no test statistics leak into training.
        scaler = StandardScaler()
        svc = SVC(kernel='rbf', random_state=seed)
        svc.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-domain: held-out fold of the training dataset.
        pred = svc.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy

        # Cross-dataset: every other dataset in full. The cached X/Y splits are
        # reused instead of re-masking x/y on each fold, which also keeps
        # features and labels coming from the same snapshot.
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = svc.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

# Report mean / (population) std of the per-fold accuracies for every train/test pair.
for ds in Datasets:
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

SVM(rbf) :
train on Matek :
test on Matek, acc mean : 0.926122, acc std : 0.004887
test on Acevedo, acc mean : 0.475005, acc std : 0.006272
train on Acevedo :
test on Matek, acc mean : 0.345303, acc std : 0.01381
test on Acevedo, acc mean : 0.782783, acc std : 0.005986


In [10]:
# Multi-layer perceptron baseline.
# For each dataset: stratified K-fold CV on that dataset (in-domain accuracy on
# the held-out fold), and every fold's model is also scored on the WHOLE other
# dataset (cross-dataset accuracy).

# Size the per-fold accumulators from the splitter rather than a hard-coded 5:
# with fewer splits the unused zeros would silently drag down mean/std, with
# more splits the fold index would run out of bounds.
n_folds = kf.get_n_splits()
# result[train_name][test_name][fold] -> accuracy of that fold's model
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets} for train_name in Datasets}
print('mlp :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and then reused for
        # every evaluation set, so no test statistics leak into training.
        scaler = StandardScaler()
        mlp = MLPClassifier(max_iter=1000, early_stopping=True, random_state=seed)
        mlp.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-domain: held-out fold of the training dataset.
        pred = mlp.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy

        # Cross-dataset: every other dataset in full. The cached X/Y splits are
        # reused instead of re-masking x/y on each fold, which also keeps
        # features and labels coming from the same snapshot.
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = mlp.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

# Report mean / (population) std of the per-fold accuracies for every train/test pair.
for ds in Datasets:
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

mlp :
train on Matek :
test on Matek, acc mean : 0.922851, acc std : 0.003538
test on Acevedo, acc mean : 0.412157, acc std : 0.0105
train on Acevedo :
test on Matek, acc mean : 0.429355, acc std : 0.02419
test on Acevedo, acc mean : 0.77742, acc std : 0.009402


In [11]:
# Logistic regression baseline.
# For each dataset: stratified K-fold CV on that dataset (in-domain accuracy on
# the held-out fold), and every fold's model is also scored on the WHOLE other
# dataset (cross-dataset accuracy).

# Size the per-fold accumulators from the splitter rather than a hard-coded 5:
# with fewer splits the unused zeros would silently drag down mean/std, with
# more splits the fold index would run out of bounds.
n_folds = kf.get_n_splits()
# result[train_name][test_name][fold] -> accuracy of that fold's model
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets} for train_name in Datasets}
print('LogisticRegression :')
for ds in range(len(Datasets)):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and then reused for
        # every evaluation set, so no test statistics leak into training.
        scaler = StandardScaler()
        lr = LogisticRegression(max_iter=500, random_state=seed)
        lr.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-domain: held-out fold of the training dataset.
        pred = lr.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[Datasets[ds]][Datasets[ds]][fold] = accuracy

        # Cross-dataset: every other dataset in full. The cached X/Y splits are
        # reused instead of re-masking x/y on each fold, which also keeps
        # features and labels coming from the same snapshot.
        for val_ds in range(len(Datasets)):
            if val_ds == ds:
                continue
            pred = lr.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[Datasets[ds]][Datasets[val_ds]][fold] = accuracy

# Report mean / (population) std of the per-fold accuracies for every train/test pair.
for ds in Datasets:
    print("train on {} :".format(ds))
    for val_ds in Datasets:
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(val_ds, result[ds][val_ds].mean(), result[ds][val_ds].std()))

LogisticRegression :
train on Matek :
test on Matek, acc mean : 0.917889, acc std : 0.004127
test on Acevedo, acc mean : 0.346861, acc std : 0.005594
train on Acevedo :
test on Matek, acc mean : 0.40771, acc std : 0.007263
test on Acevedo, acc mean : 0.756241, acc std : 0.009976
