In [69]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import os
import random
import matplotlib.pyplot as plt
# import seaborn as sns

In [70]:
# Global configuration for the whole notebook.
seed = 1234
# The position of a name in this list is used as the integer dataset id
# when the feature matrix is sliced per dataset further down.
Datasets = ['Matek', 'Acevedo', 'SYSU3H']
# Restrict CUDA-aware libraries to a single physical GPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '3'
# Seed both the stdlib and the NumPy global RNGs with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

In [71]:
# Load the feature matrix, the class labels and the per-sample dataset ids
# (all three are indexed together by boolean masks in later cells).
x, y, dataset = map(np.load, ('./X.npy', './y.npy', './dataset.npy'))
# One shared, seeded 5-fold stratified splitter reused by every model cell.
kf = StratifiedKFold(shuffle=True, n_splits=5, random_state=seed)
print(x.shape)

(33215, 768)


In [72]:
#preprocess for xgboost
#preprocess for xgboost
# Swap class ids 3 and 10 in place.
# Both masks are computed before anything is written, so no temporary sentinel
# value is needed: a sentinel of -1 would also rewrite any genuine -1 label and
# cannot be stored in an unsigned label array.
# Presumably the swap makes the class ids of the XGBoost training sets
# contiguous -- TODO confirm against the label map.
# NOTE(review): the swap is its own inverse, so running this cell a second time
# restores the original labels; run it exactly once per kernel session.
is_ten = y == 10
is_three = y == 3
y[is_ten] = 3
y[is_three] = 10

In [73]:
# Per-dataset views of the data, keyed by the integer dataset id
# (the index of the dataset name in `Datasets`).
# Boolean-mask indexing copies, so these are snapshots of x / y as of this cell.
X = {ds: x[dataset == ds] for ds in range(len(Datasets))}
Y = {ds: y[dataset == ds] for ds in range(len(Datasets))}

In [74]:
# Random forest baseline.
# For every training dataset: stratified k-fold CV inside that dataset, and each
# fold's model is additionally scored on the *whole* of every other dataset.
# result[train_name][test_name] is an array with one accuracy per fold.
n_folds = kf.get_n_splits()  # size the result arrays from the splitter itself
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets}
          for train_name in Datasets}
# Confusion-matrix accumulators; nothing in this cell updates them.
cm=[np.zeros((13,13)), np.zeros((11,11))]
print("RandomForestClassifier :")
for ds, train_name in enumerate(Datasets):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        rf = RandomForestClassifier(n_estimators=200, max_depth=16, n_jobs=64, random_state=seed)
        rf.fit(X[ds][train_index], Y[ds][train_index])

        # In-dataset accuracy on the held-out fold.
        pred = rf.predict(X[ds][test_index])
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[train_name][train_name][fold] = accuracy

        # Cross-dataset accuracy. Use the same X / Y snapshots as the training
        # data so both evaluations always see the same labels, and avoid
        # rebuilding the `dataset == val_ds` mask on every fold.
        for val_ds, test_name in enumerate(Datasets):
            if val_ds == ds:
                continue
            pred = rf.predict(X[val_ds])
            accuracy = accuracy_score(Y[val_ds], pred)
            result[train_name][test_name][fold] = accuracy

# Report mean / std over folds. Models trained on SYSU3H are fitted above but
# deliberately left out of the report.
# NOTE(review): the saved output of this cell still lists "train on SYSU3H",
# so it predates this filter -- re-run the cell to refresh it.
for train_name in Datasets:
    if train_name == 'SYSU3H':
        continue
    print("train on {} :".format(train_name))
    for test_name in Datasets:
        fold_acc = result[train_name][test_name]
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(test_name, fold_acc.mean(), fold_acc.std()))


RandomForestClassifier :
train on Matek :
test on Matek, acc mean : 0.920887, acc std : 0.003011
test on Acevedo, acc mean : 0.376153, acc std : 0.02423
test on SYSU3H, acc mean : 0.114804, acc std : 0.03685
train on Acevedo :
test on Matek, acc mean : 0.238896, acc std : 0.02262
test on Acevedo, acc mean : 0.853882, acc std : 0.00679
test on SYSU3H, acc mean : 0.233837, acc std : 0.02733
train on SYSU3H :
test on Matek, acc mean : 0.314476, acc std : 0.03109
test on Acevedo, acc mean : 0.135873, acc std : 0.0201
test on SYSU3H, acc mean : 0.843057, acc std : 0.04481


In [75]:
# XGBoost baseline (GPU histogram trees).
# Same protocol as the other model cells: stratified k-fold CV inside the
# training dataset, plus each fold's model scored on every other full dataset.
# result[train_name][test_name] is an array with one accuracy per fold.
n_folds = kf.get_n_splits()  # size the result arrays from the splitter itself
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets}
          for train_name in Datasets}
print("XGBoost :")
for ds, train_name in enumerate(Datasets):
    # SYSU3H is never used as a training set here.
    # Presumably its label set is not a valid 0..K-1 range for XGBoost -- TODO confirm.
    if train_name == 'SYSU3H':
        continue
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # Named xgb_model rather than `xgboost` so it cannot be mistaken for the package.
        xgb_model = XGBClassifier(tree_method = "hist", device = "cuda",random_state=seed)
        xgb_model.fit(X[ds][train_index], Y[ds][train_index])

        # In-dataset accuracy on the held-out fold.
        pred = xgb_model.predict(X[ds][test_index])
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[train_name][train_name][fold] = accuracy

        # Cross-dataset accuracy. Use the same X / Y snapshots as the training
        # data so both evaluations always see the same labels, and avoid
        # rebuilding the `dataset == val_ds` mask on every fold.
        for val_ds, test_name in enumerate(Datasets):
            if val_ds == ds:
                continue
            pred = xgb_model.predict(X[val_ds])
            accuracy = accuracy_score(Y[val_ds], pred)
            result[train_name][test_name][fold] = accuracy

# Report mean / std over folds for every dataset that was trained on.
for train_name in Datasets:
    if train_name == 'SYSU3H':
        continue
    print("train on {} :".format(train_name))
    for test_name in Datasets:
        fold_acc = result[train_name][test_name]
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(test_name, fold_acc.mean(), fold_acc.std()))

XGBoost :
train on Matek :
test on Matek, acc mean : 0.946458, acc std : 0.003101
test on Acevedo, acc mean : 0.552692, acc std : 0.01604
test on SYSU3H, acc mean : 0.116012, acc std : 0.03886
train on Acevedo :
test on Matek, acc mean : 0.323428, acc std : 0.02919
test on Acevedo, acc mean : 0.875198, acc std : 0.004229
test on SYSU3H, acc mean : 0.194562, acc std : 0.06716


In [76]:
# SVM with a polynomial kernel on standardized features.
# Stratified k-fold CV inside the training dataset, plus each fold's model
# scored on every other full dataset.
# result[train_name][test_name] is an array with one accuracy per fold.
n_folds = kf.get_n_splits()  # size the result arrays from the splitter itself
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets}
          for train_name in Datasets}
print('SVM(poly) :')
for ds, train_name in enumerate(Datasets):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and reused for every
        # evaluation set, so no test statistics leak into the model.
        scaler = StandardScaler()
        svc = SVC(kernel='poly', random_state=seed)
        svc.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-dataset accuracy on the held-out fold.
        pred = svc.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[train_name][train_name][fold] = accuracy

        # Cross-dataset accuracy. Use the same X / Y snapshots as the training
        # data so both evaluations always see the same labels, and avoid
        # rebuilding the `dataset == val_ds` mask on every fold.
        for val_ds, test_name in enumerate(Datasets):
            if val_ds == ds:
                continue
            pred = svc.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[train_name][test_name][fold] = accuracy

# Report mean / std over folds. Models trained on SYSU3H are fitted above but
# deliberately left out of the report.
# NOTE(review): the saved output of this cell still lists "train on SYSU3H",
# so it predates this filter -- re-run the cell to refresh it.
for train_name in Datasets:
    if train_name == 'SYSU3H':
        continue
    print("train on {} :".format(train_name))
    for test_name in Datasets:
        fold_acc = result[train_name][test_name]
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(test_name, fold_acc.mean(), fold_acc.std()))

SVM(poly) :
train on Matek :
test on Matek, acc mean : 0.947058, acc std : 0.002776
test on Acevedo, acc mean : 0.488909, acc std : 0.005737
test on SYSU3H, acc mean : 0.159517, acc std : 0.03712
train on Acevedo :
test on Matek, acc mean : 0.274184, acc std : 0.04825
test on Acevedo, acc mean : 0.877192, acc std : 0.005145
test on SYSU3H, acc mean : 0.150453, acc std : 0.01943
train on SYSU3H :
test on Matek, acc mean : 0.349894, acc std : 0.04106
test on Acevedo, acc mean : 0.234078, acc std : 0.05661
test on SYSU3H, acc mean : 0.667797, acc std : 0.04385


In [77]:
# SVM with a linear kernel on standardized features.
# Stratified k-fold CV inside the training dataset, plus each fold's model
# scored on every other full dataset.
# result[train_name][test_name] is an array with one accuracy per fold.
n_folds = kf.get_n_splits()  # size the result arrays from the splitter itself
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets}
          for train_name in Datasets}
print('SVM(linear) :')
for ds, train_name in enumerate(Datasets):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and reused for every
        # evaluation set, so no test statistics leak into the model.
        scaler = StandardScaler()
        svc = SVC(kernel='linear', random_state=seed)
        svc.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-dataset accuracy on the held-out fold.
        pred = svc.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[train_name][train_name][fold] = accuracy

        # Cross-dataset accuracy. Use the same X / Y snapshots as the training
        # data so both evaluations always see the same labels, and avoid
        # rebuilding the `dataset == val_ds` mask on every fold.
        for val_ds, test_name in enumerate(Datasets):
            if val_ds == ds:
                continue
            pred = svc.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[train_name][test_name][fold] = accuracy

# Report mean / std over folds. Models trained on SYSU3H are fitted above but
# deliberately left out of the report.
for train_name in Datasets:
    if train_name == 'SYSU3H':
        continue
    print("train on {} :".format(train_name))
    for test_name in Datasets:
        fold_acc = result[train_name][test_name]
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(test_name, fold_acc.mean(), fold_acc.std()))

SVM(linear) :
train on Matek :
test on Matek, acc mean : 0.941225, acc std : 0.00382
test on Acevedo, acc mean : 0.612542, acc std : 0.02568
test on SYSU3H, acc mean : 0.190937, acc std : 0.07401
train on Acevedo :
test on Matek, acc mean : 0.430587, acc std : 0.08156
test on Acevedo, acc mean : 0.878911, acc std : 0.003489
test on SYSU3H, acc mean : 0.135347, acc std : 0.05614


In [78]:
# Multi-layer perceptron on standardized features.
# Stratified k-fold CV inside the training dataset, plus each fold's model
# scored on every other full dataset.
# result[train_name][test_name] is an array with one accuracy per fold.
n_folds = kf.get_n_splits()  # size the result arrays from the splitter itself
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets}
          for train_name in Datasets}
print('mlp :')
for ds, train_name in enumerate(Datasets):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and reused for every
        # evaluation set, so no test statistics leak into the model.
        scaler = StandardScaler()
        mlp = MLPClassifier(max_iter=1000, early_stopping=True, random_state=seed)
        mlp.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-dataset accuracy on the held-out fold.
        pred = mlp.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[train_name][train_name][fold] = accuracy

        # Cross-dataset accuracy. Use the same X / Y snapshots as the training
        # data so both evaluations always see the same labels, and avoid
        # rebuilding the `dataset == val_ds` mask on every fold.
        for val_ds, test_name in enumerate(Datasets):
            if val_ds == ds:
                continue
            pred = mlp.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[train_name][test_name][fold] = accuracy

# Report mean / std over folds. Models trained on SYSU3H are fitted above but
# deliberately left out of the report.
# NOTE(review): the saved output of this cell still lists "train on SYSU3H",
# so it predates this filter -- re-run the cell to refresh it.
for train_name in Datasets:
    if train_name == 'SYSU3H':
        continue
    print("train on {} :".format(train_name))
    for test_name in Datasets:
        fold_acc = result[train_name][test_name]
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(test_name, fold_acc.mean(), fold_acc.std()))

mlp :
train on Matek :
test on Matek, acc mean : 0.95491, acc std : 0.002806
test on Acevedo, acc mean : 0.576263, acc std : 0.01604
test on SYSU3H, acc mean : 0.145619, acc std : 0.02778
train on Acevedo :
test on Matek, acc mean : 0.482853, acc std : 0.06218
test on Acevedo, acc mean : 0.893901, acc std : 0.002805
test on SYSU3H, acc mean : 0.149849, acc std : 0.06975
train on SYSU3H :
test on Matek, acc mean : 0.516155, acc std : 0.1149
test on Acevedo, acc mean : 0.287451, acc std : 0.1255
test on SYSU3H, acc mean : 0.867119, acc std : 0.06002


In [79]:
# Logistic regression on standardized features.
# Stratified k-fold CV inside the training dataset, plus each fold's model
# scored on every other full dataset.
# result[train_name][test_name] is an array with one accuracy per fold.
n_folds = kf.get_n_splits()  # size the result arrays from the splitter itself
result = {train_name: {test_name: np.zeros(n_folds) for test_name in Datasets}
          for train_name in Datasets}
print('LogisticRegression :')
for ds, train_name in enumerate(Datasets):
    for fold, (train_index, test_index) in enumerate(kf.split(X[ds], Y[ds])):
        # The scaler is fitted on the training fold only and reused for every
        # evaluation set, so no test statistics leak into the model.
        scaler = StandardScaler()
        lr = LogisticRegression(max_iter=500, random_state=seed)
        lr.fit(scaler.fit_transform(X[ds][train_index]), Y[ds][train_index])

        # In-dataset accuracy on the held-out fold.
        pred = lr.predict(scaler.transform(X[ds][test_index]))
        accuracy = accuracy_score(Y[ds][test_index], pred)
        result[train_name][train_name][fold] = accuracy

        # Cross-dataset accuracy. Use the same X / Y snapshots as the training
        # data so both evaluations always see the same labels, and avoid
        # rebuilding the `dataset == val_ds` mask on every fold.
        for val_ds, test_name in enumerate(Datasets):
            if val_ds == ds:
                continue
            pred = lr.predict(scaler.transform(X[val_ds]))
            accuracy = accuracy_score(Y[val_ds], pred)
            result[train_name][test_name][fold] = accuracy

# Report mean / std over folds. Models trained on SYSU3H are fitted above but
# deliberately left out of the report.
# NOTE(review): the saved output of this cell still lists "train on SYSU3H",
# so it predates this filter -- re-run the cell to refresh it.
for train_name in Datasets:
    if train_name == 'SYSU3H':
        continue
    print("train on {} :".format(train_name))
    for test_name in Datasets:
        fold_acc = result[train_name][test_name]
        print("test on {}, acc mean : {:.6}, acc std : {:.4}".format(test_name, fold_acc.mean(), fold_acc.std()))

LogisticRegression :
train on Matek :
test on Matek, acc mean : 0.950657, acc std : 0.003232
test on Acevedo, acc mean : 0.571326, acc std : 0.03908
test on SYSU3H, acc mean : 0.152266, acc std : 0.06587
train on Acevedo :
test on Matek, acc mean : 0.428755, acc std : 0.1098
test on Acevedo, acc mean : 0.886612, acc std : 0.002504
test on SYSU3H, acc mean : 0.236858, acc std : 0.03049
train on SYSU3H :
test on Matek, acc mean : 0.610665, acc std : 0.06695
test on Acevedo, acc mean : 0.384295, acc std : 0.06686
test on SYSU3H, acc mean : 0.933424, acc std : 0.0313
