In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split
import xgboost as xgb
from keras.models import Sequential
from keras.layers import Dense

In [2]:
'''F1-score and accuracy calculation'''
def F1metr(x_pred, x_real): #1 - positive, O - negative
    x_pred, x_real= x_pred.astype(int), x_real.astype(int) 
    tp=len(np.where(x_pred[np.where(x_real==1)]==1)[0])
    tn=len(np.where(x_pred[np.where(x_real==0)]==0)[0])
    fp=len(np.where(x_pred[np.where(x_real==0)]==1)[0])
    fn=len(np.where(x_pred[np.where(x_real==1)]==0)[0])
    sensitivity, specificity = tp/(tp+fn)*100, fp/(tn+fp)*100 
    precision, recall = tp/(tp+fp), tp/(tp+fn) 
    accuracy=(tp+tn)/(tp+tn+fp+fn)*100
    f1=2*precision*recall/(precision+recall)
    return f1, accuracy

In [3]:
'''Artificial data for classifier training'''
table=pd.read_csv('art_table.csv')
minmape=[np.min(table.loc[i,['VARmape', 'LAmape', 'MSSAmape', 'Choomape', 'RNNmape']]) for i in table.index] # mean value is also possible
table['minmape']=minmape
table.describe()

Unnamed: 0,VARmape,LAmape,MSSAmape,Choomape,Grent,Connect,Cycles,Noise,Hurst,KSent,Rndwl,Corent,cluster,RNNmape,minmape
count,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0,496.0
mean,16.247069,36.843215,17.772578,17.183109,4.192181,1.379169,0.489633,0.759498,0.326964,8.363663,0.191704,4.169677,0.915323,1000.0,13.342229
std,12.217725,35.15195,14.210663,14.044742,0.447777,1.091023,0.343551,0.23354,0.101934,0.416684,0.25872,0.36703,0.278682,0.0,10.446661
min,8e-05,0.018028,2e-06,0.007339,2.763216,0.5,0.0,0.129861,0.118075,3.977292,9.1e-05,2.05287,0.0,1000.0,2e-06
25%,7.023899,11.738704,7.967576,6.777824,3.889845,1.0,0.136364,0.648373,0.250079,8.319765,0.028446,4.082395,1.0,1000.0,5.493776
50%,13.628993,24.937368,14.300662,12.812552,4.201027,1.041953,0.469337,0.858568,0.310647,8.45663,0.054423,4.267596,1.0,1000.0,10.552657
75%,21.39672,52.19239,22.437465,23.940815,4.511843,1.188429,0.802534,0.937721,0.386223,8.559508,0.269738,4.393322,1.0,1000.0,18.176032
max,62.270306,182.069688,108.345799,76.935189,5.236903,6.699605,0.99999,0.992865,0.70961,8.956106,0.980852,4.595789,1.0,1000.0,54.759822


In [4]:
'''Real-world data for classification'''
real=pd.read_csv('real01_table.csv') 
minmape=[np.min(real.loc[i,['VARmape', 'LAmape', 'MSSAmape', 'Choomape', 'RNNmape']]) for i in real.index] # mean value is also possible
real['minmape']=minmape
real.describe()

Unnamed: 0,VARmape,LAmape,MSSAmape,Choomape,RNNmape,Gsize,Grent,Connect,Assort,Density,Modularity,Cycles,Noise,Hurst,KSent,Rndwl,Corent,minmape
count,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,33.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0,34.0
mean,16.305998,25.459691,29.161018,19.959419,21.504847,2.270284,4.052988,0.83211,0.409247,0.150091,0.165422,0.211914,0.86379,0.542813,7.354636,0.196571,3.75961,14.750074
std,13.060365,18.450401,24.022203,14.345258,15.955981,0.237959,0.429869,0.199961,0.183165,0.054158,0.084103,0.234374,0.189134,0.143411,0.911966,0.162093,0.925229,11.493582
min,1.228288,1.718927,2.990121,1.300883,3.888614,1.592593,3.274947,0.511111,-0.121212,0.061254,0.037037,0.0,0.218597,0.265756,4.601259,0.002347,0.735877,1.228288
25%,4.586335,6.636961,8.986561,7.650159,10.459936,2.198317,3.700884,0.664822,0.358969,0.114881,0.101316,0.051389,0.877696,0.463783,7.202499,0.044981,3.573111,4.561282
50%,15.536286,24.134909,24.950528,18.915581,18.620247,2.313393,4.108253,0.852556,0.427006,0.134804,0.14214,0.095455,0.928284,0.55346,7.766393,0.149122,4.175712,13.833303
75%,21.203479,39.618555,41.88754,28.801386,28.01768,2.356092,4.362054,1.004931,0.486682,0.194202,0.232026,0.353383,0.974904,0.611644,7.921549,0.336209,4.297254,19.197387
max,54.560056,68.539261,110.004362,52.74009,68.247101,2.782609,4.892588,1.183794,0.776971,0.277778,0.333333,0.934857,0.989945,0.952584,8.138675,0.600256,4.476376,46.820455


In [5]:
'''k-nearest neighbours'''
s='g' # s - for series features, g - for graph characteristics
bound = 20
table['Good']=(table.minmape<bound).astype(int)
table['Bad']=(bound <= table.minmape).astype(int)

feat=['Grent', 'Connect', 'Cycles'] if s=='g' else ['Noise', 'Hurst', 'KSent', 'Rndwl', 'Corent']
res=['Good','Bad']
XX=table[feat]
yy=table[res]
X, Xt, y,  yt = train_test_split(XX, yy, test_size=0.33)#, random_state=42)
print(X.shape, Xt.shape, y.shape,  yt.shape)

neigh = KNN(n_neighbors=3)
neigh.fit(X, y)
# Xt=table[feat]
# yt=table[res]
trsc=neigh.score(Xt, yt)*100

real['Good']=(real.minmape < bound).astype(int)
real['Bad']=(real.minmape >= bound).astype(int)
Xr=real[feat]
yr=real[res]
resc=neigh.score(Xr, yr)*100

print('Test accuracy = %.2f%%; Real accuracy =  %.2f%%'%(trsc, resc))
print('F1 score = %.2f'%F1metr(neigh.predict(Xr), yr)[0])
del neigh

(332, 3) (164, 3) (332, 2) (164, 2)
Test accuracy = 74.39%; Real accuracy =  64.71%
F1 score = 0.65


In [6]:
'''XGBOOST'''
res=['Good']
XX=table[feat]
yy=table[res]
X, Xt, y,  yt = train_test_split(XX, yy, test_size=0.33)#, random_state=42)

print(X.shape, Xt.shape, y.shape,  yt.shape)
dtrain = xgb.DMatrix(X.values, label=y.values)
dtest = xgb.DMatrix(Xt.values)
params = {'objective': 'reg:squarederror','booster':'gblinear'}
trees = 1000
cv = xgb.cv(params, dtrain, metrics = ('rmse'), verbose_eval=False, nfold=20, show_stdv=True, num_boost_round=trees)
bst = xgb.train(params, dtrain, num_boost_round=cv['test-rmse-mean'].idxmin())
prediction_test = bst.predict(dtest) 
acc1=100-sum(abs(prediction_test.round(0)-yt.values[:,0]))/yt.shape[0]*100

Xr=real[feat].values
yr=real['Good'].values
dreal=xgb.DMatrix(Xr)
prediction_real = bst.predict(dreal) 
acc2=100-sum(abs(prediction_real.round(0)-yr))/yr.shape[0]*100
print('Test accuracy = %.2f%%; Real accuracy =  %.2f%%'%(acc1, acc2))
print('F1 score = %.2f'%F1metr(prediction_real.round(0), yr)[0])


(332, 3) (164, 3) (332, 1) (164, 1)
Test accuracy = 80.49%; Real accuracy =  76.47%
F1 score = 0.87


In [7]:
'''Neural network classifier'''
res=['Good', 'Bad']

XX=table[feat]
yy=table[res]
X, Xt, y,  yt = train_test_split(XX, yy, test_size=0.33)

model = Sequential()
model.add(Dense(len(feat), activation='sigmoid'))
model.add(Dense(len(feat)*3, activation='tanh'))
model.add(Dense(len(feat)*3, activation='sigmoid'))
model.add(Dense(len(res), activation='softmax'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, validation_data=(Xt, yt), epochs=300, batch_size=None, shuffle=True, verbose=0)
scores = model.evaluate(Xt, yt)
prediction_test=model.predict(Xt)
acc1=100-sum(abs(prediction_test[:,0].round(0)-yt.values[:,0]))/yt.shape[0]*100

Xr=real[feat].values
yr=real['Good'].values
prediction_real = model.predict(Xr) 
acc2=100-sum(abs(prediction_real[:,0].round(0)-yr))/yr.shape[0]*100

print('Test accuracy = %.2f%%; Real accuracy =  %.2f%%'%(acc1, acc2))
print('F1 score = %.2f'%F1metr(prediction_real[:,0].round(0), yr)[0])
del model

Test accuracy = 82.32%; Real accuracy =  76.47%
F1 score = 0.87
