In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import random
from sklearn.model_selection import train_test_split
import catboost
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
def get_target(df, target_name):
    return df.drop(target_name, axis=1), df[target_name]

def drop_correlated_features(df, threshold):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    df = df.drop(to_drop, axis=1)
    return df

def drop_min_rows(df, *values):
    for value in values:
        df = df.drop(df[df['DP'] == value].index, axis=0)
    return df

def balance_classes(df):
    df_balanced = pd.DataFrame()
    min_class_count = df['DP'].value_counts().min()
    for label in df['DP'].unique():
        df_label = df[df['DP'] == label]
        if len(df_label) > min_class_count:
            df_random_sample = df_label.sample(min_class_count)
            df_balanced = pd.concat([df_balanced, df_random_sample])
        else:
            df_balanced = pd.concat([df_balanced, df_label])
    return df_balanced

In [3]:
from posixpath import split
# load dataset

df = pd.read_csv("Data.csv", header = 0, delimiter = ";")
print("drop duplicates")
df = df.drop_duplicates()
constant_features = [column for column in df.columns if df[column].nunique() == 1]
print("drop constant features = ",len(constant_features))
df.drop(columns=constant_features, inplace=True)
print("split features and target")

df_features, df_target = get_target(df, 'DP')

df_corr = drop_correlated_features(df_features, 0.7)

df = pd.concat([df_corr, df_target], axis=1)
new_df = drop_min_rows(df, -60, -140)

#new_df = balance_classes(new_df)

df_check = new_df.copy()

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif, chi2

kbest = SelectKBest(k=100, score_func=mutual_info_classif)
kbest.fit(df_check.drop('DP', axis=1), df_check['DP'])
rec = kbest.get_support()
print(rec)
list_best = []
for i, r in enumerate(rec):
    if r == True:
        list_best.append(df_check.columns[i])
        print(df_check.columns[i], kbest.scores_[i])
if 'DP' in list_best:
    list_best.remove('DP')
df_best = df_check[list_best]

# check that dataframe contain column

if df_best.Disbalance.empty == None:
    df_best =  pd.concat([df_check.Disbalance, df_best], axis=1)

df_best.head()

drop duplicates
drop constant features =  1361
split features and target




[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]
Disbalance 1.0053707474173426
ON_LINE gen 2 0.06866449924059959
QG gen 2 0.0
ON_LINE gen 3 0.1200998503348969
PG gen 5 0.8577692647208106
PG gen 27 0.34969837513032154
QG gen 27 0.44326125091079316
QN node 7005 0.6322224800673988
PN node 7008 0.3648131627240554
QN node 7011 0.3619341280253239
U node 7012 0.544009880352178
QN node 7015 0.47619871892235666
QN node 7023 0.43204783062132623
QN node 7029 0.40558492104162847
QN node 7032 0.42142668024644614
QN node 7033 0.37883926251885613
QN node 7042 0.14544163700533774
QN node 7044 0.4606344287903221
QN node 7057 0.1610836590799538
QN node 7064 0.0
PN node 7065 0.6686616234262213
QN no

Unnamed: 0,Disbalance,ON_LINE gen 2,QG gen 2,ON_LINE gen 3,PG gen 5,PG gen 27,QG gen 27,QN node 7005,PN node 7008,QN node 7011,...,QN node 7776,QN node 7819,QN node 7821,PN node 7829,PN node 7957,Q_BEG line 52,P_BEG line 77,Q_BEG line 207,P_BEG line 220,P_BEG line 317
5,143.14,0,0.0,0,0.0,0.0,0.0,0.660812,5.126475,0.48451,...,1.131414,0.0,0.180033,0.336362,0.0,0.08,-0.06,0.0,0.0,0.81
15,142.77,0,0.0,0,0.0,0.0,0.0,0.625786,5.078537,0.429343,...,1.121359,0.0,0.229031,0.326488,0.0,0.09,-0.24,0.0,0.0,0.49
16,142.6,0,0.0,0,0.0,0.0,0.0,0.703919,5.017258,0.473474,...,1.121407,0.0,0.227,0.326575,0.0,0.09,-0.21,0.0,0.0,0.0
18,143.2,0,0.0,0,0.0,0.0,0.0,0.619212,5.051795,0.423491,...,1.101456,0.0,0.0,0.325609,0.0,0.08,-0.3,0.0,0.0,0.0
19,142.49,0,0.0,0,0.0,0.0,0.0,0.685193,5.116674,0.466858,...,1.101435,0.0,0.0,0.297376,0.0,0.08,-0.26,0.0,0.0,0.0


In [4]:
data_labels = df_check["DP"]
data_labels = pd.factorize(data_labels)[0]
data_features = df_best

train_features, test_features, train_labels, test_labels = train_test_split(data_features, data_labels, test_size = 0.2, random_state = 42)

In [5]:
cbmodel = catboost.CatBoostClassifier()
cbmodel.fit(train_features, train_labels)
cb_res = cbmodel.predict(test_features)
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, cb_res)

Learning rate set to 0.084868
0:	learn: 1.5334475	total: 53.6ms	remaining: 53.6s
1:	learn: 1.3557277	total: 59.4ms	remaining: 29.6s
2:	learn: 1.2223887	total: 65ms	remaining: 21.6s
3:	learn: 1.1084779	total: 70.4ms	remaining: 17.5s
4:	learn: 1.0123947	total: 75.8ms	remaining: 15.1s
5:	learn: 0.9360690	total: 81.2ms	remaining: 13.4s
6:	learn: 0.8649134	total: 86.4ms	remaining: 12.3s
7:	learn: 0.8090778	total: 92ms	remaining: 11.4s
8:	learn: 0.7569580	total: 97.1ms	remaining: 10.7s
9:	learn: 0.7100599	total: 102ms	remaining: 10.1s
10:	learn: 0.6717600	total: 108ms	remaining: 9.74s
11:	learn: 0.6354004	total: 114ms	remaining: 9.38s
12:	learn: 0.6011819	total: 119ms	remaining: 9.05s
13:	learn: 0.5699895	total: 125ms	remaining: 8.78s
14:	learn: 0.5445361	total: 130ms	remaining: 8.52s
15:	learn: 0.5196876	total: 135ms	remaining: 8.29s
16:	learn: 0.4987764	total: 140ms	remaining: 8.12s
17:	learn: 0.4790095	total: 146ms	remaining: 7.97s
18:	learn: 0.4601861	total: 151ms	remaining: 7.81s
19:	le

0.9564777327935222

In [6]:
xgbmodel = XGBClassifier()
xgbmodel.fit(train_features, train_labels)
xgb_res = xgbmodel.predict(test_features)
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, xgb_res)

0.9605263157894737

In [7]:
rfmodel = RandomForestClassifier()
rfmodel.fit(train_features, train_labels)
rf_res = rfmodel.predict(test_features)
# accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(test_labels, rf_res))
#f1-score
from sklearn.metrics import f1_score
print(f1_score(test_labels, rf_res, average='weighted'))

0.951417004048583
0.9510749043465145


In [14]:
# grid search

from sklearn.model_selection import GridSearchCV

#RandomForest
grid = { "n_estimators" : [10, 25, 50, 75, 100, 200],
        "criterion" : ["gini", "entropy"],
        "max_depth" : [5, 10, 15, 20, 25],
        "min_samples_split" : [2, 3, 4, 5, 6, 7, 8, 9, 10],
        "min_samples_leaf" : [1,2, 3, 4, 5, 10]
        }

rf = RandomForestClassifier()

grid_search = GridSearchCV(estimator = rf, param_grid = grid, cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(train_features, train_labels)

grid_search.best_params_

rf.res = grid_search.best_estimator_.predict(test_features)

accuracy_score(test_labels, rf.res)

Fitting 3 folds for each of 3240 candidates, totalling 9720 fits
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=25; total time=   0.1s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END criterion=gini, max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END criter

0.958502024291498

In [9]:
# grid search for catboost

grid = {
        "n_estimators" : [10, 50, 100],
        #"max_depth" : [15],
        "learning_rate" : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
        }

cb = catboost.CatBoostClassifier()

grid_search = GridSearchCV(estimator = cb, param_grid = grid, cv = 2,  verbose = 2)

grid_search.fit(train_features, train_labels)

grid_search.best_params_

cb.res = grid_search.best_estimator_.predict(test_features)

accuracy_score(test_labels, cb.res)


Fitting 2 folds for each of 18 candidates, totalling 36 fits
0:	learn: 1.7915029	total: 6.88ms	remaining: 61.9ms
1:	learn: 1.7912132	total: 11.9ms	remaining: 47.6ms
2:	learn: 1.7909172	total: 17ms	remaining: 39.6ms
3:	learn: 1.7906576	total: 22.2ms	remaining: 33.3ms
4:	learn: 1.7903806	total: 27.3ms	remaining: 27.3ms
5:	learn: 1.7901098	total: 32.5ms	remaining: 21.7ms
6:	learn: 1.7898142	total: 37.8ms	remaining: 16.2ms
7:	learn: 1.7895360	total: 42.6ms	remaining: 10.7ms
8:	learn: 1.7892607	total: 47.3ms	remaining: 5.26ms
9:	learn: 1.7889812	total: 52.4ms	remaining: 0us
[CV] END ..............learning_rate=0.0001, n_estimators=10; total time=   0.1s
0:	learn: 1.7914779	total: 5.94ms	remaining: 53.5ms
1:	learn: 1.7911912	total: 11ms	remaining: 44.2ms
2:	learn: 1.7909011	total: 16ms	remaining: 37.2ms
3:	learn: 1.7905814	total: 20.9ms	remaining: 31.4ms
4:	learn: 1.7902884	total: 26.2ms	remaining: 26.2ms
5:	learn: 1.7899972	total: 31.4ms	remaining: 20.9ms
6:	learn: 1.7897139	total: 36.2ms	r

0.9483805668016194

In [10]:
# grid search for xgboost

grid = { "n_estimators" : [10, 50, 100],
        "max_depth" : [5, 10, 15],
        "min_child_weight" : [1, 3, 5],
        "gamma" : [0.0, 0.1, 0.2],
        "colsample_bytree" : [0.3, 0.4, 0.5],
        "subsample" : [0.8, 0.9, 1.0],
        }

xgb = XGBClassifier()

grid_search = GridSearchCV(estimator = xgb, param_grid = grid, cv = 3, verbose = 2)

grid_search.fit(train_features, train_labels)

print(grid_search.best_params_)

print(grid_search.best_estimator_)

xgb.res = grid_search.best_estimator_.predict(test_features)

accuracy_score(test_labels, xgb.res)

Fitting 3 folds for each of 729 candidates, totalling 2187 fits
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=0.8; total time=   0.0s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5, min_child_weight=1, n_estimators=10, subsample=1.0; total time=   0.1s
[CV] END colsample_bytree=0.3, gamma=0.0, max_depth=5,

0.9615384615384616

In [11]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

grid = { "n_estimators" : [10, 50, 100],}

ada = AdaBoostClassifier()

grid_search = GridSearchCV(estimator = ada, param_grid = grid, cv = 3, verbose = 2)

grid_search.fit(train_features, train_labels)

grid_search.best_params_

ada.res = grid_search.best_estimator_.predict(test_features)

accuracy_score(test_labels, ada.res)




Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=10; total time=   0.1s
[CV] END ....................................n_estimators=10; total time=   0.1s




[CV] END ....................................n_estimators=50; total time=   0.3s




[CV] END ....................................n_estimators=50; total time=   0.3s




[CV] END ....................................n_estimators=50; total time=   0.3s




[CV] END ...................................n_estimators=100; total time=   0.6s




[CV] END ...................................n_estimators=100; total time=   0.6s




[CV] END ...................................n_estimators=100; total time=   0.6s




0.6406882591093117