In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import random
from sklearn.model_selection import train_test_split
import catboost
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier


In [2]:
def get_target(df, target_name):
    return df.drop(target_name, axis=1), df[target_name]

def drop_correlated_features(df, threshold):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    df = df.drop(to_drop, axis=1)
    return df

def drop_min_rows(df, *values):
    for value in values:
        df = df.drop(df[df['DP'] == value].index, axis=0)
    return df

def balance_classes(df):
    df_balanced = pd.DataFrame()
    min_class_count = df['DP'].value_counts().min()
    for label in df['DP'].unique():
        df_label = df[df['DP'] == label]
        if len(df_label) > min_class_count:
            df_random_sample = df_label.sample(min_class_count)
            df_balanced = pd.concat([df_balanced, df_random_sample])
        else:
            df_balanced = pd.concat([df_balanced, df_label])
    return df_balanced

In [3]:
from posixpath import split
# load dataset

df = pd.read_csv("Data.csv", header = 0, delimiter = ";")
print("drop duplicates")
df = df.drop_duplicates()
constant_features = [column for column in df.columns if df[column].nunique() == 1]
print("drop constant features = ",len(constant_features))
df.drop(columns=constant_features, inplace=True)
print("split features and target")

df_features, df_target = get_target(df, 'DP')

df_corr = drop_correlated_features(df_features, 0.7)

df = pd.concat([df_corr, df_target], axis=1)
new_df = drop_min_rows(df, -60, -140)

new_df = balance_classes(new_df)

df_check = new_df.copy()

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, f_classif, chi2

kbest = SelectKBest(k=100, score_func=mutual_info_classif)
kbest.fit(df_check.drop('DP', axis=1), df_check['DP'])
rec = kbest.get_support()
print(rec)
list_best = []
for i, r in enumerate(rec):
    if r == True:
        list_best.append(df_check.columns[i])
        print(df_check.columns[i], kbest.scores_[i])
if 'DP' in list_best:
    list_best.remove('DP')
df_best = df_check[list_best]

# check that dataframe contain column

if df_best.Disbalance.empty == None:
    df_best =  pd.concat([df_check.Disbalance, df_best], axis=1)

df_best.head()

drop duplicates
drop constant features =  1361
split features and target




[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True]
Disbalance 1.1592564516648407
ON_LINE gen 2 0.11142727182958723
QG gen 2 0.0
ON_LINE gen 3 0.11765658122824263
PG gen 5 1.0904101292552952
PG gen 27 0.39683790519373696
QG gen 27 0.5896194433973649
QN node 7005 0.6424207628951617
PN node 7008 0.3261553746309991
QN node 7011 0.29643421781077484
U node 7012 0.6330522595014685
QN node 7015 0.5513172827281587
QN node 7023 0.44131488514246175
QN node 7029 0.4385255068858527
QN node 7032 0.37485216665293253
QN node 7033 0.3252506748480464
QN node 7042 0.15607924479407398
QN node 7044 0.4356747958308338
QN node 7057 0.14818891391417788
QN node 7064 0.0
PN node 7065 0.7547507779015694
QN no

Unnamed: 0,Disbalance,ON_LINE gen 2,QG gen 2,ON_LINE gen 3,PG gen 5,PG gen 27,QG gen 27,QN node 7005,PN node 7008,QN node 7011,...,QN node 7776,QN node 7819,QN node 7821,PN node 7829,PN node 7957,Q_BEG line 52,P_BEG line 77,Q_BEG line 207,P_BEG line 220,P_BEG line 317
5,143.14,0,0.0,0,0.0,0.0,0.0,0.660812,5.126475,0.48451,...,1.131414,0.0,0.180033,0.336362,0.0,0.08,-0.06,0.0,0.0,0.81
15,142.77,0,0.0,0,0.0,0.0,0.0,0.625786,5.078537,0.429343,...,1.121359,0.0,0.229031,0.326488,0.0,0.09,-0.24,0.0,0.0,0.49
16,142.6,0,0.0,0,0.0,0.0,0.0,0.703919,5.017258,0.473474,...,1.121407,0.0,0.227,0.326575,0.0,0.09,-0.21,0.0,0.0,0.0
18,143.2,0,0.0,0,0.0,0.0,0.0,0.619212,5.051795,0.423491,...,1.101456,0.0,0.0,0.325609,0.0,0.08,-0.3,0.0,0.0,0.0
19,142.49,0,0.0,0,0.0,0.0,0.0,0.685193,5.116674,0.466858,...,1.101435,0.0,0.0,0.297376,0.0,0.08,-0.26,0.0,0.0,0.0


In [4]:
data_labels = df_check["DP"]
data_labels = pd.factorize(data_labels)[0]
data_features = df_best

train_features, test_features, train_labels, test_labels = train_test_split(data_features, data_labels, test_size = 0.2, random_state = 42)

In [5]:
cbmodel = catboost.CatBoostClassifier()
cbmodel.fit(train_features, train_labels)
cb_res = cbmodel.predict(test_features)
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, cb_res)

Learning rate set to 0.08109
0:	learn: 1.6075238	total: 52.5ms	remaining: 52.4s
1:	learn: 1.4497544	total: 57.6ms	remaining: 28.7s
2:	learn: 1.3457059	total: 62.6ms	remaining: 20.8s
3:	learn: 1.2547456	total: 67.5ms	remaining: 16.8s
4:	learn: 1.1586164	total: 72.7ms	remaining: 14.5s
5:	learn: 1.0840151	total: 77.7ms	remaining: 12.9s
6:	learn: 1.0077641	total: 82.7ms	remaining: 11.7s
7:	learn: 0.9447711	total: 87.9ms	remaining: 10.9s
8:	learn: 0.8882339	total: 93ms	remaining: 10.2s
9:	learn: 0.8409372	total: 98.1ms	remaining: 9.71s
10:	learn: 0.7945836	total: 103ms	remaining: 9.27s
11:	learn: 0.7525527	total: 108ms	remaining: 8.9s
12:	learn: 0.7160359	total: 113ms	remaining: 8.61s
13:	learn: 0.6829514	total: 118ms	remaining: 8.33s
14:	learn: 0.6552905	total: 124ms	remaining: 8.11s
15:	learn: 0.6307756	total: 129ms	remaining: 7.91s
16:	learn: 0.6060594	total: 134ms	remaining: 7.72s
17:	learn: 0.5810322	total: 139ms	remaining: 7.57s
18:	learn: 0.5600913	total: 144ms	remaining: 7.41s
19:	l

0.9308641975308642

In [7]:
xgbmodel = XGBClassifier()
xgbmodel.fit(train_features, train_labels)
xgb_res = xgbmodel.predict(test_features)
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, xgb_res)

0.928395061728395

In [8]:
rfmodel = RandomForestClassifier()
rfmodel.fit(train_features, train_labels)
rf_res = rfmodel.predict(test_features)
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score(test_labels, rf_res)

0.8987654320987655