In [1]:
import xgboost
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv("sample_flows_100k.csv")

In [3]:
def label_y(s):
    if 'anticausal' in s:
        return 0
    elif 'causal' in s:
        return 1

## Pre-processing

In [6]:
df['y'] = df['scm'].apply(label_y)
df.drop(columns=['scm'], inplace=True)

In [9]:
X = df.drop(columns=['y', 'conditional_shannon_entropy_Y|X', 'mutual_information']).values
y = df['y'].values

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Modelling

### SVM

In [14]:
clf = svm.SVC()

In [15]:
clf.fit(X_train, y_train)

SVC()

In [17]:
y_pred = clf.predict(X_test)

In [23]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.51      0.58      9938
           1       0.61      0.75      0.67     10062

    accuracy                           0.63     20000
   macro avg       0.64      0.63      0.62     20000
weighted avg       0.64      0.63      0.62     20000



### Random Forest

In [68]:
clf = RandomForestClassifier(max_depth=20, n_estimators=200)

In [69]:
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=20, n_estimators=200)

In [70]:
y_pred = clf.predict(X_test)

In [71]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.54      0.61      9938
           1       0.63      0.76      0.69     10062

    accuracy                           0.65     20000
   macro avg       0.66      0.65      0.65     20000
weighted avg       0.66      0.65      0.65     20000



### XGBoost

In [74]:
D_train = xgboost.DMatrix(X_train, label=y_train)
D_test = xgboost.DMatrix(X_test, label=y_test)

In [91]:
clf = xgboost.XGBClassifier()

parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20],
     "max_depth"        : [3, 4, 5, 6, 8, 10],
     "min_child_weight" : [1, 3, 5],
     "gamma"            : [0.0, 0.1, 0.2, 0.3],
     "colsample_bytree" : [0.3, 0.4, 0.5]
}

grid = GridSearchCV(clf,
                    parameters, n_jobs=6,
                    scoring="neg_log_loss",
                    cv=4)

grid.fit(X_train, y_train)

GridSearchCV(cv=4,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [92]:
y_pred = grid.predict(X_test)

In [95]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.54      0.61      9938
           1       0.63      0.76      0.69     10062

    accuracy                           0.65     20000
   macro avg       0.66      0.65      0.65     20000
weighted avg       0.66      0.65      0.65     20000



###  Keras

In [192]:
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

tf.keras.backend.set_floatx('float64')

In [193]:
y_train_cat = to_categorical(y_train)
y_test_cat = to_categorical(y_test)

In [194]:
model = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.15),
    Dense(256, activation='relu'),
    Dropout(0.15),
    Dense(128, activation='relu'),
    Dense(2, activation='sigmoid')
])

In [195]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [196]:
model.fit(X_train, y_train_cat, epochs=400, validation_split=0.1, batch_size=256, verbose=2)

Epoch 1/400
282/282 - 0s - loss: 0.6696 - accuracy: 0.5779 - val_loss: 0.6440 - val_accuracy: 0.6105
Epoch 2/400
282/282 - 0s - loss: 0.6387 - accuracy: 0.6112 - val_loss: 0.6243 - val_accuracy: 0.6209
Epoch 3/400
282/282 - 0s - loss: 0.6251 - accuracy: 0.6236 - val_loss: 0.6135 - val_accuracy: 0.6238
Epoch 4/400
282/282 - 0s - loss: 0.6166 - accuracy: 0.6263 - val_loss: 0.6099 - val_accuracy: 0.6295
Epoch 5/400
282/282 - 0s - loss: 0.6098 - accuracy: 0.6299 - val_loss: 0.6048 - val_accuracy: 0.6342
Epoch 6/400
282/282 - 0s - loss: 0.6046 - accuracy: 0.6335 - val_loss: 0.5995 - val_accuracy: 0.6359
Epoch 7/400
282/282 - 0s - loss: 0.6003 - accuracy: 0.6375 - val_loss: 0.5977 - val_accuracy: 0.6305
Epoch 8/400
282/282 - 0s - loss: 0.5991 - accuracy: 0.6378 - val_loss: 0.5983 - val_accuracy: 0.6362
Epoch 9/400
282/282 - 0s - loss: 0.5969 - accuracy: 0.6371 - val_loss: 0.5905 - val_accuracy: 0.6376
Epoch 10/400
282/282 - 0s - loss: 0.5945 - accuracy: 0.6408 - val_loss: 0.5903 - val_accura

Epoch 82/400
282/282 - 0s - loss: 0.5754 - accuracy: 0.6510 - val_loss: 0.5763 - val_accuracy: 0.6492
Epoch 83/400
282/282 - 0s - loss: 0.5755 - accuracy: 0.6508 - val_loss: 0.5772 - val_accuracy: 0.6496
Epoch 84/400
282/282 - 0s - loss: 0.5755 - accuracy: 0.6523 - val_loss: 0.5768 - val_accuracy: 0.6476
Epoch 85/400
282/282 - 0s - loss: 0.5752 - accuracy: 0.6506 - val_loss: 0.5786 - val_accuracy: 0.6476
Epoch 86/400
282/282 - 0s - loss: 0.5754 - accuracy: 0.6502 - val_loss: 0.5766 - val_accuracy: 0.6466
Epoch 87/400
282/282 - 0s - loss: 0.5761 - accuracy: 0.6517 - val_loss: 0.5762 - val_accuracy: 0.6480
Epoch 88/400
282/282 - 0s - loss: 0.5751 - accuracy: 0.6508 - val_loss: 0.5762 - val_accuracy: 0.6464
Epoch 89/400
282/282 - 0s - loss: 0.5761 - accuracy: 0.6506 - val_loss: 0.5796 - val_accuracy: 0.6488
Epoch 90/400
282/282 - 0s - loss: 0.5758 - accuracy: 0.6494 - val_loss: 0.5764 - val_accuracy: 0.6478
Epoch 91/400
282/282 - 0s - loss: 0.5753 - accuracy: 0.6514 - val_loss: 0.5753 - v

Epoch 162/400
282/282 - 0s - loss: 0.5709 - accuracy: 0.6519 - val_loss: 0.5740 - val_accuracy: 0.6502
Epoch 163/400
282/282 - 0s - loss: 0.5708 - accuracy: 0.6524 - val_loss: 0.5749 - val_accuracy: 0.6515
Epoch 164/400
282/282 - 0s - loss: 0.5717 - accuracy: 0.6542 - val_loss: 0.5737 - val_accuracy: 0.6491
Epoch 165/400
282/282 - 0s - loss: 0.5708 - accuracy: 0.6514 - val_loss: 0.5753 - val_accuracy: 0.6472
Epoch 166/400
282/282 - 0s - loss: 0.5716 - accuracy: 0.6534 - val_loss: 0.5738 - val_accuracy: 0.6498
Epoch 167/400
282/282 - 0s - loss: 0.5707 - accuracy: 0.6537 - val_loss: 0.5751 - val_accuracy: 0.6454
Epoch 168/400
282/282 - 0s - loss: 0.5704 - accuracy: 0.6527 - val_loss: 0.5747 - val_accuracy: 0.6479
Epoch 169/400
282/282 - 0s - loss: 0.5699 - accuracy: 0.6535 - val_loss: 0.5762 - val_accuracy: 0.6480
Epoch 170/400
282/282 - 0s - loss: 0.5707 - accuracy: 0.6520 - val_loss: 0.5768 - val_accuracy: 0.6476
Epoch 171/400
282/282 - 0s - loss: 0.5703 - accuracy: 0.6526 - val_loss: 

Epoch 242/400
282/282 - 0s - loss: 0.5682 - accuracy: 0.6542 - val_loss: 0.5726 - val_accuracy: 0.6495
Epoch 243/400
282/282 - 0s - loss: 0.5689 - accuracy: 0.6530 - val_loss: 0.5725 - val_accuracy: 0.6450
Epoch 244/400
282/282 - 0s - loss: 0.5677 - accuracy: 0.6528 - val_loss: 0.5725 - val_accuracy: 0.6491
Epoch 245/400
282/282 - 0s - loss: 0.5681 - accuracy: 0.6544 - val_loss: 0.5733 - val_accuracy: 0.6490
Epoch 246/400
282/282 - 0s - loss: 0.5683 - accuracy: 0.6543 - val_loss: 0.5716 - val_accuracy: 0.6491
Epoch 247/400
282/282 - 0s - loss: 0.5680 - accuracy: 0.6531 - val_loss: 0.5701 - val_accuracy: 0.6531
Epoch 248/400
282/282 - 0s - loss: 0.5679 - accuracy: 0.6531 - val_loss: 0.5712 - val_accuracy: 0.6476
Epoch 249/400
282/282 - 0s - loss: 0.5687 - accuracy: 0.6531 - val_loss: 0.5735 - val_accuracy: 0.6489
Epoch 250/400
282/282 - 0s - loss: 0.5684 - accuracy: 0.6531 - val_loss: 0.5711 - val_accuracy: 0.6502
Epoch 251/400
282/282 - 0s - loss: 0.5679 - accuracy: 0.6528 - val_loss: 

Epoch 322/400
282/282 - 0s - loss: 0.5662 - accuracy: 0.6541 - val_loss: 0.5713 - val_accuracy: 0.6458
Epoch 323/400
282/282 - 0s - loss: 0.5659 - accuracy: 0.6556 - val_loss: 0.5723 - val_accuracy: 0.6476
Epoch 324/400
282/282 - 0s - loss: 0.5670 - accuracy: 0.6536 - val_loss: 0.5714 - val_accuracy: 0.6548
Epoch 325/400
282/282 - 0s - loss: 0.5655 - accuracy: 0.6549 - val_loss: 0.5704 - val_accuracy: 0.6538
Epoch 326/400
282/282 - 0s - loss: 0.5656 - accuracy: 0.6540 - val_loss: 0.5697 - val_accuracy: 0.6508
Epoch 327/400
282/282 - 0s - loss: 0.5660 - accuracy: 0.6551 - val_loss: 0.5748 - val_accuracy: 0.6509
Epoch 328/400
282/282 - 0s - loss: 0.5664 - accuracy: 0.6544 - val_loss: 0.5740 - val_accuracy: 0.6490
Epoch 329/400
282/282 - 0s - loss: 0.5659 - accuracy: 0.6550 - val_loss: 0.5715 - val_accuracy: 0.6501
Epoch 330/400
282/282 - 0s - loss: 0.5660 - accuracy: 0.6549 - val_loss: 0.5711 - val_accuracy: 0.6509
Epoch 331/400
282/282 - 0s - loss: 0.5663 - accuracy: 0.6536 - val_loss: 

<tensorflow.python.keras.callbacks.History at 0x7f9fe005e490>

In [197]:
pred_test = model.predict(X_test)
y_classes = pred_test.argmax(axis=-1)

In [198]:
print(classification_report(y_test, y_classes))

              precision    recall  f1-score   support

           0       0.68      0.60      0.63      9938
           1       0.64      0.72      0.68     10062

    accuracy                           0.66     20000
   macro avg       0.66      0.66      0.66     20000
weighted avg       0.66      0.66      0.66     20000

