In [2]:
import pandas as pd
import numpy as np
import sklearn
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import ConfusionMatrixDisplay

# Load the weblogs CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/weblogs.csv')

In [3]:
# Remove the row identifier so it is not used as a feature.
# `df` is rebound here, so without errors='ignore' a second execution of this
# cell would raise KeyError because 'ID' is already gone.
df = df.drop(columns=['ID'], errors='ignore')

# Target is the ROBOT column; every other remaining column is a feature.
y = df['ROBOT']
X = df.drop(columns=['ROBOT'])

# Replace missing feature values with 0.
X = X.fillna(0)

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Bagging ensemble of 500 decision trees.
# NOTE: max_samples=100 is an int, so each tree is fit on 100 rows drawn with
# replacement (bootstrap=True) from X_train, not on 100% of the data.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Predictions on the hold-out set, scored in the following cells.
y_pred = bag_clf.predict(X=X_test)

In [5]:
# Hold-out accuracy of the bagging ensemble (all features except ID).
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9495211936753025


In [6]:
# Same ensemble as the previous bagging cell, refit with oob_score=True so that
# an out-of-bag accuracy estimate is computed during fit.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Displayed as the cell output.
bag_clf.oob_score_

0.9510402553775914

In [7]:
# Score the OOB-enabled ensemble on the hold-out set for comparison with oob_score_.
y_pred = bag_clf.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9495211936753025

In [8]:
# Per-class precision / recall / F1 on the hold-out set.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97     10785
           1       0.84      0.92      0.88      2686

    accuracy                           0.95     13471
   macro avg       0.91      0.94      0.92     13471
weighted avg       0.95      0.95      0.95     13471



In [9]:
# Fit a single decision tree (fixed seed) on the training split and read its
# feature importances; these come from this tree, not from the bagging ensemble.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

features = X.columns
importances = dt_clf.feature_importances_

# One row per feature, most important first.
df_importances = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)
df_importances

Unnamed: 0,feature,importance
14,UNASSIGNED,0.68898
24,DATA,0.10087
25,PPI,0.039181
29,PENALTY,0.035098
19,HTML_TO_JS,0.025089
5,HTTP_RESPONSE_2XX,0.021339
16,TOTAL_HTML,0.013999
21,DEPTH,0.012293
2,AVERAGE_TIME,0.011318
1,TOTAL_DURATION,0.009855


OTHER_METHOD en NIGHT had ik terug toegevoegd (normaal had ik ze eerst gedropt vooraleer ze te testen) --> OTHER_METHOD kan weggelaten worden (geen feature importance); NIGHT ga ik wel terug toevoegen.

### TEST

alle features droppen die minder dan 0.001 feature importance hebben

In [10]:
# Reload the CSV so this experiment starts from the unmodified data
# (df was rebound to a reduced frame earlier in the notebook).
df = pd.read_csv(filepath_or_buffer='../data/weblogs.csv')

In [11]:
# Drop the identifier plus the features excluded in this experiment.
# `df` is rebound, so errors='ignore' is needed to keep the cell re-runnable:
# without it a second execution raises KeyError on the already-removed columns.
df = df.drop(
    columns=[
        'ID',
        'OTHER_METHOD',
        'SF_REFERRER',
        'REPEATED_REQUESTS',
        'HEAD_METHOD',
        'POST_METHOD',
        'HTTP_RESPONSE_4XX',
        'HTTP_RESPONSE_3XX',
        'HTTP_RESPONSE_5XX',
    ],
    errors='ignore',
)

In [12]:
# Target column, then every other remaining column as features with NaN -> 0.
y = df['ROBOT']
X = df.drop(columns='ROBOT').fillna(0)

# Same test_size and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Bagging ensemble with the same hyper-parameters as before, now fit on the
# reduced feature set. max_samples=100 (int) means 100 rows per tree.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Hold-out predictions for the metrics cells that follow.
y_pred = bag_clf.predict(X=X_test)

In [14]:
# Hold-out accuracy after dropping the low-importance features.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9498181278301536


Accuracy zonder de gedropte features --> een klein beetje verbeterd (0.9495 -> 0.9498, d.w.z. 4 van de 13471 testvoorbeelden extra correct)

In [15]:
# Refit the ensemble with oob_score=True to obtain an out-of-bag accuracy
# estimate on the reduced feature set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Displayed as the cell output.
bag_clf.oob_score_

0.9517455132607041

In [16]:
# Hold-out accuracy of the OOB-enabled ensemble (reduced feature set).
y_pred = bag_clf.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9498181278301536

Hier is de accuracy ook verbeterd t.o.v. het eerste experiment (0.9495 -> 0.9498); de OOB-score steeg van 0.9510 naar 0.9517.

In [17]:
# Per-class metrics on the hold-out set, to compare with the first experiment.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97     10785
           1       0.84      0.92      0.88      2686

    accuracy                           0.95     13471
   macro avg       0.91      0.94      0.92     13471
weighted avg       0.95      0.95      0.95     13471



Classification report --> precies hetzelfde --> features kunnen gedropped worden

### Features met NaN-values weglaten en testen

In [18]:
# Reload the CSV so the next experiment starts from the unmodified data.
df = pd.read_csv(filepath_or_buffer='../data/weblogs.csv')

In [19]:
# Drop the two columns this section removes for containing NaN values
# (SF_FILETYPE, STANDARD_DEVIATION), together with the identifier and the
# features already excluded in the previous experiment.
# `df` is rebound, so errors='ignore' is needed to keep the cell re-runnable:
# without it a second execution raises KeyError on the already-removed columns.
df = df.drop(
    columns=[
        'SF_FILETYPE',
        'STANDARD_DEVIATION',
        'ID',
        'OTHER_METHOD',
        'SF_REFERRER',
        'REPEATED_REQUESTS',
        'HEAD_METHOD',
        'POST_METHOD',
        'HTTP_RESPONSE_4XX',
        'HTTP_RESPONSE_3XX',
        'HTTP_RESPONSE_5XX',
    ],
    errors='ignore',
)

In [20]:
# Target column, then every other remaining column as features.
# No fillna here, unlike the earlier experiments.
y = df['ROBOT']
X = df.drop(columns='ROBOT')

# Same test_size and random_state as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Bagging ensemble with unchanged hyper-parameters, fit on the current feature set.
# max_samples=100 (int) means 100 rows per tree.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Hold-out predictions for the metrics cells that follow.
y_pred = bag_clf.predict(X=X_test)

In [22]:
# Hold-out accuracy for the current experiment.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9501892955237177


Accuracy --> alweer een klein beetje verbeterd (0.9498 -> 0.9502)

In [23]:
# Refit the ensemble with oob_score=True to obtain an out-of-bag accuracy
# estimate for the current feature set.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Displayed as the cell output.
bag_clf.oob_score_

0.951912548022494

In [24]:
# Hold-out accuracy of the OOB-enabled ensemble for the current experiment.
y_pred = bag_clf.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9501892955237177

In [25]:
# Per-class metrics on the hold-out set for the current experiment.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97     10785
           1       0.84      0.92      0.88      2686

    accuracy                           0.95     13471
   macro avg       0.91      0.94      0.92     13471
weighted avg       0.95      0.95      0.95     13471



Accuracy alweer verbeterd en classification report is gelijk gebleven --> features droppen

In [26]:
# Recompute single-tree feature importances on the current feature set;
# these come from one DecisionTreeClassifier, not from the bagging ensemble.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

features = X.columns
importances = dt_clf.feature_importances_

# One row per feature, most important first.
df_importances = (
    pd.DataFrame({'feature': features, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)
df_importances

Unnamed: 0,feature,importance
6,UNASSIGNED,0.689603
16,DATA,0.101103
17,PPI,0.040181
19,PENALTY,0.035089
11,HTML_TO_JS,0.025473
3,HTTP_RESPONSE_2XX,0.021193
8,TOTAL_HTML,0.014297
13,DEPTH,0.014145
2,AVERAGE_TIME,0.012834
1,TOTAL_DURATION,0.011809


### Nog eens alles weglaten onder 0.001 feature importance

In [29]:
# Start again from the unmodified CSV.
df = pd.read_csv(filepath_or_buffer='../data/weblogs.csv')

# Keep only the ten features listed in the importance table above.
# No fillna is applied to this selection.
y = df['ROBOT']
X = df.loc[
    :,
    [
        'UNASSIGNED',
        'DATA',
        'PPI',
        'PENALTY',
        'HTML_TO_JS',
        'HTTP_RESPONSE_2XX',
        'TOTAL_HTML',
        'DEPTH',
        'AVERAGE_TIME',
        'TOTAL_DURATION',
    ],
]

In [30]:
# 80/20 hold-out split with the same test_size and random_state as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
# Bagging ensemble with unchanged hyper-parameters, fit on the ten selected features.
# max_samples=100 (int) means 100 rows per tree.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Hold-out predictions for the metrics cells that follow.
y_pred = bag_clf.predict(X=X_test)

In [33]:
# Hold-out accuracy using only the ten selected features.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.952861702917378


In [34]:
# Refit the ensemble with oob_score=True to obtain an out-of-bag accuracy
# estimate for the ten selected features.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Displayed as the cell output.
bag_clf.oob_score_

0.9543067129414822

In [35]:
# Hold-out accuracy of the OOB-enabled ensemble (ten selected features).
y_pred = bag_clf.predict(X=X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.952861702917378

In [36]:
# Per-class metrics on the hold-out set for the ten-feature model.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97     10785
           1       0.86      0.92      0.89      2686

    accuracy                           0.95     13471
   macro avg       0.92      0.94      0.93     13471
weighted avg       0.95      0.95      0.95     13471



Classification report is verbeterd!