In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [27]:
df = pd.read_csv('../data/weblogs.csv')

df = df.drop(['NIGHT', 'ID', 'OTHER_METHOD'], axis=1)

X = df.loc[:, df.columns != 'ROBOT']
y = df['ROBOT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [28]:
param_grid = {
    'C': [0.1, 1, 10, 20, 30, 100],
}

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('grid_search', GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5, n_jobs=-1, verbose=3))
])

In [29]:
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
              precision    recall  f1-score   support

           0       0.98      0.95      0.96     10785
           1       0.81      0.94      0.87      2686

    accuracy                           0.94     13471
   macro avg       0.90      0.94      0.92     13471
weighted avg       0.95      0.94      0.95     13471

[[10198   587]
 [  161  2525]]


In [30]:
pipeline.named_steps['grid_search'].best_estimator_

In [31]:
final_lgc = pipeline.named_steps['grid_search'].best_estimator_

final_lgc.fit(X_train, y_train)

y_pred = final_lgc.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

print("Accuracy: {:.2f}".format(((accuracy_score(y_test, y_pred)) * 100)) + "%")

              precision    recall  f1-score   support

           0       0.80      1.00      0.89     10785
           1       0.46      0.01      0.01      2686

    accuracy                           0.80     13471
   macro avg       0.63      0.50      0.45     13471
weighted avg       0.73      0.80      0.71     13471

[[10766    19]
 [ 2670    16]]
Accuracy: 80.04%


In [32]:
import pickle

pickle.dump(final_lgc, open("../modellen/lgc.sav", 'wb'))