In [1]:
from __future__ import division
import pandas as pd
import numpy as np
import lightgbm as lgb
import os

from hyperopt import tpe
from hyperopt import STATUS_OK
from hyperopt import Trials
from hyperopt import hp
from hyperopt import fmin

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, average_precision_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

from imblearn.over_sampling import ADASYN
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.graph_objs as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

%matplotlib inline

Using TensorFlow backend.


In [19]:
df = pd.read_pickle('cleaned_df.pkl')

In [None]:
df.head()

In [20]:
y = df['Target'] 
y1 = y.copy()
df.drop(columns=['Label', 'Target'], inplace=True) 

In [21]:
X = df
X, X_test, y, y_test = train_test_split(X, y, test_size=0.20, random_state=21, stratify = y)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=21, stratify = y)

## Normal Logistic Regression

In [22]:
class_labels = ['Benign', 'Brute Force', 'SQL Injection']
scaler = StandardScaler() 
X_train_scaled = scaler.fit_transform(X_train.values) 
X_val_scaled = scaler.transform(X_val.values)
X_test_scaled = scaler.transform(X_test.values)

In [None]:
logistic = LogisticRegression(solver='lbfgs', multi_class='ovr', class_weight='balanced', max_iter=10000, n_jobs=-1)
logistic.fit(X_train_scaled,y_train)
y_pred = logistic.predict(X_val_scaled)
print(classification_report(y_val, y_pred, target_names=class_labels))

In [23]:
logistic = LogisticRegression(solver='lbfgs', multi_class='ovr', max_iter=1000, n_jobs=-1)
logistic.fit(X_train_scaled,y_train)
y_pred = logistic.predict(X_val_scaled)
print(classification_report(y_val, y_pred, target_names=class_labels))

               precision    recall  f1-score   support

       Benign       1.00      1.00      1.00    168386
  Brute Force       0.95      0.97      0.96        76
SQL Injection       0.86      0.50      0.63        12

     accuracy                           1.00    168474
    macro avg       0.94      0.82      0.86    168474
 weighted avg       1.00      1.00      1.00    168474



In [24]:
y_test_pred = logistic.predict(X_test_scaled)
print(classification_report(y_test, y_test_pred, target_names=class_labels))

               precision    recall  f1-score   support

       Benign       1.00      1.00      1.00    168386
  Brute Force       0.96      0.91      0.93        77
SQL Injection       0.75      0.55      0.63        11

     accuracy                           1.00    168474
    macro avg       0.90      0.82      0.85    168474
 weighted avg       1.00      1.00      1.00    168474



In [27]:
from sklearn.linear_model import LogisticRegressionCV

kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=27)
clf = LogisticRegressionCV(cv=kfolds.split(X_scaled, y), random_state=0,
                           multi_class='ovr', max_iter=1000, n_jobs=-1).fit(X_scaled, y)


lbfgs failed to converge. Increase the number of iterations.


lbfgs failed to converge. Increase the number of iterations.



In [29]:
X_train_scaled.shape

(505422, 56)

In [28]:
y_test_pred = clf.predict(X_test_scaled)
print(classification_report(y_test, y_test_pred, target_names=class_labels))

               precision    recall  f1-score   support

       Benign       1.00      1.00      1.00    168386
  Brute Force       0.95      0.95      0.95        77
SQL Injection       0.70      0.64      0.67        11

     accuracy                           1.00    168474
    macro avg       0.88      0.86      0.87    168474
 weighted avg       1.00      1.00      1.00    168474



## GridSearchCV for Logistic Regression Optimization

In [25]:
std = StandardScaler() 
X_scaled = std.fit_transform(X.values)
X_cv_test = std.transform(X_test.values)
kfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=27)

In [None]:
param_grid = {'C': [0.01, 1, 5, 10, 20, 50],
              'solver': ['lbfgs'],
              'multi_class' : ['ovr', 'multinomial'],
              'class_weight' : [None, 'balanced'],
             }

logistic = LogisticRegression(max_iter=1000, n_jobs=-1)
clf = GridSearchCV(logistic, param_grid, cv=kfolds.split(X_scaled, y), scoring='f1_macro')
best_model = clf.fit(X_scaled, y)
print('Best Solver:', best_model.best_estimator_.get_params()['solver'])
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best multi-option:', best_model.best_estimator_.get_params()['multi_class'])
print('Best class weights:', best_model.best_estimator_.get_params()['class_weight'])

In [None]:
param_grid = {'C': [0.01, 0.01, 1, 5, 10, 20],
              'solver': ['lbfgs', 'saga'],
              'multi_class' : ['ovr', 'multinomial'],
              'class_weight' : [None, 'balanced'],
             }

logistic = LogisticRegression(max_iter=1000, n_jobs=-1)
clf = GridSearchCV(logistic, param_grid, cv=kfolds, scoring='f1_macro')
best_model = clf.fit(X_scaled, y)
print('Best Solver:', best_model.best_estimator_.get_params()['solver'])
print('Best C:', best_model.best_estimator_.get_params()['C'])
print('Best multi-option:', best_model.best_estimator_.get_params()['multi_class'])
print('Best class weights:', best_model.best_estimator_.get_params()['class_weight'])

In [None]:
X_train.shape

In [33]:
from sklearn.svm import LinearSVC
svm_model = LinearSVC(max_iter=100000)
svm_model.fit(X_train_scaled, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=100000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [34]:
y_svm_pred = svm_model.predict(X_val_scaled)
print(classification_report(y_val, y_svm_pred, target_names=class_labels))

               precision    recall  f1-score   support

       Benign       1.00      1.00      1.00    168386
  Brute Force       0.94      0.97      0.95        76
SQL Injection       0.75      0.50      0.60        12

     accuracy                           1.00    168474
    macro avg       0.90      0.82      0.85    168474
 weighted avg       1.00      1.00      1.00    168474

