In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype
import warnings
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB
from lightgbm import LGBMClassifier
from sklearn.feature_selection import RFE
import itertools
from xgboost import XGBClassifier
from tabulate import tabulate

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the training and test CSV files from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Traindata.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Testdata.csv')

In [None]:
# Column names, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Summary statistics restricted to the object-dtype columns.
train.describe(include=['object'])

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Report, for every column that has at least one missing value, the absolute
# count and the percentage of rows affected.
total = len(train)
null_counts = train.isnull().sum()
missing_colums = list(null_counts[null_counts > 0].index)
for col in missing_colums:
  null_count = null_counts[col]
  per = null_count / total * 100
  print(f"{col}: {null_count} ({round(per, 3)}%)")

In [None]:
  # Count rows that are exact duplicates of an earlier row.
  # NOTE(review): this top-level statement is indented; presumably it only runs
  # because IPython strips leading cell indentation — confirm, or dedent it.
  print(f"Number of duplicate rows: {train.duplicated().sum()}")

In [None]:
# Bar chart of the number of rows for each value of the 'class' column.
sns.countplot(x=train['class'])

In [None]:
# Print the number of training rows for each value of the 'class' column.
print("Class distribution training set")
print(train['class'].value_counts())

In [None]:
def le(df, encoders=None):
  """Label-encode every object-dtype column of ``df`` in place.

  Args:
    df: DataFrame whose object-dtype columns are replaced by integer codes.
    encoders: Optional dict mapping column name -> fitted LabelEncoder.
      Columns present in it are encoded with that existing encoder; columns
      missing from it get a new encoder fitted on ``df`` and stored in it.

  Returns:
    The dict of fitted encoders (the one passed in, or a new one), so the
    same category -> integer mapping can be reused on another frame.
  """
  if encoders is None:
    encoders = {}
  for col in df.columns:
    if df[col].dtype == 'object':
      if col not in encoders:
        encoders[col] = LabelEncoder().fit(df[col])
      codes = {label: code for code, label in enumerate(encoders[col].classes_)}
      # Labels that the encoder never saw become -1 instead of raising.
      df[col] = df[col].map(codes).fillna(-1).astype(int)
  return encoders

# Fit the encoders on train only and reuse them for test: fitting a second,
# independent encoder on test can assign different integers to the same category.
label_encoders = le(train)
le(test, label_encoders);

In [None]:
# Remove the 'num_outbound_cmds' column from both frames.
# Rebinding with errors='ignore' (rather than inplace=True) lets this cell be
# re-run without raising a KeyError once the column is already gone.
train = train.drop(columns=['num_outbound_cmds'], errors='ignore')
test = test.drop(columns=['num_outbound_cmds'], errors='ignore')

In [None]:
# Preview the first rows of the training frame after encoding and column removal.
train.head()

In [None]:
# Separate the target column from the remaining feature columns.
Y_train = train['class']
X_train = train.drop(columns=['class'])

In [None]:
# Recursive feature elimination with a random forest, keeping 10 features.
# The forest is seeded so the selected feature list is the same on every run.
rfc = RandomForestClassifier(random_state=42)
rfe = RFE(rfc, n_features_to_select=10)
rfe = rfe.fit(X_train, Y_train)
# get_support() yields one boolean per column, in column order.
feature_map = list(zip(rfe.get_support(), X_train.columns))
selected_features = [name for keep, name in feature_map if keep]
selected_features

In [None]:
# Keep only the columns chosen by RFE.
X_train = X_train[selected_features]

In [None]:
# Standardise the features. The scaler is fitted on the training features only;
# the test frame is reduced to the same selected columns and transformed with
# those training statistics, so both sets share one feature space and scale.
scale = StandardScaler()
X_train = scale.fit_transform(X_train)
test = scale.transform(test[selected_features])

In [None]:
# Keep 70% of the rows for fitting and hold out the rest; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, train_size=0.70, random_state=2)

In [None]:
# Shape of the training feature matrix.
x_train.shape

In [None]:
# Shape of the held-out feature matrix.
x_test.shape

In [None]:
# Shape of the training target vector.
y_train.shape

In [None]:
# Shape of the held-out target vector.
y_test.shape

In [None]:
import time

In [None]:
from sklearn.linear_model import LogisticRegression

# Measure the wall-clock time of fitting a logistic regression with a very
# large iteration cap.
clfl = LogisticRegression(max_iter=1200000)
fit_begin = time.time()
clfl.fit(x_train, y_train.to_numpy().ravel())
fit_elapsed = time.time() - fit_begin
print("Training time:", fit_elapsed)

In [None]:
# Time inference with the logistic regression. Predict on x_test (not x_train)
# so that both the measured time and y_test_pred refer to the held-out split.
start_time = time.time()
y_test_pred = clfl.predict(x_test)
end_time = time.time()
print("Testing time:", end_time-start_time)

In [None]:
# Fit a second logistic regression (default iteration cap, fixed seed); this is
# the model whose scores are reported.
lg_model = LogisticRegression(random_state=42)
lg_model.fit(x_train, y_train)

In [None]:
# Mean accuracy of lg_model on the training split and on the held-out split.
lg_train = lg_model.score(x_train, y_train)
lg_test = lg_model.score(x_test, y_test)
print(f"Training score: {lg_train}")
print(f"Test score: {lg_test}")

In [None]:
pip install optuna

In [None]:
import optuna
# Raise Optuna's log threshold to WARNING so lower-level log output is hidden.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
def objective(trial):
  """Optuna objective for KNN: accuracy on (x_test, y_test) for a sampled k.

  Args:
    trial: Optuna trial used to sample ``KNN_n_neighbors`` from 2 to 16.

  Returns:
    Mean accuracy of the fitted classifier on the held-out split.
  """
  k = trial.suggest_int("KNN_n_neighbors", 2, 16, log=False)
  candidate = KNeighborsClassifier(n_neighbors=k)
  candidate.fit(x_train, y_train)
  return candidate.score(x_test, y_test)

In [None]:
# Search n_neighbors over 30 trials; a single trial would only evaluate one
# randomly sampled k. The seeded sampler makes the search repeatable.
study_KNN = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_KNN.optimize(objective, n_trials=30)
print(study_KNN.best_trial)

In [None]:
# Refit KNN with the best n_neighbors found by the study and score both splits.
best_k = study_KNN.best_trial.params['KNN_n_neighbors']
KNN_model = KNeighborsClassifier(n_neighbors=best_k)
KNN_model.fit(x_train, y_train)
KNN_train = KNN_model.score(x_train, y_train)
KNN_test = KNN_model.score(x_test, y_test)
print(f"Train Score: {KNN_train}")
print(f"Test Score: {KNN_test}")

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Measure the wall-clock time of fitting a depth-4 entropy decision tree.
clfd = DecisionTreeClassifier(criterion="entropy", max_depth=4)
fit_begin = time.time()
clfd.fit(x_train, y_train.to_numpy().ravel())
fit_elapsed = time.time() - fit_begin
print("Training time: ", fit_elapsed)

In [None]:
# Time inference with the decision tree. Predict on x_test (not x_train) so
# that both the measured time and y_test_pred refer to the held-out split.
start_time = time.time()
y_test_pred = clfd.predict(x_test)
end_time = time.time()
print("Testing time: ", end_time-start_time)

In [None]:
def objective(trial):
  """Optuna objective for a decision tree: accuracy on (x_test, y_test).

  Samples ``dt_max_depth`` (2..32) and ``dt_max_features`` (2..10) and fits a
  tree with them. The tree is seeded because, with max_features below the
  number of columns, candidate features are drawn at random at each split;
  a fixed seed makes a given parameter pair always produce the same score.

  Args:
    trial: Optuna trial used to sample the two hyperparameters.

  Returns:
    Mean accuracy of the fitted tree on the held-out split.
  """
  dt_max_depth = trial.suggest_int('dt_max_depth', 2, 32, log=False)
  dt_max_features = trial.suggest_int('dt_max_features', 2, 10, log=False)
  classifier_obj = DecisionTreeClassifier(max_features=dt_max_features, max_depth=dt_max_depth, random_state=42)
  classifier_obj.fit(x_train, y_train)
  accuracy = classifier_obj.score(x_test, y_test)
  return accuracy

In [None]:
# Run 30 trials of the decision-tree search; the seeded sampler makes the
# sequence of sampled hyperparameters repeatable across runs.
study_dt = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_dt.optimize(objective, n_trials=30)
print(study_dt.best_trial)

In [None]:
# Refit a decision tree with the best hyperparameters from the study and score
# both splits. The tree is seeded so the reported scores are reproducible
# (max_features below the column count makes an unseeded tree vary run to run).
best_dt_params = study_dt.best_trial.params
dt = DecisionTreeClassifier(
    max_features=best_dt_params['dt_max_features'],
    max_depth=best_dt_params['dt_max_depth'],
    random_state=42,
)
dt.fit(x_train, y_train)
dt_train, dt_test = dt.score(x_train, y_train), dt.score(x_test, y_test)
print(f"Train Score: {dt_train}")
print(f"Test Score: {dt_test}")

In [None]:
# Side-by-side table of train/test accuracy for the three tuned models.
col_names = ["Model", "Train Score", "Test Score"]
data = [
    ["KNN", KNN_train, KNN_test],
    ["Logistic Regression", lg_train, lg_test],
    ["Decision Tree", dt_train, dt_test],
]
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

In [None]:
# Default-parameter estimators used for cross-validation and test reporting.
# SEED is passed to every estimator that accepts random_state so repeated runs
# give the same results (KNeighborsClassifier has no random_state parameter).
SEED = 42
dtc = DecisionTreeClassifier(random_state=SEED)
knn = KNeighborsClassifier()
lr = LogisticRegression(random_state=SEED)

In [None]:
from sklearn.model_selection import cross_val_score
# Display name -> estimator, in the order the models are reported.
models = {
    'KNeighborsClassifier': knn,
    'LogisticRegression': lr,
    'DecisionTreeClassifier': dtc,
}

In [None]:
# 10-fold cross-validated precision and recall on the training split:
# scores[model name][scorer] holds the array of per-fold values.
scores = {
  name: {
    scorer: cross_val_score(model, x_train, y_train, cv=10, scoring=scorer)
    for scorer in ['precision', 'recall']
  }
  for name, model in models.items()
}

In [None]:
def line(name, sym="*"):
  """Return a run of ``sym`` used as a banner border around ``name``.

  The run is ``25 - len(name)//2`` characters long, so longer names get
  shorter borders; the result is empty once that length is not positive.

  Args:
    name: Text the border will surround.
    sym: Character to repeat. Defaults to '*'.

  Returns:
    The border string.
  """
  return sym*(25-len(name)//2)

# For each model print a banner, then the mean and standard deviation (in
# percent, two decimals) of its cross-validated precision and recall.
for name in models:
  banner = line(name)
  print(banner, name, 'Model Validation', banner)

  for scorer in ['precision', 'recall']:
    fold_scores = scores[name][scorer]
    mean = round(np.mean(fold_scores)*100, 2)
    stdev = round(np.std(fold_scores)*100, 2)
    print("Mean {}:".format(scorer), "\n", mean, "%", "+-", stdev)
    print()

In [None]:
# Collapse the per-fold arrays to their means without mutating the nested dict,
# then build a frame with one row per model and one column per scorer, in percent.
mean_scores = {
    name: {scorer: scores[name][scorer].mean() for scorer in ['precision', 'recall']}
    for name in models
}
# .T replaces the deprecated DataFrame.swapaxes("index", "columns").
scores = pd.DataFrame(mean_scores).T*100
scores.plot(kind = "bar", ylim=[80,100], figsize=(24,6), rot=0)

In [None]:
# Rebuild the display name -> estimator mapping used by the evaluation cells.
models = {
    'KNeighborsClassifier': knn,
    'LogisticRegression': lr,
    'DecisionTreeClassifier': dtc,
}

In [None]:
# Fit every model on the training split and store its held-out predictions
# under the model's display name.
preds = {}
for name, model in models.items():
    preds[name] = model.fit(x_train, y_train).predict(x_test)
print("Predictions complete.")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score
def line(name,sym="*"):
    """Build the border string printed on either side of ``name``.

    The border repeats ``sym`` ``25 - len(name)//2`` times and is empty when
    that count is zero or negative.
    """
    width = 25 - len(name) // 2
    return sym * width
# classification_report applies target_names to the sorted integer labels, so
# the first name describes class 0 and the second class 1.
# NOTE(review): assumes the raw labels were 'anomaly' and 'normal', which
# LabelEncoder orders alphabetically (anomaly -> 0, normal -> 1) — confirm
# against the raw 'class' column.
target_names = ["anomaly", "normal"]
for name in models:
    print(line(name), name, 'Model Testing', line(name))
    print(confusion_matrix(y_test, preds[name]))
    print(line(name,'-'))
    print(classification_report(y_test, preds[name], target_names=target_names))

In [None]:
# F1 on the held-out split for every model (f1_score's default positive label),
# shown in percent as a bar chart with one bar per model.
f1s = {name: f1_score(y_test, preds[name]) for name in models}
f1s = pd.DataFrame(list(f1s.values()), index=list(f1s.keys()), columns=["F1-score"])*100
f1s.plot(kind = "bar", ylim=[80,100], figsize=(10,6), rot=0)