In [None]:
from json import loads
# All processed artefacts read by this notebook live in one directory.
_PROCESSED_DIR = "../data/data_processed"

# JSON token files (read with load_toks).
LOC_TOKS_PATH = _PROCESSED_DIR + "/loctoks.json"
TLD_TOKS_PATH = _PROCESSED_DIR + "/tldtoks.json"

# Line-delimited JSON datasets (read with load_data).
TEST_SET_PATH = _PROCESSED_DIR + "/test.ldjson"
TRAIN_SET_PATH = _PROCESSED_DIR + "/train.ldjson"

def load_toks(fname):
    """Read a JSON file and return its decoded content.

    Args:
        fname: Path of the JSON file to read.

    Returns:
        The Python object the JSON document decodes to.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Pin the encoding so decoding does not depend on the platform's locale
    # default (JSON interchange text is UTF-8).
    with open(fname, 'r', encoding='utf-8') as file:
        return loads(file.read())

def load_data(fname):
    """Load a line-delimited JSON file into a list of decoded records.

    Each non-blank line is decoded as one JSON document. Blank or
    whitespace-only lines (for example an extra trailing newline) are skipped
    instead of being passed to the JSON decoder, where they would raise.

    Args:
        fname: Path of the line-delimited JSON file.

    Returns:
        A list with one decoded object per non-blank line, in file order.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin the encoding so decoding does not depend on the locale default.
    with open(fname, 'r', encoding='utf-8') as file:
        return [loads(line) for line in file if line.strip()]

# Decoded contents of the two token JSON files.
LOC_TOKS = load_toks(LOC_TOKS_PATH)
TLD_TOKS = load_toks(TLD_TOKS_PATH)

# Records of the test and train splits, one list entry per input line.
test_objs = load_data(TEST_SET_PATH)
train_objs = load_data(TRAIN_SET_PATH)

# Sanity check: record counts, test split first, then train split.
print(f"{len(test_objs)} {len(train_objs)}")

In [None]:
# Peek at the first few raw records only; echoing the whole list would flood
# the cell output and bloat the saved notebook.
test_objs[:5]

In [None]:
import pandas as pd
# Tabular view of the test records (one row per record).
test_df = pd.DataFrame(data=test_objs)

In [None]:
# A bare expression uses the notebook's rich DataFrame display (pandas still
# truncates long frames) instead of the plain-text dump print() produces.
test_df

In [None]:
# First five rows of the test frame.
test_df.head(n=5)

In [None]:
# Tabular view of the training records (one row per record).
train_df = pd.DataFrame(data=train_objs)

In [None]:
# First five rows of the training frame.
train_df.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise column correlations of the test frame (Pearson is pandas' default
# method, spelled out here for the reader).
test_cor = test_df.corr(method="pearson")
test_cor

In [None]:
# Annotated heatmap of the test-set correlation matrix on a 15x10 inch canvas.
plt.figure(figsize=(15, 10))
sns.heatmap(test_cor, annot=True)

In [None]:
# Pairwise column correlations of the training frame.
train_cor = train_df.corr()

# Annotated heatmap of the training correlations. The former mid-cell bare
# `train_cor` expression was dropped: only a cell's last expression is
# rendered, so it never produced any output.
plt.figure(figsize=(15, 10))
sns.heatmap(data=train_cor, annot=True)

In [None]:
import numpy as np
# Model inputs: the seven feature columns are stacked and transposed so that
# each row of X is one training record and each column one feature.
feature_columns = ['geo_loc', 'param_len', 'query_len', 'frag_len', 'tld', 'who_is', 'https']
X = np.array([train_df[name] for name in feature_columns]).T

# Labels as a single-column 2-D array; flattened to 1-D in a later cell.
Y1 = np.array([train_df['label']]).T

In [None]:
print(X)

# Flatten the single-column label array into the 1-D vector used for fitting.
Y = Y1.ravel()
print(Y)

In [None]:
# Shapes of the feature matrix and the label vector, shown as a pair.
(X.shape, Y.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline forest: 100 trees, 5 candidate features per split.
# random_state pins bootstrap sampling and feature selection so a
# Restart & Run All reproduces the same model and scores (42 is arbitrary).
rf = RandomForestClassifier(max_features=5, n_estimators=100, random_state=42)


In [None]:
# Fit the baseline forest on the training features and labels.
rf.fit(X=X, y=Y)

In [None]:
# Accuracy on the rows the forest was fitted on is optimistically biased, so
# report it next to the accuracy on the held-out test split.
# Assumes test_df carries the same feature and label columns as train_df —
# TODO confirm against the preprocessing step.
feature_cols = ['geo_loc', 'param_len', 'query_len', 'frag_len', 'tld', 'who_is', 'https']
X_test = np.array([test_df[col] for col in feature_cols]).T
Y_test = np.asarray(test_df['label'])

{"train_accuracy": rf.score(X, Y), "test_accuracy": rf.score(X_test, Y_test)}

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Hyperparameter grid: max_features 1..5 and n_estimators 10..200 in steps of
# 10 (100 combinations; with cv=5 that is 500 fits plus the final refit).
# NOTE(review): X has seven feature columns but the search stops at
# max_features=5 — confirm that this upper bound is intended.
max_features_range = np.arange(1, 6, 1)
n_estimators_range = np.arange(10, 210, 10)
param_grid = dict(max_features=max_features_range, n_estimators=n_estimators_range)

# Seed the estimator so cross-validated scores (and therefore the selected
# best parameters) are reproducible between runs; 42 is arbitrary.
# Note: this rebinds `rf` to a fresh, unfitted estimator.
rf = RandomForestClassifier(random_state=42)

grid = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)

In [None]:
# Run the cross-validated grid search on the training data.
grid.fit(X=X, y=Y)

In [None]:
# Summarise the winning parameter combination and its mean CV score.
best_summary = "The best parameters are %s with a score of %0.2f"
print(best_summary % (grid.best_params_, grid.best_score_))

In [None]:
import pandas as pd

# One row per grid point: the parameter values next to the mean CV test score.
params_frame = pd.DataFrame(grid.cv_results_["params"])
accuracy_frame = pd.DataFrame(grid.cv_results_["mean_test_score"], columns=["Accuracy"])
grid_results = pd.concat([params_frame, accuracy_frame], axis=1)
grid_results.head()

In [None]:
# Mean accuracy per (max_features, n_estimators) pair, indexed by that pair.
grouped_by_params = grid_results.groupby(by=['max_features', 'n_estimators'])
grid_contour = grouped_by_params.mean()
grid_contour

In [None]:
# Move the (max_features, n_estimators) group index back into ordinary columns.
grid_reset = grid_contour.reset_index()
grid_reset.columns = ['max_features', 'n_estimators', 'Accuracy']

# Reshape to a max_features x n_estimators table. DataFrame.pivot takes its
# arguments by keyword only since pandas 2.0 (positional use raises
# TypeError); keywords work on older versions as well. `values` is left
# unset on purpose so the result keeps two-level columns, which the next cell
# indexes with `columns.levels[1]`.
grid_pivot = grid_reset.pivot(index='max_features', columns='n_estimators')
grid_pivot

In [None]:
# Plot inputs taken from the pivot table:
#   x - second level of the pivoted column index (the n_estimators values)
#   y - row index (the max_features values)
#   z - the table's cell values as a 2-D array
pivot_columns = grid_pivot.columns
x = pivot_columns.levels[1].values
y = grid_pivot.index.values
z = grid_pivot.values

In [None]:
import plotly.graph_objects as go

# Axis titles; `layout` is reused by the surface plot in the next cell.
x_axis = go.layout.XAxis(title=go.layout.xaxis.Title(text='n_estimators'))
y_axis = go.layout.YAxis(title=go.layout.yaxis.Title(text='max_features'))
layout = go.Layout(xaxis=x_axis, yaxis=y_axis)

# Contour plot of the pivoted grid-search scores.
contour_trace = go.Contour(z=z, x=x, y=y)
fig = go.Figure(data=[contour_trace], layout=layout)

fig.update_layout(
    title='Hyperparameter tuning',
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=65, r=50, b=65, t=90),
)

fig.show()

In [None]:
import plotly.graph_objects as go


# 3-D surface over the same grid, reusing the layout from the contour cell.
surface_trace = go.Surface(z=z, y=y, x=x)
fig = go.Figure(data=[surface_trace], layout=layout)

# Axis titles for the 3-D scene.
scene_titles = dict(
    xaxis_title='n_estimators',
    yaxis_title='max_features',
    zaxis_title='Accuracy',
)
fig.update_layout(
    title='Hyperparameter tuning',
    scene=scene_titles,
    autosize=False,
    width=800,
    height=800,
    margin=dict(l=65, r=50, b=65, t=90),
)
fig.show()


In [None]:
from sklearn.svm import SVC
# RBF-kernel SVM with gamma fixed at 1.5; fit() returns the estimator itself,
# so construction and fitting are chained.
model_svm = SVC(kernel="rbf", gamma=1.5).fit(X, Y)

# Predictions for the same rows the model was fitted on (in-sample).
Y_pred = model_svm.predict(X)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
# In-sample metrics (predictions for the rows the SVM was fitted on). Kept for
# reference, but optimistically biased as an estimate of real performance.
cm_train = confusion_matrix(Y, Y_pred)
report_train = classification_report(Y, Y_pred)

# Held-out metrics on the test split; these are what `cm` and `report` expose.
# Assumes test_df carries the same feature and label columns as train_df —
# TODO confirm against the preprocessing step.
feature_cols = ['geo_loc', 'param_len', 'query_len', 'frag_len', 'tld', 'who_is', 'https']
X_test = np.array([test_df[col] for col in feature_cols]).T
Y_test = np.asarray(test_df['label'])
Y_test_pred = model_svm.predict(X_test)

cm = confusion_matrix(Y_test, Y_test_pred)
report = classification_report(Y_test, Y_test_pred)

In [None]:
# Display the confusion matrix computed in the previous cell.
cm

In [None]:
# Print the classification report built in the metrics cell above.
print(report)