In [432]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from copy import deepcopy

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler


In [321]:
df_save = pd.read_csv('./data/dataset_lab1.csv', sep=',')
df = deepcopy(df_save)

In [322]:
df

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,CONTENT_LENGTH,WHOIS_COUNTRY,WHOIS_STATEPRO,WHOIS_REGDATE,WHOIS_UPDATED_DATE,...,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,iso-8859-1,nginx,263.0,,,10/10/2015 18:21,,...,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,UTF-8,Apache/2.4.10,15087.0,,,,,...,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,us-ascii,Microsoft-HTTPAPI/2.0,324.0,,,,,...,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,ISO-8859-1,nginx,162.0,US,AK,7/10/1997 4:00,12/09/2013 0:45,...,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,UTF-8,,124140.0,US,TX,12/05/1996 0:00,11/04/2017 0:00,...,2,5,4278,61,62,129889,4586,61,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,M4_48,194,16,UTF-8,Apache,,ES,Barcelona,17/09/2008 0:00,2/09/2016 0:00,...,0,0,0,0,3,186,0,0,0.0,1
1777,M4_41,198,17,UTF-8,Apache,,ES,Barcelona,17/09/2008 0:00,2/09/2016 0:00,...,0,0,0,0,2,124,0,0,0.0,1
1778,B0_162,201,34,utf-8,Apache/2.2.16 (Debian),8904.0,US,FL,15/02/1999 0:00,15/07/2015 0:00,...,2,6,6631,87,89,132181,6945,87,4.0,0
1779,B0_1152,234,34,ISO-8859-1,cloudflare-nginx,,US,CA,1/04/1998 0:00,9/12/2016 0:00,...,0,0,0,0,0,0,0,0,0.0,0


In [323]:
df.drop('URL', axis=1, inplace=True)

In [324]:
df.CHARSET.unique()

array(['iso-8859-1', 'UTF-8', 'us-ascii', 'ISO-8859-1', 'utf-8', nan,
       'windows-1251', 'ISO-8859', 'windows-1252'], dtype=object)

In [325]:
# filtering CHARSET
df = df[df.CHARSET.notna()]
df.CHARSET = df.CHARSET.apply(lambda x: x.lower())
counts = df['CHARSET'].value_counts()
counts = counts[counts > 1]
df = df.query('CHARSET in @counts.keys()')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.CHARSET = df.CHARSET.apply(lambda x: x.lower())


In [326]:
# drop rows with no servers
df = df[df.SERVER.notna()]

In [327]:
df.drop('CONTENT_LENGTH', axis=1, inplace=True)

In [328]:
df.drop(['WHOIS_COUNTRY', 'WHOIS_STATEPRO'], axis=1, inplace=True)

In [329]:
def my_datetime(_dt: str):
    if _dt in [None, '0', 'b', np.nan]:
        return np.nan

    if _dt in ['2002-03-20T23:59:59.0Z', '2017-03-07T22:02:38.0Z']:
        return np.nan

    return datetime.strptime(_dt, '%d/%m/%Y %H:%M')


df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].apply(my_datetime)
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].apply(my_datetime)

In [330]:
df.loc[df.WHOIS_REGDATE.isna(), "WHOIS_REGDATE"] = df.WHOIS_REGDATE.median()
df.loc[df.WHOIS_UPDATED_DATE.isna(), "WHOIS_UPDATED_DATE"] = df.WHOIS_UPDATED_DATE.median()

In [331]:
df = df[df.DNS_QUERY_TIMES.notna()]

In [332]:
df.WHOIS_REGDATE.value_counts()

WHOIS_REGDATE
2001-11-06 01:42:00    130
2008-09-17 00:00:00     62
2001-01-13 00:12:00     59
2000-07-31 00:00:00     47
2005-02-15 00:00:00     41
                      ... 
1999-08-16 00:00:00      1
2002-07-18 00:00:00      1
1994-11-23 00:00:00      1
2015-08-30 00:00:00      1
2008-11-14 00:00:00      1
Name: count, Length: 824, dtype: int64

In [333]:
servers = df.SERVER.value_counts()
servers = servers[servers > 10]
servers_list = tuple(servers.index.to_list())


def my_server(_server):
    return _server if _server in servers_list else 'other'

In [334]:
df.SERVER = df.SERVER.apply(my_server)

In [335]:
df

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,SERVER,WHOIS_REGDATE,WHOIS_UPDATED_DATE,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,16,7,iso-8859-1,nginx,2015-10-10 18:21:00,2016-08-16 00:00:00,7,0,2,700,9,10,1153,832,9,2.0,1
1,16,6,utf-8,other,2001-11-06 01:42:00,2016-08-16 00:00:00,17,7,4,1230,17,19,1265,1230,17,0.0,0
2,16,6,us-ascii,Microsoft-HTTPAPI/2.0,2001-11-06 01:42:00,2016-08-16 00:00:00,0,0,0,0,0,0,0,0,0,0.0,0
3,17,6,iso-8859-1,nginx,1997-10-07 04:00:00,2013-09-12 00:45:00,31,22,3,3812,39,37,18784,4380,39,8.0,0
5,18,7,utf-8,nginx,2016-08-03 14:30:00,2016-10-03 03:45:00,11,6,9,894,11,13,838,894,11,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1776,194,16,utf-8,Apache,2008-09-17 00:00:00,2016-09-02 00:00:00,0,0,0,0,0,3,186,0,0,0.0,1
1777,198,17,utf-8,Apache,2008-09-17 00:00:00,2016-09-02 00:00:00,0,0,0,0,0,2,124,0,0,0.0,1
1778,201,34,utf-8,other,1999-02-15 00:00:00,2015-07-15 00:00:00,83,2,6,6631,87,89,132181,6945,87,4.0,0
1779,234,34,iso-8859-1,cloudflare-nginx,1998-04-01 00:00:00,2016-12-09 00:00:00,0,0,0,0,0,0,0,0,0,0.0,0


In [336]:
sc = StandardScaler()
columns_to_standardize = [x for x in df.columns.tolist() if x not in ['CHARSET', 'SERVER', 'Type']]

df['WHOIS_REGDATE'] = df['WHOIS_REGDATE'].astype('int64')
df['WHOIS_UPDATED_DATE'] = df['WHOIS_UPDATED_DATE'].astype('int64')

scaled = sc.fit_transform(df[columns_to_standardize])
df[columns_to_standardize] = scaled

In [337]:
columns_to_dummy = ['CHARSET', 'SERVER']

In [338]:
for col in columns_to_dummy:
    encoded = pd.get_dummies(df[col])
    for enc_col in encoded.columns:
        df[f'{col}_{enc_col}'] = encoded[enc_col]
df.drop(columns_to_dummy, axis=1, inplace=True)

TRAIN

In [381]:
Y = df['Type'].to_numpy()
X = df.drop(columns=['Type']).to_numpy()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [434]:
clf = LogisticRegression(random_state=0)
params = {
    'solver':  ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
    'penalty' : ['l2', None],
    'max_iter': range(100, 150, 10),
    'tol' : [1e-4],
}
logreg_grid_search = GridSearchCV(clf, params, cv=3, n_jobs=-1, verbose=1, scoring=precision_score)
logreg_grid_search.fit(X_train, Y_train)

best_found_clf = LogisticRegression(random_state=0, solver='liblinear', penalty='l1')
best_found_clf.fit(X_train, Y_train)

print(logreg_grid_search.best_params_)
print(logreg_grid_search.best_estimator_)
print(logreg_grid_search.best_estimator_.score(X_test, Y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits
{'max_iter': 100, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.0001}
LogisticRegression(random_state=0)
0.9184952978056427


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [435]:

models = [logreg_grid_search.best_estimator_,
          best_found_clf]
for model in models:
    print(model)
    
    print(f'\taccuracy: {accuracy_score(model.predict(X_test), Y_test)}')
    print(f'\tprecision: {precision_score(model.predict(X_test), Y_test)}')
    print(f'\trecall: {recall_score(model.predict(X_test), Y_test)}')
    print(f'\tf1: {f1_score(model.predict(X_test), Y_test)}')

LogisticRegression(random_state=0)
	accuracy: 0.9184952978056427
	precision: 0.43243243243243246
	recall: 0.7619047619047619
	f1: 0.5517241379310345
LogisticRegression(penalty='l1', random_state=0, solver='liblinear')
	accuracy: 0.9247648902821317
	precision: 0.4594594594594595
	recall: 0.8095238095238095
	f1: 0.5862068965517241
