In [1]:
import pandas as pd, numpy as np
import regex as re
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import preprocessing, metrics, tree
from sklearn.pipeline import make_pipeline
from sklearn.neural_network import MLPClassifier

#global options
pd.set_option('display.max_columns', None)

## Read in data

In [2]:


'''
Outputs an array of dictionaries, with each dictionary representing a request.
@param data_array an array containing each line of the txt file as an item
'''
def process_http_data(data_array, is_anomalous):
    data_processed = []
    i = 0 
    while i < (len(data_array)-1):
        data_item = {}
        if data_array[i][0:3] == 'GET':
            data_item['method']='GET'
            data_item['url']= data_array[i][4:-1]
            i+=1

            while (i<len(data_array)-1) & (data_array[i]!='\n') :
                s = data_array[i].split(':',1)
                data_item[ s[0] ] = s[1][1:-1]
                i += 1
            i +=2

        elif (data_array[i][0:4] == 'POST') | (data_array[i][0:3] == 'PUT'):
            if (data_array[i][0:3] == 'PUT'):
                data_item['method']='PUT'
            else:
                data_item['method']='POST'
            data_item['url']= data_array[i][5:-1]
            i+=1

            while data_array[i]!='\n':
                s = data_array[i].split(':',1)
                data_item[s[0] ] = s[1][1:-1]
                i += 1

            i += 1
            data_item['body'] = data_array[i][0:-1]
            i+= 2

        else:
            i+=1
            continue

        data_item['anomalous'] = is_anomalous
        data_processed.append(data_item)

    return data_processed



In [3]:
with open('../data/raw/normalTrafficTest.txt') as file:
    data1 =  file.readlines()

with open('../data/raw/normalTrafficTraining.txt') as file:
    data2 = file.readlines()

with open('../data/raw/anomalousTrafficTest.txt') as file:
    data3 = file.readlines()

d3 = process_http_data(data3, True)
d2 = process_http_data(data2, False)
d1 = process_http_data(data1, False)

d3[-1]

{'method': 'GET',
 'url': 'http://localhost:8080/tienda1/imagenes/nuestratierra.jpg.BAK HTTP/1.1',
 'User-Agent': 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)',
 'Pragma': 'no-cache',
 'Cache-control': 'no-cache',
 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
 'Accept-Encoding': 'x-gzip, x-deflate, gzip, deflate',
 'Accept-Charset': 'utf-8, utf-8;q=0.5, *;q=0.5',
 'Accept-Language': 'en',
 'Host': 'localhost:8080',
 'Cookie': 'JSESSIONID=3CC12010CDA952F123240EBAD79B55CC',
 'Connection': 'close',
 'anomalous': True}

In [4]:
df_orig = pd.DataFrame(d1 + d2 + d3)
del(d3,d2,d1)
print(df_orig.shape)
df_orig.head()

(97065, 16)


Unnamed: 0,method,url,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,anomalous,Content-Type,Content-Length,body
0,GET,http://localhost:8080/tienda1/index.jsp HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=EA414B3E327DED6875848530C864BD8F,close,False,,,
1,GET,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=54E25FF4B7F0E4E855B112F882E9EEA5,close,False,,,
2,POST,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=788887A0F479749C4CEEA1E268B4A501,close,False,application/x-www-form-urlencoded,74.0,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...
3,GET,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=94ECD5EE8EF7EFE4BB26C701B150ED7B,close,False,,,
4,POST,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=23391DBBADEC19FE01E02D201F278C6A,close,False,application/x-www-form-urlencoded,60.0,modo=entrar&login=caria&pwd=egipciaca&remember...


In [5]:
df_orig['method'].value_counts()

GET     71088
POST    25580
PUT       397
Name: method, dtype: int64

In [6]:
df= df_orig.copy()

In [7]:
#df.to_csv('../data/interim/1_original_data_to_df.csv')

## Process data / feature extraction

In [8]:
# process columns, extract features
df['browser'] = df['User-Agent'].str.extract( r'^(.*?) \(', expand=False)
df['system-information'] = df['User-Agent'].str.extract( r'\((.*?)\)', expand=False)
df['platform'] = df['User-Agent'].str.extract( r'\) (.*)$', expand=False)
df.drop('User-Agent',1)

print(df.shape)
df.head()

(97065, 19)


Unnamed: 0,method,url,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,anomalous,Content-Type,Content-Length,body,browser,system-information,platform
0,GET,http://localhost:8080/tienda1/index.jsp HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=EA414B3E327DED6875848530C864BD8F,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
1,GET,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=54E25FF4B7F0E4E855B112F882E9EEA5,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
2,POST,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=788887A0F479749C4CEEA1E268B4A501,close,False,application/x-www-form-urlencoded,74.0,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
3,GET,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=94ECD5EE8EF7EFE4BB26C701B150ED7B,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
4,POST,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=23391DBBADEC19FE01E02D201F278C6A,close,False,application/x-www-form-urlencoded,60.0,modo=entrar&login=caria&pwd=egipciaca&remember...,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)


In [9]:
#testing how to split
tmp = re.split('[?&]', 'http://localhost:8080/tienda1/publico/anadir.jsp?id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantidad=41&B1=A%F1adir+al+carrito HTTP/1.1')
print(tmp)
tmp = re.split('[?/&=+]', 'http://localhost:8080/tienda1/publico/anadir.jsp?id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantidad=41&B1=A%F1adir+al+carrito HTTP/1.1')
print(tmp)

['http://localhost:8080/tienda1/publico/anadir.jsp', 'id=1', 'nombre=Jam%F3n+Ib%E9rico', 'precio=39', 'cantidad=41', 'B1=A%F1adir+al+carrito HTTP/1.1']
['http:', '', 'localhost:8080', 'tienda1', 'publico', 'anadir.jsp', 'id', '1', 'nombre', 'Jam%F3n', 'Ib%E9rico', 'precio', '39', 'cantidad', '41', 'B1', 'A%F1adir', 'al', 'carrito HTTP', '1.1']


In [10]:
df['protocol'] = df['url'].str.extract(r' (.*?)$')
df['protocol'].value_counts()

HTTP/1.1    97065
Name: protocol, dtype: int64

In [11]:
def extract_url_only(x):
    if '?' in x:
        return x.split('?')[0]
    return x[:-8]

def extract_args_only(row):
    if row['method'] == 'GET':
        x = row['url']
        if '?' in x:
            return re.split( '[&=+/]', x.split('?')[1] )
        return []    
    elif (row['method'] == 'POST') | (row['method'] == 'PUT'):
        x = row['body']
        if type(x)==str:
            return re.split('[&=+/]', x)
        return []        
    

In [12]:
COMMON_SQL_WORDS = ['create', 'insert', 'view', 'from' , 'select', 'alter', 'add', 'distinct', 'into','update','set','delete',
                    'truncate','as','order','between','where','and','or','null','drop','column','table','database','group',
                    'having','join','union','exists','like','case']

COMMON_BATCH_WORDS = ["cd", "ls", "cat", "cd", "sudo", "tail", "echo", "grep", "mv", "less","more","gnome-open",
                     "chmod","chown","chgrp","find", "wget","curl", "su"]

'''
returns the number of words in x that are in words_list
'''
def num_in_words(x, words_list):
    count = 0
    w = x.lower()
    for w in words_list:
        if w in x:
            count +=1
    return count

def num_special_chars(x):
    y = ''.join(x)
    z = re.sub(r'[\w]+','',y)
    return len(z)

In [13]:
df['url_only'] = df['url'].apply(lambda x: extract_url_only(x))
df['url_words'] = df['url'].apply(lambda x:  re.split('[/.]', x) )
df['url_words'][0]

df['arg_words'] = df.apply( lambda x: extract_args_only(x), axis=1)
df['url_arg_words'] = df['arg_words'] + df['url_words']
df['arg_words'][0:2]

0                                                   []
1    [id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...
Name: arg_words, dtype: object

In [14]:
df['url_arg_words'][0:10]

0    [http:, , localhost:8080, tienda1, index, jsp ...
1    [id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...
2    [id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...
3    [modo, entrar, login, caria, pwd, egipciaca, r...
4    [modo, entrar, login, caria, pwd, egipciaca, r...
5    [id, 2 HTTP, 1.1, http:, , localhost:8080, tie...
6    [id, 2, http:, , localhost:8080, tienda1, publ...
7    [http:, , localhost:8080, tienda1, publico, ca...
8    [errorMsg, Credenciales, incorrectas HTTP, 1.1...
9    [errorMsg, Credenciales, incorrectas, http:, ,...
Name: url_arg_words, dtype: object

In [15]:
## getting more feature characteristics
df['num_of_args'] = df['arg_words'].apply(lambda x: len(x))
df['max_length_of_args'] = df['arg_words'].apply(lambda x: 0 if len(x) ==0 else max([ len(i) for i in x ] ))
df['min_length_of_args'] = df['arg_words'].apply(lambda x: 0 if len(x) ==0 else min([ len(i) for i in x ] ))
df['total_length_args'] = df['arg_words'].apply(lambda x: sum( [ len(i) for i in x ] ))

df['total_length_request'] = df['url'].apply(lambda x: len(x))
df['lenght_of_path'] = df['url_only'].apply(lambda x: len(x))
df['port_is_common'] = df['Host'].apply(lambda x: x.split(':')[-1] in ['80','443','8080'] )

df['num_of_paths'] = df['url_words'].apply(lambda x: len(x) )
df['num_sql_words'] = df['url'].apply(lambda x: num_in_words(x, COMMON_SQL_WORDS))
df['num_batch_words'] = df['url'].apply(lambda x: num_in_words(x, COMMON_BATCH_WORDS))
df['num_special_chars'] = df['arg_words'].apply(lambda x: num_special_chars(x))


In [16]:
df.head(2)

Unnamed: 0,method,url,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,anomalous,Content-Type,Content-Length,body,browser,system-information,platform,protocol,url_only,url_words,arg_words,url_arg_words,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars
0,GET,http://localhost:8080/tienda1/index.jsp HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=EA414B3E327DED6875848530C864BD8F,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/index.jsp,"[http:, , localhost:8080, tienda1, index, jsp ...",[],"[http:, , localhost:8080, tienda1, index, jsp ...",0,0,0,0,48,40,True,8,0,0,0
1,GET,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=54E25FF4B7F0E4E855B112F882E9EEA5,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/publico/anadir.jsp,"[http:, , localhost:8080, tienda1, publico, an...","[id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...","[id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...",14,12,1,70,132,48,True,9,0,0,5


In [17]:
#drop unused cells
df = df.drop(['body','url_only','Accept','Pragma','Cache-control', 'url', 'User-Agent','Cookie','Accept-Language', 'Accept-Encoding', 'Accept-Charset','Connection'], 1)
df.head(1)

Unnamed: 0,method,Host,anomalous,Content-Type,Content-Length,browser,system-information,platform,protocol,url_words,arg_words,url_arg_words,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars
0,GET,localhost:8080,False,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,"[http:, , localhost:8080, tienda1, index, jsp ...",[],"[http:, , localhost:8080, tienda1, index, jsp ...",0,0,0,0,48,40,True,8,0,0,0


In [18]:
    
#one hot encoding
to_encode_one_hot = ['method', 'Host','Content-Type', 'browser', 'platform','protocol', 'system-information']
enc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')
enc.fit(df[to_encode_one_hot] )
encoded = pd.DataFrame( enc.transform( df[to_encode_one_hot] ), columns=enc.get_feature_names() )
df = pd.concat ([df, encoded] , axis=1)

#fill nulls
df['Content-Length'] = df['Content-Length'].fillna(0)
df['Content-Length'] = df['Content-Length'].astype(str).astype(int)

df = df.drop(to_encode_one_hot, axis=1)
df.head(5)



Unnamed: 0,anomalous,Content-Length,url_words,arg_words,url_arg_words,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars,x0_GET,x0_POST,x0_PUT,x1_localhost:8080,x1_localhost:9090,x2_application/x-www-form-urlencoded,x2_nan,x3_Mozilla/5.0,x4_KHTML/3.5.8 (like Gecko),x5_HTTP/1.1,x6_compatible; Konqueror/3.5; Linux
0,False,0,"[http:, , localhost:8080, tienda1, index, jsp ...",[],"[http:, , localhost:8080, tienda1, index, jsp ...",0,0,0,0,48,40,True,8,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
1,False,0,"[http:, , localhost:8080, tienda1, publico, an...","[id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...","[id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...",14,12,1,70,132,48,True,9,0,0,5,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
2,False,74,"[http:, , localhost:8080, tienda1, publico, an...","[id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...","[id, 1, nombre, Jam%F3n, Ib%E9rico, precio, 39...",13,9,1,62,57,49,True,9,0,0,3,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
3,False,0,"[http:, , localhost:8080, tienda1, publico, au...","[modo, entrar, login, caria, pwd, egipciaca, r...","[modo, entrar, login, caria, pwd, egipciaca, r...",11,11,2,59,122,52,True,9,0,0,2,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
4,False,60,"[http:, , localhost:8080, tienda1, publico, au...","[modo, entrar, login, caria, pwd, egipciaca, r...","[modo, entrar, login, caria, pwd, egipciaca, r...",10,9,2,51,61,53,True,9,0,0,0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0


In [19]:
#df.drop(['url_words', 'arg_words'],1).to_csv('../data/interim/2_extracted_features_without_word_vector.csv')

####  Vectorization

In [20]:
# doc2vec vecotirzation

#tokenize and tag the card text
card_docs = [TaggedDocument(words, [i]) 
             for i, words in enumerate(df.arg_words)]
card_docs[0:2]

[TaggedDocument(words=[], tags=[0]),
 TaggedDocument(words=['id', '1', 'nombre', 'Jam%F3n', 'Ib%E9rico', 'precio', '39', 'cantidad', '41', 'B1', 'A%F1adir', 'al', 'carrito HTTP', '1.1'], tags=[1])]

In [21]:
#create model
model = Doc2Vec(vector_size=90, min_count=1, epochs = 30)
model.build_vocab(card_docs)
model.train(card_docs, total_examples=model.corpus_count, epochs=model.epochs)


In [22]:
#generate vectors
card2vec = [model.infer_vector(df['arg_words'][i]) 
            for i in range(0,len(df['arg_words']))]
#card2vec[0]

In [23]:
#Create a list of lists
dtv= np.array(card2vec).tolist()

#set list to dataframe column
#df['card2vec'] = dtv
#df.head(2)

In [24]:
#create df out of vectors
df_vecs = pd.DataFrame( dtv )
df_vecs['anomalous'] = df['anomalous']
df_vecs.shape

(97065, 91)

In [25]:
#url vectorization omitted
#found to be not useful


In [26]:
df = df.drop(['url_words', 'arg_words', 'url_arg_words'],axis=1)

In [27]:
#df = df.drop(['card2vec'],1)

df_combined = df.drop(['anomalous'],1).join(df_vecs)
df_combined.head(1)

Unnamed: 0,Content-Length,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars,x0_GET,x0_POST,x0_PUT,x1_localhost:8080,x1_localhost:9090,x2_application/x-www-form-urlencoded,x2_nan,x3_Mozilla/5.0,x4_KHTML/3.5.8 (like Gecko),x5_HTTP/1.1,x6_compatible; Konqueror/3.5; Linux,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,anomalous
0,0,0,0,0,0,48,40,True,8,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.000542,0.002391,0.001142,0.000499,-0.000848,0.001621,-0.000693,0.004353,0.005152,-0.001295,0.003241,0.000321,0.000756,0.004729,-0.004766,-0.004587,-0.005331,0.003696,0.003091,0.004111,0.005318,0.003324,-0.000428,0.003117,-0.004241,0.001555,-0.003963,0.004941,0.000243,-0.000948,-0.002616,0.003047,-0.000487,0.00076,-0.005347,0.001307,0.001246,0.001299,0.004931,0.00202,-0.001561,-0.0007,0.002196,-0.004886,0.001853,0.001896,-0.003218,-0.004123,-0.002051,-0.001514,0.00078,-0.000682,0.005426,-0.004422,-0.003235,-0.003763,0.001701,-0.002741,-0.000374,-0.00284,-0.003789,-0.004329,0.001737,-0.00402,-0.003371,-0.001459,0.003567,-0.004477,0.003755,-0.004488,0.005294,-0.000348,0.005297,0.001165,0.002658,-0.00512,-0.002413,-0.00422,-0.002265,-0.004236,-0.002022,-0.000953,-0.004843,0.002139,0.00074,-0.002607,0.000258,-0.004512,0.000844,0.00477,False


In [28]:
#df_combined.to_csv('../data/interim/3_extracted_features_with_word_vectors.csv')

# Models

In [29]:
df['anomalous'].value_counts()

False    72000
True     25065
Name: anomalous, dtype: int64

In [74]:
#without vectors (do not run the doc2vec section)
# df = df.drop(['url_words', 'arg_words'],axis=1)
# X_train, X_test, y_train, y_test = train_test_split(df.drop('anomalous',1), df['anomalous'], 
#                                                   test_size=0.3, random_state=42, shuffle = True)

#with vectors
X_train, X_test, y_train, y_test = train_test_split(df_combined.drop('anomalous',1), df['anomalous'], 
                                                    test_size=0.3, random_state=42, shuffle = True)
X_train.shape

(67945, 113)

In [75]:
y_train.value_counts()

False    50351
True     17594
Name: anomalous, dtype: int64

In [66]:
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.under_sampling import RandomUnderSampler
# #oversampling the anomalous class

# # define oversampling strategy
# oversample = RandomOverSampler(sampling_strategy=0.9)
# X_train, y_train = oversample.fit_resample(X_train, y_train)
# print( X_train.shape)
# undersample = RandomUnderSampler(sampling_strategy=0.6)
# X_train, y_train = undersample.fit_resample(X_train, y_train)
# print(X_train.shape)



(95666, 113)


In [67]:
y_train.value_counts()

False    50351
True     45315
Name: anomalous, dtype: int64

##  Decision Tree

In [89]:
#train model
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

t0 = time.time()
predicted = clf.predict( X_test)
print(time.time()-t0)



0.4360013008117676


In [90]:
print("Test Set:")
print( metrics.classification_report(y_test, predicted) )

print("\nTraining Set:")
print( metrics.classification_report(y_train, clf.predict( X_train)) )

Test Set:
              precision    recall  f1-score   support

       False       0.96      0.96      0.96     21649
        True       0.90      0.88      0.89      7471

    accuracy                           0.94     29120
   macro avg       0.93      0.92      0.93     29120
weighted avg       0.94      0.94      0.94     29120


Training Set:




              precision    recall  f1-score   support

       False       1.00      1.00      1.00     50351
        True       1.00      0.99      1.00     17594

    accuracy                           1.00     67945
   macro avg       1.00      1.00      1.00     67945
weighted avg       1.00      1.00      1.00     67945



In [51]:
#print to csv
#df_predicted = pd.DataFrame(X_test[0:23])
#df_predicted['anomalous_actual'] = y_test
#df_predicted['anomalous_predicted'] = predicted
#df_predicted.to_csv('../data/interim/4_prediction_resuls.csv')

In [52]:
metrics.confusion_matrix( y_test,predicted)

array([[20786,   863],
       [  830,  6641]], dtype=int64)

In [53]:
#print important features
for name, importance in zip(X_train.columns, clf.feature_importances_):
    if importance>0.05:
        print(name, importance)

lenght_of_path 0.11001121133804102
num_of_paths 0.08934881890669333
num_special_chars 0.2046340716673305
27 0.08099663288937613


In [None]:
#same thing but with vector df_vecs only

X_train, X_test, y_train, y_test = train_test_split(df_vecs.drop('anomalous',1), df_vecs['anomalous'], 
                                                    test_size=0.3, random_state=42, shuffle = True)

#train model
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

predicted = clf.predict( X_test)
print( metrics.classification_report(y_test, predicted) )

In [None]:
#training set results
predicted = clf.predict( X_train)
print( metrics.classification_report(y_train, predicted) )

## Random Forest  

In [87]:

clf_rf = RandomForestClassifier(n_estimators=25, max_depth=20, random_state=0, warm_start=True, min_samples_leaf=2, 
                                 max_features='sqrt', min_samples_split=10)
clf_rf.fit(X_train, y_train)

t0 = time.time()
predicted = clf_rf.predict(X_test)
print(time.time()-t0)
print( metrics.classification_report(y_test, predicted) )



0.669001579284668
              precision    recall  f1-score   support

       False       0.96      0.99      0.97     21649
        True       0.95      0.87      0.91      7471

    accuracy                           0.96     29120
   macro avg       0.96      0.93      0.94     29120
weighted avg       0.96      0.96      0.96     29120



In [88]:
#print important features
for name, importance in zip(X_train.columns, clf_rf.feature_importances_):
    if importance>0.05:
        print(name, importance)

total_length_request 0.07074702259005912
lenght_of_path 0.09248637546279755
num_of_paths 0.06168499052897623
num_special_chars 0.06062689543734976


## SVM - Support Vector Machine

In [91]:
#for j in ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed']:
clf_svm = make_pipeline(preprocessing.StandardScaler(), SVC(gamma=0.01,degree=3, kernel='rbf'))
clf_svm.fit(X_train, y_train)

t0 = time.time()
predicted = clf_svm.predict(X_test)
print(time.time()-t0)
print( metrics.classification_report(y_test, predicted) )



173.19999933242798
              precision    recall  f1-score   support

       False       0.95      0.99      0.97     21649
        True       0.97      0.84      0.90      7471

    accuracy                           0.95     29120
   macro avg       0.96      0.91      0.93     29120
weighted avg       0.95      0.95      0.95     29120



## Gradient Boost Tree

In [85]:

clf_gb = GradientBoostingClassifier(n_estimators=25, learning_rate=1.0,
                                    max_depth=20, random_state=0)
clf_gb.fit(X_train, y_train)

t0 = time.time()
predicted = clf_gb.predict(X_test)
print(time.time()-t0)
print( metrics.classification_report(y_test, predicted) )



0.7189981937408447
              precision    recall  f1-score   support

       False       0.97      0.99      0.98     21649
        True       0.97      0.92      0.94      7471

    accuracy                           0.97     29120
   macro avg       0.97      0.96      0.96     29120
weighted avg       0.97      0.97      0.97     29120



In [86]:
#print important features
for name, importance in zip(X_train.columns, clf_gb.feature_importances_):
    if importance>0.05:
        print(name, importance)

num_of_args 0.08290309903746867
lenght_of_path 0.10210966208302032
num_of_paths 0.07758322123400825
num_special_chars 0.20661802817204225
27 0.06868971467082338


## Neural net - MLP

In [83]:
scaler = preprocessing.StandardScaler() 
scaler.fit(X_train)  
X_train_scaled = scaler.transform(X_train) 
X_test_scaled = scaler.transform(X_test) 




In [84]:
clf_mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(14, 10 ), random_state=1, max_iter=300, warm_start=True)
clf_mlp.fit(X_train_scaled, y_train)

t0 = time.time()
predicted = clf_mlp.predict(X_test_scaled)
print(time.time()-t0)
print( metrics.classification_report(y_test, predicted) )                  

0.03499937057495117
              precision    recall  f1-score   support

       False       0.97      0.99      0.98     21649
        True       0.97      0.93      0.95      7471

    accuracy                           0.97     29120
   macro avg       0.97      0.96      0.96     29120
weighted avg       0.97      0.97      0.97     29120



## Ensemble Classifier

In [76]:
from sklearn.ensemble import StackingClassifier, VotingClassifier

In [80]:
clf1 = RandomForestClassifier(n_estimators=25, max_depth=20, random_state=0, warm_start=True, min_samples_leaf=2, 
                                 max_features='sqrt', min_samples_split=10)
clf2 = GradientBoostingClassifier(n_estimators=25, learning_rate=1.0,max_depth=20, random_state=0)
clf3 = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(14 , 10), random_state=1, max_iter=300, warm_start=True)

### Voting Classifier

In [81]:
ecl_voting = VotingClassifier(estimators=[('rf', clf1), ('grb', clf2), ('mlp', clf3)], weights=[1,1,1], voting='hard')
ecl_voting.fit(X_train_scaled, y_train)

t0 = time.time()
predicted = ecl_voting.predict(X_test_scaled)
print(time.time()-t0)
print( metrics.classification_report(y_test, predicted) )  

1.8698644638061523
              precision    recall  f1-score   support

       False       0.97      0.99      0.98     21649
        True       0.98      0.92      0.95      7471

    accuracy                           0.97     29120
   macro avg       0.97      0.96      0.97     29120
weighted avg       0.97      0.97      0.97     29120



### Stacking Classifier

In [82]:
ecl_stacking = StackingClassifier(estimators=[('rf', clf1), ('grb', clf2), ('mlp', clf3)])
ecl_stacking.fit(X_train_scaled, y_train)

t0 = time.time()
predicted = ecl_stacking.predict(X_test_scaled)
print(time.time()-t0)
print( metrics.classification_report(y_test, predicted) )  

0.7639992237091064
              precision    recall  f1-score   support

       False       0.98      0.99      0.98     21649
        True       0.97      0.94      0.96      7471

    accuracy                           0.98     29120
   macro avg       0.98      0.97      0.97     29120
weighted avg       0.98      0.98      0.98     29120



# Generate reports

In [None]:
from pandas_profiling import ProfileReport

In [None]:
# profile = ProfileReport(df_combined, minimal=True)
# profile.to_file(output_file="../reports/output_min.html")


In [None]:
# profile = ProfileReport(df)
# profile.to_file(output_file="../reports/output_no_doc2vec.html")

In [None]:
df.loc[df['x0_PUT']==1.0]['anomalous'].value_counts()