In [41]:
import pandas as pd, numpy as np
import regex as re

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn import preprocessing, metrics, tree

#global options
pd.set_option('display.max_columns', None)

## Read in data

In [42]:


'''
Outputs an array of dictionaries, with each dictionary representing a request.
@param data_array an array containing each line of the txt file as an item
'''
def process_http_data(data_array, is_anomalous):
    data_processed = []
    i = 0 
    while i < (len(data_array)-1):
        data_item = {}
        if data_array[i][0:3] == 'GET':
            data_item['method']='GET'
            data_item['url']= data_array[i][4:-1]
            i+=1

            while (i<len(data_array)-1) & (data_array[i]!='\n') :
                s = data_array[i].split(':',1)
                data_item[ s[0] ] = s[1][1:-1]
                i += 1
            i +=2

        elif (data_array[i][0:4] == 'POST') | (data_array[i][0:3] == 'PUT'):
            if (data_array[i][0:3] == 'PUT'):
                data_item['method']='PUT'
            else:
                data_item['method']='POST'
            data_item['url']= data_array[i][5:-1]
            i+=1

            while data_array[i]!='\n':
                s = data_array[i].split(':',1)
                data_item[s[0] ] = s[1][1:-1]
                i += 1

            i += 1
            data_item['body'] = data_array[i][0:-1]
            i+= 2

        else:
            i+=1
            continue

        data_item['anomalous'] = is_anomalous
        data_processed.append(data_item)

    return data_processed



In [43]:
with open('../data/raw/normalTrafficTest.txt') as file:
    data1 =  file.readlines()

with open('../data/raw/normalTrafficTraining.txt') as file:
    data2 = file.readlines()

with open('../data/raw/anomalousTrafficTest.txt') as file:
    data3 = file.readlines()

d3 = process_http_data(data3, True)
d2 = process_http_data(data2, False)
d1 = process_http_data(data1, False)

d3[-1]

{'method': 'GET',
 'url': 'http://localhost:8080/tienda1/imagenes/nuestratierra.jpg.BAK HTTP/1.1',
 'User-Agent': 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.8 (like Gecko)',
 'Pragma': 'no-cache',
 'Cache-control': 'no-cache',
 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
 'Accept-Encoding': 'x-gzip, x-deflate, gzip, deflate',
 'Accept-Charset': 'utf-8, utf-8;q=0.5, *;q=0.5',
 'Accept-Language': 'en',
 'Host': 'localhost:8080',
 'Cookie': 'JSESSIONID=3CC12010CDA952F123240EBAD79B55CC',
 'Connection': 'close',
 'anomalous': True}

In [44]:
df = pd.DataFrame(d1 + d2 + d3)
del(d3,d2,d1)
df.head()

Unnamed: 0,method,url,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,anomalous,Content-Type,Content-Length,body
0,GET,http://localhost:8080/tienda1/index.jsp HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=EA414B3E327DED6875848530C864BD8F,close,False,,,
1,GET,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=54E25FF4B7F0E4E855B112F882E9EEA5,close,False,,,
2,POST,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=788887A0F479749C4CEEA1E268B4A501,close,False,application/x-www-form-urlencoded,74.0,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...
3,GET,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=94ECD5EE8EF7EFE4BB26C701B150ED7B,close,False,,,
4,POST,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=23391DBBADEC19FE01E02D201F278C6A,close,False,application/x-www-form-urlencoded,60.0,modo=entrar&login=caria&pwd=egipciaca&remember...


## Process data / feature extraction

In [45]:
# process columns, extract features
df['browser'] = df['User-Agent'].str.extract( r'^(.*?) \(', expand=False)
df['system-information'] = df['User-Agent'].str.extract( r'\((.*?)\)', expand=False)
df['platform'] = df['User-Agent'].str.extract( r'\) (.*)$', expand=False)
df.drop('User-Agent',1)


df.head()

Unnamed: 0,method,url,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,anomalous,Content-Type,Content-Length,body,browser,system-information,platform
0,GET,http://localhost:8080/tienda1/index.jsp HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=EA414B3E327DED6875848530C864BD8F,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
1,GET,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=54E25FF4B7F0E4E855B112F882E9EEA5,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
2,POST,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=788887A0F479749C4CEEA1E268B4A501,close,False,application/x-www-form-urlencoded,74.0,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
3,GET,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=94ECD5EE8EF7EFE4BB26C701B150ED7B,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)
4,POST,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=23391DBBADEC19FE01E02D201F278C6A,close,False,application/x-www-form-urlencoded,60.0,modo=entrar&login=caria&pwd=egipciaca&remember...,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko)


In [46]:
#testing how to split
tmp = re.split('[?&]', 'http://localhost:8080/tienda1/publico/anadir.jsp?id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantidad=41&B1=A%F1adir+al+carrito HTTP/1.1')
print(tmp)
tmp = re.split('[?/&=+]', 'http://localhost:8080/tienda1/publico/anadir.jsp?id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantidad=41&B1=A%F1adir+al+carrito HTTP/1.1')
print(tmp)

['http://localhost:8080/tienda1/publico/anadir.jsp', 'id=1', 'nombre=Jam%F3n+Ib%E9rico', 'precio=39', 'cantidad=41', 'B1=A%F1adir+al+carrito HTTP/1.1']
['http:', '', 'localhost:8080', 'tienda1', 'publico', 'anadir.jsp', 'id', '1', 'nombre', 'Jam%F3n', 'Ib%E9rico', 'precio', '39', 'cantidad', '41', 'B1', 'A%F1adir', 'al', 'carrito HTTP', '1.1']


In [47]:
df['protocol'] = df['url'].str.extract(r' (.*?)$')
df['protocol'].value_counts()

HTTP/1.1    97065
Name: protocol, dtype: int64

In [52]:
def extract_url_only(x):
    if '?' in x:
        return x.split('?')[0]
    return x[:-8]

def extract_args_only(row):
    if row['method'] == 'GET':
        x = row['url']
        if '?' in x:
            return re.split( '[&=]', x.split('?')[1] )
        return []    
    elif (row['method'] == 'POST') | (row['method'] == 'PUT'):
        x = row['body']
        if type(x)==str:
            return re.split('[&=]', x)
        return []        
    

In [53]:
COMMON_SQL_WORDS = ['create', 'insert', 'view', 'from' , 'select', 'alter', 'add', 'distinct', 'into','update','set','delete',
                    'truncate','as','order','between','where','and','or','null','drop','column','table','database','group',
                    'having','join','union','exists','like','case']

COMMON_BATCH_WORDS = ["cd", "ls", "cat", "cd", "sudo", "tail", "echo", "grep", "mv", "less","more","gnome-open",
                     "chmod","chown","chgrp","find", "wget","curl", "su"]

'''
returns the number of words in x that are in words_list
'''
def num_in_words(x, words_list):
    count = 0
    w = x.lower()
    for w in COMMON_SQL_WORDS:
        if w in x:
            count +=1
    return count

def num_special_chars(x):
    y = ''.join(x)
    z = re.sub(r'[\w]+','',y)
    return len(z)

In [54]:
df['url_only'] = df['url'].apply(lambda x: extract_url_only(x))
df['url_words'] = df['url'].apply(lambda x:  re.split('[/]', x) )
df['url_words'][0]

df['arg_words'] = df.apply(lambda x: extract_args_only(x), axis=1)
df['arg_words'][0:2]

0                                                   []
1    [id, 1, nombre, Jam%F3n+Ib%E9rico, precio, 39,...
Name: arg_words, dtype: object

In [55]:
## getting more feature characteristics
df['num_of_args'] = df['arg_words'].apply(lambda x: len(x))
df['max_length_of_args'] = df['arg_words'].apply(lambda x: 0 if len(x) ==0 else max([ len(i) for i in x ] ))
df['min_length_of_args'] = df['arg_words'].apply(lambda x: 0 if len(x) ==0 else min([ len(i) for i in x ] ))
df['total_length_args'] = df['arg_words'].apply(lambda x: sum( [ len(i) for i in x ] ))

df['total_length_request'] = df['url'].apply(lambda x: len(x))
df['lenght_of_path'] = df['url_only'].apply(lambda x: len(x))
df['port_is_common'] = df['Host'].apply(lambda x: x.split(':')[-1] in ['80','443','8080'] )

df['num_of_paths'] = df['url_words'].apply(lambda x: len(x) )
df['num_sql_words'] = df['url'].apply(lambda x: num_in_words(x, COMMON_SQL_WORDS))
df['num_batch_words'] = df['url'].apply(lambda x: num_in_words(x, COMMON_BATCH_WORDS))
df['num_special_chars'] = df['arg_words'].apply(lambda x: num_special_chars(x))


In [56]:
df.head()

Unnamed: 0,method,url,User-Agent,Pragma,Cache-control,Accept,Accept-Encoding,Accept-Charset,Accept-Language,Host,Cookie,Connection,anomalous,Content-Type,Content-Length,body,browser,system-information,platform,protocol,url_only,url_words,arg_words,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars
0,GET,http://localhost:8080/tienda1/index.jsp HTTP/1.1,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=EA414B3E327DED6875848530C864BD8F,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/index.jsp,"[http:, , localhost:8080, tienda1, index.jsp H...",[],0,0,0,0,48,40,True,6,0,0,0
1,GET,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=54E25FF4B7F0E4E855B112F882E9EEA5,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/publico/anadir.jsp,"[http:, , localhost:8080, tienda1, publico, an...","[id, 1, nombre, Jam%F3n+Ib%E9rico, precio, 39,...",10,28,1,74,132,48,True,7,0,0,9
2,POST,http://localhost:8080/tienda1/publico/anadir.j...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=788887A0F479749C4CEEA1E268B4A501,close,False,application/x-www-form-urlencoded,74.0,id=1&nombre=Jam%F3n+Ib%E9rico&precio=39&cantid...,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/publico/anadir.jsp,"[http:, , localhost:8080, tienda1, publico, an...","[id, 1, nombre, Jam%F3n+Ib%E9rico, precio, 39,...",10,19,1,65,57,49,True,7,0,0,6
3,GET,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=94ECD5EE8EF7EFE4BB26C701B150ED7B,close,False,,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/publico/autentic...,"[http:, , localhost:8080, tienda1, publico, au...","[modo, entrar, login, caria, pwd, egipciaca, r...",10,15,2,60,122,52,True,7,0,0,3
4,POST,http://localhost:8080/tienda1/publico/autentic...,Mozilla/5.0 (compatible; Konqueror/3.5; Linux)...,no-cache,no-cache,"text/xml,application/xml,application/xhtml+xml...","x-gzip, x-deflate, gzip, deflate","utf-8, utf-8;q=0.5, *;q=0.5",en,localhost:8080,JSESSIONID=23391DBBADEC19FE01E02D201F278C6A,close,False,application/x-www-form-urlencoded,60.0,modo=entrar&login=caria&pwd=egipciaca&remember...,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,http://localhost:8080/tienda1/publico/autentic...,"[http:, , localhost:8080, tienda1, publico, au...","[modo, entrar, login, caria, pwd, egipciaca, r...",10,9,2,51,61,53,True,7,0,0,0


In [57]:
#drop unused cells
df = df.drop(['body','url_only','Accept','Pragma','Cache-control', 'url', 'User-Agent','Cookie','Accept-Language', 'Accept-Encoding', 'Accept-Charset','Connection'], 1)
df.head(1)

Unnamed: 0,method,Host,anomalous,Content-Type,Content-Length,browser,system-information,platform,protocol,url_words,arg_words,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars
0,GET,localhost:8080,False,,,Mozilla/5.0,compatible; Konqueror/3.5; Linux,KHTML/3.5.8 (like Gecko),HTTP/1.1,"[http:, , localhost:8080, tienda1, index.jsp H...",[],0,0,0,0,48,40,True,6,0,0,0


In [60]:
# normalize
#TODO
#     x = df[i] #returns a numpy array
#     min_max_scaler = preprocessing.MinMaxScaler()
#     x_scaled = min_max_scaler.fit_transform(x)
#     df[i] = pd.DataFrame(x_scaled)
    
#one hot encoding
to_encode_one_hot = ['method', 'Host','Content-Type', 'browser', 'platform','protocol', 'system-information']
enc = preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore')
enc.fit(df[to_encode_one_hot] )
encoded = pd.DataFrame( enc.transform( df[to_encode_one_hot] ), columns=enc.get_feature_names() )
df = pd.concat ([df, encoded] , axis=1)

#fill nulls
df['Content-Length'] = df['Content-Length'].fillna(0)

df = df.drop(to_encode_one_hot, axis=1)
df.head(1)

Unnamed: 0,anomalous,Content-Length,url_words,arg_words,num_of_args,max_length_of_args,min_length_of_args,total_length_args,total_length_request,lenght_of_path,port_is_common,num_of_paths,num_sql_words,num_batch_words,num_special_chars,x0_GET,x0_POST,x0_PUT,x1_localhost:8080,x1_localhost:9090,x2_application/x-www-form-urlencoded,x2_nan,x3_Mozilla/5.0,x4_KHTML/3.5.8 (like Gecko),x5_HTTP/1.1,x6_compatible; Konqueror/3.5; Linux
0,False,0,"[http:, , localhost:8080, tienda1, index.jsp H...",[],0,0,0,0,48,40,True,6,0,0,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0


In [61]:
# vecotirzation

#tokenize and tag the card text
card_docs = [TaggedDocument(words, [i]) 
             for i, words in enumerate(df.arg_words)]
card_docs[0:2]

[TaggedDocument(words=[], tags=[0]),
 TaggedDocument(words=['id', '1', 'nombre', 'Jam%F3n+Ib%E9rico', 'precio', '39', 'cantidad', '41', 'B1', 'A%F1adir+al+carrito HTTP/1.1'], tags=[1])]

In [62]:
#create model
model = Doc2Vec(vector_size=64, min_count=1, epochs = 20)
model.build_vocab(card_docs)
model.train(card_docs, total_examples=model.corpus_count, epochs=model.epochs)


In [None]:
#generate vectors
card2vec = [model.infer_vector(df['arg_words'][i]) 
            for i in range(0,len(df['arg_words']))]
card2vec[0]

In [None]:
#add to df

#Create a list of lists
dtv= np.array(card2vec).tolist()
#set list to dataframe column
df['card2vec'] = dtv
df.head(2)

In [None]:
df = df.drop(['url_words', 'arg_words'],axis=1)
df.head(2)

In [None]:
df_vecs = pd.DataFrame( dtv )
df_vecs['anomalous'] = df['anomalous']
df_vecs.head()

In [None]:
df = df.drop(['card2vec'],1)

## Baseline Model - Decision Tree

In [None]:
df['anomalous'].value_counts()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('anomalous',1), df['anomalous'], 
                                                    test_size=0.2, random_state=42, shuffle = True)

#train model
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)


In [None]:
X_train

In [None]:
predicted = clf.predict( X_test)
#clf.score(X_test, y_test, sample_weight=None)
print( metrics.classification_report(y_test, predicted) )

In [None]:
metrics.confusion_matrix( y_test,predicted)

In [None]:
for name, importance in zip(X_train.columns, clf.feature_importances_):
    print(name, importance)

In [None]:
#same thing but with vector df_vecs

X_train, X_test, y_train, y_test = train_test_split(df_vecs.drop('anomalous',1), df_vecs['anomalous'], 
                                                    test_size=0.2, random_state=42, shuffle = True)

#train model
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

predicted = clf.predict( X_test)
print( metrics.classification_report(y_test, predicted) )

In [None]:
#TODO
#check different epochs/num of features for vectors