# ACS6420 Advanced project
## Rolls-Royce Undergraduate project - Anomaly detection in discrete sequences
### Author: Aldair M Silva
### Supervisor: Visakan
Date created: 13/10/2021

In [13]:
from datetime import date
print('Updated: ', date.today().strftime('%d/%m/%Y'))

Updated:  06/03/2022


Import libraries

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import urllib.parse
from sklearn import tree
from sklearn import metrics
import io
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
import csv
from sklearn.manifold import TSNE
#import graphviz

Import data

In [16]:
normal_raw = 'normalTrafficTraining.txt'
anomaly_raw = 'anomalousTrafficTest.txt'

normal_parse = 'normalRequestTraining.txt'
anomaly_parse = 'anomalousRequestTest.txt'

## Prototype 1

File handling functions

In [17]:
# File parsing function
def parse_file(file_in, file_out):
    fin = open(file_in)
    fout = io.open(file_out, "w", encoding="utf-8")
    lines = fin.readlines()
    res = []
    
    for i in range(len(lines)):
        line = lines[i].strip()
        if line.startswith("GET"):
            res.append("GET" + line.split(" ")[1])
        elif line.startswith("POST") or line.startswith("PUT"):
            url = line.split(' ')[0] + line.split(' ')[1]
            j = 1
            while True:
                if lines[i + j].startswith("Content-Length"):
                    break
                j += 1
            j += 1
            data = lines[i + j + 1].strip()
            url += '?' + data
            res.append(url)
    for line in res:
        line = urllib.parse.unquote(line).replace('\n','').lower()
        fout.writelines(line + '\n')
    print ("finished parse ",len(res)," requests")
    fout.close()
    fin.close()

# File reading function
def loadData(file):
    with open(file, 'r', encoding="utf8") as f:
        data = f.readlines()
    result = []
    for d in data:
        d = d.strip()
        if (len(d) > 0):
            result.append(d)
    return result


In [18]:
normal_data = parse_file(normal_raw,normal_parse)
anomaly_data = parse_file(anomaly_raw,anomaly_parse)

finished parse  36000  requests
finished parse  25065  requests


In [20]:
good_requests = loadData('normalRequestTraining.txt')
bad_requests = loadData('anomalousRequestTest.txt')

all_requests = bad_requests + good_requests
yBad = [1] * len(bad_requests)
yGood = [0] * len(good_requests)
y = yBad + yGood

In [26]:
idx_random = np.random.randint(0,len(all_requests), 300)
short_all_request = [all_requests[i] for i in idx_random]
y_short = [y[i] for i in idx_random]
idx_random

array([43956, 49674, 56879, 14076, 56253, 20215, 41956, 23096, 43896,
       56680,  8667, 32828,  8827, 42118, 57386, 59131, 14355, 35667,
       53332, 48472, 49472, 33331, 43745, 57095, 57592, 44307, 48797,
       15157, 26480,  8471,  1344, 55303, 55749, 60511, 36955, 59963,
        3112, 54026, 54897, 35350, 30637,  6968, 10743, 15007, 53234,
       23157, 41533, 57889, 22126, 14585, 37854, 28708, 53524, 37507,
       40707,  9430, 40667, 38621, 12981,  6884, 23833, 53812, 58305,
       24239, 43911,  2094, 46252, 20471, 33929, 48946, 11722, 58970,
       49482, 21279, 11480, 48249, 42806, 33772, 17374, 16817, 20837,
       42011, 43579, 32473,  6118, 33703, 44815,  5948, 36473, 57167,
       41700, 48429, 55940, 41561,  5433, 32814, 14156, 23905, 42110,
       30262, 58904,  4047, 41917, 54231, 52819,  2480, 34007, 17577,
        2497,  4167, 14593, 40835, 43254, 53295, 40999, 25450,  1646,
       43824, 16297, 49775, 56457,  3253, 58144, 48693, 51603, 42187,
       45191,  1269,

In [30]:
len(short_all_request)

300

In [31]:
len(y_short)

300

### Doc2Vector

    This section uses TF-IDF (Term frequency-Inverse Document Frequency) to transform the words in the document into numberical values.
The values are determines by how many times they appear in the document, and the inverse frequency of the same word in other documents/corpus.
This process is called "vectorization", and is calculated as tf=log[1+fre(term,document)], idf=log(1+total_number_of_pages/number_of_pages_containing_term).
Both resutls are then multiplied. The values range from 0 to 1, 0 being very relevant and 1 being less relevant.
Relevance in this case means the word is unique (less common/less frequent) in and across documents, and the opposite for less relevant words (such as what/the/is/in for instance).

For this text analysis, the words will be taken as characters due to the nature of the document (web request), resulting in roughly 62 features extracted from 61065 requests)

In [33]:
vectorizer = TfidfVectorizer(min_df=0.0, analyzer="char", sublinear_tf=True, ngram_range=(1, 1))
X = vectorizer.fit_transform(all_requests)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
pd.DataFrame(X_train[:])
X.shape 

(61065, 62)

Feature analysis will show all the characters used as features. As it can be observed below, numbers, letters, special and unknwon characters are taken into account. 
The unknown features in this model exist because the vectorizer works in english language, and the document contains latin words/characters, although not very frequent.

In [35]:
# Analysis of features extracted

feature_array = np.array(vectorizer.get_feature_names_out())    # Get feature names
print("Feature names: ", feature_array)

X_dense = X.todense()


Feature names:  ['!' '"' '#' '$' '%' '&' "'" '(' ')' '*' '+' ',' '-' '.' '/' '0' '1' '2'
 '3' '4' '5' '6' '7' '8' '9' ':' ';' '<' '=' '>' '?' '@' '_' 'a' 'b' 'c'
 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' 'p' 'q' 'r' 's' 't' 'u'
 'v' 'w' 'x' 'y' 'z' '|' '~' '�']


(61065, 62)

In [36]:
np.where(X_dense[0,:])[1]

array([ 4,  5,  6,  9, 10, 13, 14, 15, 16, 17, 20, 23, 25, 26, 28, 30, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 50, 51,
       52, 53, 55, 61], dtype=int64)

In [None]:
X_dense[1,:]

matrix([[0.        , 0.        , 0.        , 0.        , 0.18109148,
         0.21037106, 0.30810222, 0.        , 0.        , 0.23645553,
         0.31071386, 0.        , 0.        , 0.0461847 , 0.11918346,
         0.0773328 , 0.0797863 , 0.09089871, 0.        , 0.        ,
         0.10564528, 0.        , 0.        , 0.09585201, 0.        ,
         0.0773328 , 0.35460594, 0.        , 0.18357973, 0.        ,
         0.07033681, 0.        , 0.        , 0.1662103 , 0.14664713,
         0.13455149, 0.14298912, 0.15519546, 0.09401223, 0.04991178,
         0.09585201, 0.160923  , 0.09007849, 0.14146085, 0.13455149,
         0.14274414, 0.12993501, 0.15916962, 0.11918346, 0.        ,
         0.19434165, 0.12751082, 0.15084227, 0.12484693, 0.        ,
         0.10393708, 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.25203181]])

In [None]:
# Features.csv gets created in the current working directory
with open('Features.csv', 'w', newline = '') as csvfile:
	my_writer = csv.writer(csvfile, delimiter = ' ')
	my_writer.writerow(X_array)


### PCA
Explore how components affect model dimensions.
Understand how relevant the method is to identify best features that can best pick anomalous data.
Check explained variance ratios, try using a treshold to separate best features.

In [43]:
pca = PCA(n_components=2)
X_PCA = pca.fit_transform(X_dense)
X_embedded = TSNE(n_components=2).fit_transform(X_dense)

#np.mean(transformer.components_ == 0)
#plt.
#plt.title("X PCA")
#plt.show()

#>>> pca = PCA(n_components=2, svd_solver='full')
#>>> pca.fit(X)
#PCA(n_components=2, svd_solver='full')
#>>> print(pca.explained_variance_ratio_)
#[0.9924... 0.00755...]
#>>> print(pca.singular_values_)
#[6.30061... 0.54980...]



PCA plots

In [44]:
# Scatter plot
plt.scatter(X_PCA[:,0], X_PCA[:,1], c=y_short)

ValueError: 'c' argument has 300 elements, which is inconsistent with 'x' and 'y' with size 61065.

In [None]:
# Linear plot
plt.scatter(X_embedded[:,0], X_embedded[:,1], c=y_short)

Logistic regression model

In [None]:
lgs = LogisticRegression()
lgs.fit(X_train, y_train)
y_pred = lgs.predict(X_test)
score_test = metrics.accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

print ("Score Logistic Regression :",score_test)
print ("Confusion Matrix: ")
print (matrix)

Score Logistic Regression : 0.7995087336244542
Confusion Matrix: 
[[9742 1103]
 [2570 4905]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision tree model
Write on understanding of rules.
Plot decision tree.

In [None]:
dtc = tree.DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
score_test = metrics.accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print ("Score Decesion Tree :",score_test)
print ("Confusion Matrix: ")
print (matrix)


Score Decesion Tree : 0.9669759825327511
Confusion Matrix: 
[[10541   304]
 [  301  7174]]


In [None]:
# Plot decision trees
tree.plot_tree(dtc)

NameError: name 'dtc' is not defined

SVM model

In [None]:
linear_svm=LinearSVC(C=1)
linear_svm.fit(X_train, y_train)
y_pred = linear_svm.predict(X_test)
score_test = metrics.accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print ("Score Linear SVM :",score_test)
print ("Confusion Matrix: ")
print (matrix)


Score Linear SVM : 0.8186681222707424
Confusion Matrix: 
[[9830 1015]
 [2307 5168]]


### Random forest model

Write on rules used for random forest models

In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
score_test = metrics.accuracy_score(y_test, y_pred)
print ("Score Random Forest :",score_test)
print ("Confusion Matrix: ")
print (matrix)

Score Random Forest : 0.980349344978166
Confusion Matrix: 
[[9830 1015]
 [2307 5168]]


In [None]:
# Plot random forest