In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the combined dataset (all.csv) into a DataFrame.
df = pd.read_csv('../data/torpeda_train_test/all.csv')

In [3]:
# Preview the first rows to inspect the parsed columns.
df.head()

Unnamed: 0.1,Unnamed: 0,file,id,label,method,path,query,url
0,0,./torpeda_csv/anomalies/allAnomalies1.csv,0,anomalous,GET,/tienda,,GET /tienda
1,1,./torpeda_csv/anomalies/allAnomalies1.csv,1,anomalous,GET,/tienda1,,GET /tienda1
2,2,./torpeda_csv/anomalies/allAnomalies1.csv,2,anomalous,GET,/tienda1/,,GET /tienda1/
3,3,./torpeda_csv/anomalies/allAnomalies1.csv,3,anomalous,GET,/tienda1/publico/productos.jsp,,GET /tienda1/publico/productos.jsp
4,4,./torpeda_csv/anomalies/allAnomalies1.csv,4,anomalous,GET,/tienda1/publico/caracteristicas.jsp,id=2,GET /tienda1/publico/caracteristicas.jsp?id=2


In [4]:
# Show the column labels of the loaded frame.
df.keys()

Index(['Unnamed: 0', 'file', 'id', 'label', 'method', 'path', 'query', 'url'], dtype='object')

In [5]:
# Pull out the three columns used below: the saved row-id column, the request
# string, and the class label.
idx, url, label = (df[column] for column in ('Unnamed: 0', 'url', 'label'))

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [8]:
# Target vector: the class label of each row.
y = label

# Character-level TF-IDF features over case-sensitive 3-grams.
vectorizer = TfidfVectorizer(
    lowercase=False,
    ngram_range=(3, 3),
    analyzer="char",
)

In [9]:
# Stratified 70/30 split of the row ids and labels; the fixed seed keeps the
# partition reproducible.
idx_train, idx_test, y_train, y_test = train_test_split(
    idx, y, test_size=0.3, stratify=y, random_state=42
)

# Per-class counts of each partition. Series.value_counts() replaces the
# top-level pd.value_counts(), which is deprecated in pandas 2.1+.
print(y_train.value_counts())
print('---------------------------------')
print(y_test.value_counts())

SQLi              30109
anomalous         11521
normal             5854
XSS                3373
SSI                 316
BufferOverflow      288
CRLFi               229
XPath               122
LDAPi                52
FormatString         29
Name: label, dtype: int64
---------------------------------
SQLi              12904
anomalous          4938
normal             2509
XSS                1445
SSI                 135
BufferOverflow      124
CRLFi                98
XPath                53
LDAPi                22
FormatString         12
Name: label, dtype: int64


In [10]:
# Gather the request strings for each partition with one vectorised label
# lookup per split instead of a Python-level loop of scalar lookups.
# NOTE(review): idx_* hold values of the 'Unnamed: 0' column, so this assumes
# those values match df's row labels — confirm.
url_train = url.loc[idx_train].tolist()
url_test = url.loc[idx_test].tolist()

In [11]:
# Learn the n-gram vocabulary and IDF weights from the training strings only,
# then encode the test strings with that fitted vectorizer.
X_train = vectorizer.fit_transform(url_train)
X_test = vectorizer.transform(url_test)

In [12]:
# Logistic regression baseline on the TF-IDF features.
lgs = LogisticRegression()
lgs.fit(X_train, y_train)
y_pred = lgs.predict(X_test)

acc_score_test = accuracy_score(y_test, y_pred)
# Pin the row/column order to the classifier's class order so the matrix can
# be read against the printed label list.
c_matrix = confusion_matrix(y_test, y_pred, labels=lgs.classes_)
# Micro-averaged F1 equals accuracy for single-label multiclass problems, so
# macro F1 is reported as well to expose performance on the rare classes.
f1_score_test = f1_score(y_test, y_pred, average='micro')
f1_macro_test = f1_score(y_test, y_pred, average='macro')

print("acc Logistic Regression : %.2f" % acc_score_test)
print('F1  Logistic Regression : %.2f' % f1_score_test)
print('F1 (macro) Logistic Regression : %.2f' % f1_macro_test)
print("Confusion Matrix: ")
# Rows are true classes and columns are predicted classes, in this order.
print("Labels: %s" % list(lgs.classes_))
print(c_matrix)



acc Logistic Regression : 0.99
F1  Logistic Regression : 0.99
Confusion Matrix: 
[[  119     0     0     0     0     2     0     3     0     0]
 [    0    98     0     0     0     0     0     0     0     0]
 [    0     0     8     0     3     0     0     0     1     0]
 [    0     0     0    21     1     0     0     0     0     0]
 [    0     0     0     0 12904     0     0     0     0     0]
 [    0     0     0     0     1   125     0     1     5     3]
 [    0     0     0     0    30     3    19     1     0     0]
 [    0     0     0     0    24     0     0  1421     0     0]
 [    0     0     0     0    11     0     0     1  4901    25]
 [    0     0     0     0     4     0     0     0    17  2488]]


In [14]:
# Rows of the training partition, selected by position.
# NOTE(review): idx_train carries 'Unnamed: 0' values, so this assumes they
# equal row positions in df — confirm.
df_train = df.iloc[idx_train]

In [15]:
# Rows of the test partition, selected by position.
# NOTE(review): idx_test carries 'Unnamed: 0' values, so this assumes they
# equal row positions in df — confirm.
df_test = df.iloc[idx_test]

In [17]:
# Persist both partitions next to the source CSV. The default index=True also
# writes the row index as a leading unnamed column.
df_train.to_csv(path_or_buf='../data/torpeda_train_test//train.csv')
df_test.to_csv(path_or_buf='../data/torpeda_train_test//test.csv')

In [18]:
import pickle

In [19]:
# Serialise the fitted vectorizer and classifier. The context managers make
# sure each file is flushed and closed even if pickling raises, instead of
# leaving the handle to be closed whenever the garbage collector runs.
with open("../model/vectorizer.pickle", "wb") as fh:
    pickle.dump(vectorizer, fh)
with open("../model/lgs.pickle", "wb") as fh:
    pickle.dump(lgs, fh)