In [37]:
#init
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.metrics import accuracy_score
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
import warnings
from sklearn.decomposition import TruncatedSVD 
import time
warnings.filterwarnings('ignore')

In [38]:
def evalution(y_pre,y_true):
    """Print accuracy and the confusion matrix for anomaly-detector predictions.

    Parameters
    ----------
    y_pre : array-like
        Raw detector output using the scikit-learn outlier convention:
        ``1`` for inliers and ``-1`` for outliers.
    y_true : array-like
        Ground-truth labels, ``0`` for normal and ``1`` for anomalous.

    Returns
    -------
    None
        The accuracy and the confusion matrix are printed, not returned.

    Notes
    -----
    The input ``y_pre`` is never modified. ``np.where`` builds a new array, so
    calling this function repeatedly on the same predictions always prints the
    same numbers.
    """
    # Map outliers (-1) to the positive class 1 and everything else to 0,
    # without touching the caller's array.
    ny_pre = np.where(np.asarray(y_pre) == -1, 1, 0)
    ny_true = np.asarray(y_true)

    print(accuracy_score(ny_true, ny_pre))
    # Fixing the label order keeps the matrix 2x2 (rows: true normal/anomalous,
    # columns: predicted normal/anomalous) even when one class is absent.
    cnf_matrix = confusion_matrix(ny_true, ny_pre, labels=[0, 1])
    print(cnf_matrix)

def train(model,X_train,X_valid,**model_params):
    """Instantiate an estimator, fit it on the training data and score the validation data.

    Parameters
    ----------
    model : callable
        Estimator class (or any factory) that returns an object exposing
        ``fit(X)`` and ``predict(X)``.
    X_train : array-like
        Samples used to fit the estimator (labels are not used).
    X_valid : array-like
        Samples passed to ``predict`` after fitting.
    **model_params
        Optional keyword arguments forwarded to ``model``. With none given the
        estimator is built with its defaults, exactly as before.

    Returns
    -------
    tuple
        ``(clf, y_pre)``: the fitted estimator and its raw predictions for
        ``X_valid``.
    """
    clf = model(**model_params)
    clf.fit(X_train)
    y_pre = clf.predict(X_valid)
    return clf, y_pre

def test(clf,X_test,y_test):
    y_pre = clf.predict(X_test)
    ny_pre = np.asarray(y_pre)
    ny_pre[ny_pre==1] = 0
    ny_pre[ny_pre==-1] = 1
    ny_test = np.asarray(y_test)
    print(accuracy_score(y_test,ny_pre))
    cnf_matrix = confusion_matrix(ny_test, ny_pre) 
    print(cnf_matrix)
    return y_pre

def FPFN(data,y,y_pre):
    """Print the rows of ``data`` that were misclassified.

    ``y`` holds the true labels and ``y_pre`` the predicted labels, both
    compared against 0 and 1. False positives (true 0, predicted 1) are
    printed first, followed by false negatives (true 1, predicted 0).
    ``data`` must accept a boolean mask as an index. Nothing is returned.
    """
    report = (
        ('False positives', 0, 1),
        ('False negatives', 1, 0),
    )
    for heading, actual, predicted in report:
        print(heading)
        print(data[(y == actual) & (y_pre == predicted)])
# Follow-up analysis of false positives and false negatives
#FPFN(data,y,y_pre)

In [39]:
# Training corpus: the first 10000 rows of the "normal" file get label 0 and
# the first 1000 rows of the XSS file get label 1.
good_data = pd.read_csv('normal_100000.csv', names=['url'], nrows=10000).assign(label=0)
bad_data = pd.read_csv('XSS_1000.csv', names=['url'], nrows=1000).assign(label=1)
# Stack both frames under a fresh 0..n-1 index.
data = pd.concat([good_data, bad_data], ignore_index=True)
y = data['label']

In [40]:
#bad_data=pd.read_csv('bad_url.csv',names=['url'],nrows=100)
#bad_data['label']=1
#data=pd.concat([good_data,bad_data]).reset_index(drop=True)
#y=data['label']

In [41]:
# Feature extraction: 80/20 split, character 1-3-gram TF-IDF, then a
# 10-component truncated SVD projection.
X_train, X_valid, y_train, y_valid = train_test_split(
    data['url'].values.astype('U'),
    data['label'].values,
    test_size=0.2,
    random_state=42,
)
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.0,
    sublinear_tf=True,
)
svd = TruncatedSVD(n_components=10, n_iter=7, random_state=42)
# Both transformers are fitted on the training split only; the validation
# split is merely projected through them.
X_train = svd.fit_transform(vectorizer.fit_transform(X_train))
X_valid = svd.transform(vectorizer.transform(X_valid))

In [42]:
# Shapes of the train/validation features and labels, in that order.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((8800, 10), (8800,), (2200, 10), (2200,))

In [43]:
# One-class SVM with default hyperparameters: fit on the training split,
# predict on the validation split, and report the wall-clock time for both.
# NOTE(review): the default nu may be why roughly half the samples are flagged
# in the recorded output - confirm before drawing conclusions.
starttime = time.time()
clf, y_pre = train(OneClassSVM, X_train, X_valid)
endtime = time.time()
elapsed = endtime - starttime
print("Time:{}".format(elapsed))
evalution(y_pre, y_valid)

Time:1.8511710166931152
0.5340909090909091
[[1037  950]
 [  75  138]]


In [44]:
# Held-out test set, read from separate CSV files.
# NOTE(review): this rebinds good_data, bad_data, data and y, which the
# training cells also use; re-running the feature cell afterwards would split
# this test data instead of the training data.
bad_data = pd.read_csv('XSS_1000_2.csv', names=['url']).assign(label=1)
good_data = pd.read_csv('normal_test.csv', names=['url']).assign(label=0)
data = pd.concat([good_data, bad_data], ignore_index=True)
# Reuse the already-fitted vectorizer and SVD; nothing is refitted here.
X = svd.transform(vectorizer.transform(data['url'].values.astype('U')))
y = data['label'].values
y_pre = test(clf, X, y)

0.5372380952380953
[[10622  9378]
 [  340   660]]


In [45]:
# Isolation Forest: fit on the training split, predict on the validation split.
# The forest is randomized, so the seed is pinned (same value as the split and
# the SVD) to make the reported metrics reproducible across runs. train() calls
# its first argument with no arguments, hence the zero-argument factory.
starttime=time.time()
clf,y_pre=train(lambda: IsolationForest(random_state=42),X_train,X_valid)
endtime=time.time()
print("Time:{}".format(endtime-starttime))
evalution(y_pre,y_valid)

Time:0.511282205581665
0.8440909090909091
[[1810  177]
 [ 166   47]]


In [46]:
# Testing: score the held-out set with the model currently bound to clf.
y_pre = test(clf, X_test=X, y_test=y)

0.8831428571428571
[[18343  1657]
 [  797   203]]


In [47]:
# Local Outlier Factor with 20 neighbours. novelty=True is what allows
# predict() to be called on samples that were not part of the fit.
starttime = time.time()
clf = LocalOutlierFactor(n_neighbors=20, novelty=True).fit(X_train)
y_pre = clf.predict(X_valid)
endtime = time.time()
elapsed = endtime - starttime
print("Time:{}".format(elapsed))
evalution(y_pre, y_valid)

Time:0.9062089920043945
0.8145454545454546
[[1770  217]
 [ 191   22]]


In [48]:
# Testing: score the held-out set with the model currently bound to clf.
y_pre = test(clf, X_test=X, y_test=y)

0.859904761904762
[[17965  2035]
 [  907    93]]
