# Detecting malicious and normal requests using logistic regression

In [None]:
#using 'time' for time related operation
#using 'urllib' for http related operation
#using 'html' for html related operation
import os
import urllib
import time 
import html
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
def get_query_list(filename, base_dir="../input/malicious-request-dataset/"):
    """Load a query file and return its unique, URL-decoded lines.

    Each line of the file is percent-decoded (e.g. ``%3C`` becomes ``<``).
    Line terminators are kept as read, so callers that display a query
    should strip the trailing newline themselves.

    Args:
        filename: Name of the file inside ``base_dir``.
        base_dir: Directory that holds the dataset files. Defaults to the
            dataset location this notebook was written against.

    Returns:
        A list of the distinct decoded lines, in order of first appearance.
        Keeping first-appearance order (instead of the arbitrary order of a
        ``set``) makes later steps such as a seeded train/test split
        reproducible from run to run.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import unquote

    filepath = os.path.join(base_dir, filename)
    # The context manager closes the file even if decoding raises.
    with open(filepath, 'r', encoding='UTF-8') as source:
        decoded = [unquote(line) for line in source]
    # dict.fromkeys removes duplicates while preserving insertion order.
    return list(dict.fromkeys(decoded))

### Combining N-grams & TF-IDF to construct feature matrix
#### Background knowledge: 
TF-IDF (Term Frequency–Inverse Document Frequency) is widely used in information processing and data mining. It is a statistical method that measures the significance of a term from how frequently it appears in a document and how many documents in the corpus contain it. It has the advantage of filtering out common but irrelevant words while retaining the important words that characterize a text. The larger the TF-IDF value, the more important the feature is to that text. The basic idea is to transform text into a feature matrix and reduce the weight of common words (such as we, all, WWW, etc.) so as to better show the value of a text.
Consult：https://www.jianshu.com/p/e2a0aea3630c
 
#### N-grams：Consult https://blog.csdn.net/songbinxu/article/details/80209197

#### Thought based on our scenario：
Malicious and normal requests alike are lists of variable-length strings, which makes it difficult for a logistic regression algorithm to process this irregular data directly. Thus, we need to extract numerical characteristics from these texts to train our detection model. That is why we use TF-IDF: it captures the characteristics of the text and outputs them as numeric matrices. Before calculating TF-IDF, the content of each document (URL request) needs to be divided into tokens, i.e. we must define the length of the terms in these documents. Here, n-grams of length 3 are selected; the length can be adjusted according to the accuracy of the model.

### Word Segmentation(分词处理)
Since URL requests contain no whitespace to split on, we first have to apply a word segmentation technique to them. Here we choose n-grams with a length of 3; the length can be adjusted according to the accuracy of the model. 
References：https://www.zhihu.com/question/266054946?sort=created

In [None]:
# tokenizer function, this will make 3 grams of each query
# eg: www.foo.com/1 will be transformed to ['www','ww.','w.f','.fo','foo','oo.','o.c','.co','com','om/','m/1']
def get_ngrams(query, n=3):
    """Split a query into its overlapping character n-grams.

    Args:
        query: The request to tokenize; it is converted with ``str()`` first.
        n: Length of each gram. Defaults to 3 (trigrams).

    Returns:
        Every contiguous substring of length ``n``, in left-to-right order.
        A query shorter than ``n`` yields an empty list.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    text = str(query)
    # A string of length L holds L - n + 1 grams; stopping one short would
    # drop the final gram (e.g. 'm/1' in 'www.foo.com/1').
    return [text[start:start + n] for start in range(len(text) - n + 1)]

### Dataset description:
Goodqueries.txt: 1265974 pieces of data, derived from the request logs of http://secrepo.com.   
Badqueries.txt: 44532 pieces of data, derived from https://github.com/foospidy/payloads, including XSS, SQL injection etc.

In [None]:
# The main function
if __name__ == '__main__':

    # Load the normal requests and print a few examples.
    good_query_list = get_query_list('goodqueries.txt')
    print(u"Normal Request: ", len(good_query_list))
    print(u"For Example:")
    # Slicing (rather than indexing 0..4) stays safe when fewer than five
    # queries were loaded.
    for query in good_query_list[:5]:
        print(query.strip('\n'))
    print("\n")

    # Load the malicious requests and print a few examples.
    bad_query_list = get_query_list('badqueries.txt')
    print(u"Malicious Request: ", len(bad_query_list))
    print(u"For Example:")
    for query in bad_query_list[:5]:
        print(query.strip('\n'))
    print("\n")

    # Labels: normal requests are class 0, malicious requests are class 1.
    good_y = [0] * len(good_query_list)
    print(good_y[:5])
    bad_y = [1] * len(bad_query_list)
    print(bad_y[:5])

    # Keep queries and labels in the same (bad first, then good) order so
    # that queries[i] is labelled by y[i].
    queries = bad_query_list + good_query_list
    y = bad_y + good_y

    # Convert the raw strings to TF-IDF vectors.
    # TfidfVectorizer is equivalent to CountVectorizer followed by
    # TfidfTransformer; get_ngrams supplies the character-gram tokens.
    vectorizer = TfidfVectorizer(tokenizer=get_ngrams)

    # Turn the irregular list of strings into the sparse TF-IDF matrix X
    # that the logistic regression classifier is trained on.
    X = vectorizer.fit_transform(queries)
    print(X.shape)

    # Split the dataset for training and testing; rows of X_train/X_test
    # correspond one-to-one with y_train/y_test.
    # test_size is a fraction here: an integer would be read as an absolute
    # sample count, so the previous value of 20 evaluated the model on only
    # 20 requests.
    # Consult: https://blog.csdn.net/qq_39355550/article/details/82688014
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a logistic regression model and report its accuracy on the
    # held-out test set.
    LR = LogisticRegression()
    LR.fit(X_train, y_train)
    print('模型的准确度:{}'.format(LR.score(X_test, y_test)))
    print("\n")

    # Predict on a list of new, unseen requests.
    new_queries = ['www.foo.com/id=1<script>alert(1)</script>',
                   'www.foo.com/name=admin\' or 1=1','abc.com/admin.php',
                   '"><svg onload=confirm(1)>',
                   'test/q=<a href="javascript:confirm(1)>',
                   'q=../etc/passwd',
                   '/stylesheet.php?version=1331749579',
                   '/<script>cross_site_scripting.nasl</script>.idc',
                   '<img \x39src=x onerror="javascript:alert(1)">',
                   '/jhot.php?rev=2 |less /etc/passwd']
    # Reuse the fitted vectorizer (transform, not fit_transform) so the new
    # requests are mapped onto the same feature columns as the training data.
    X_predict = vectorizer.transform(new_queries)
    res = LR.predict(X_predict)

    # Collect the results; each query is HTML-escaped so that payloads such
    # as <script> are shown as text if the output is rendered as HTML.
    # Consult: https://www.jianshu.com/p/d896e3017417
    res_list = []
    for q, r in zip(new_queries, res):
        tmp = 'Normal Request' if r == 0 else 'Malicious Request'
        q_entity = html.escape(q)
        res_list.append({'url': q_entity, 'res': tmp})

    for n in res_list:
        print(n)