In [180]:
# Load the Drive helper and mount.
# The dataset paths configured in this notebook live under
# /content/drive/MyDrive, so Google Drive is mounted at /content/drive.
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Regex

In [181]:
import re
import json
import time
from urllib.parse import unquote
from urllib.parse import urlparse
import pandas as pd

# Request line of an HTTP message: method, request target, protocol version.
# The dot of the version is escaped so only "HTTP/<digit>.<digit>" matches
# (an unescaped '.' would accept any character, e.g. "HTTP/1x1").
PATTERN = r'^(GET|get|POST|post)\s(.*)(HTTP/\d\.\d)'
# URL_PUNCTUATIONS = '/+?&;=,()<>*!$#|^{}\~@.`[]:\'\"'

### Detecting URL encoding format

# One path segment: ASCII letters, digits, '-' and '_'.
# The range is 'A-Z' (not 'A-z', which would also admit '[', '\', ']', '^'
# and '`' because they sit between 'Z' and 'a' in ASCII).
PATH_TRANSFORMATIONS = [ [r'[a-zA-Z0-9\-\_]+', 'PathString']]

# Percent-encoded ASCII control characters (0x00-0x1f).
QUERY_INVISIBLECHAR_TRANSFORMATION = ['%00','%0a','%0b','%0c','%0d','%16','%01','%02','%03','%04','%05','%06','%07','%08','%09','%0e','%0f','%10','%11','%12','%13','%14','%15','%17','%18','%19','%1a','%1b','%1c','%1d','%1e','%1f']


# Token classes used when classifying the words left after punctuation removal.
QUERY_NUMBER_TRANSFORMATION = r'^[0-9]+$'
QUERY_PURE_STR_TRANSFORMATION = r'^[a-zA-Z\-\_]+$'
QUERY_UNICODE_STR_TRANSFORMATION = r'[\w]+'
QUERY_HEX_STR_TRANSFORMATION = r'^((0x|0X)?[a-fA-F0-9]{2})+$'
# A string containing at least one %XX escape with two hexadecimal digits
# ('A-F', not 'A-f', so that e.g. "%ZZ" is not treated as an escape).
URL_ENCODED_PATTERN = r'.*\%[0-9a-fA-F]{2}.*'



# SQL keywords and function names; a token equal to one of these is counted
# in the 'SQLKeyword' column by parse_data_from_request.
QUERY_SQL_KEYWORD_TRANSFORMATION = ['waitfor','delay','space', 'case', 'upper', 'produce', 'primary', 'log', 'between', 'reverse', 'greatest', 'insert', 'outer', 'instr', 'length', 'replace', 'div', 'sqrt', 'set', 'min', 'any', 'group', 'character_length', 'key', 'and', 'inner', 'like', 'create', 'exp', 'top', 'exist', 'left', 'lcase', 'pow', 'rand', 'union', 'log2', 'index', 'is', 'abs', 'as', 'ltrim', 'max', 'having', 'delete', 'mod', 'check', 'select', 'values', 'foreign', 'view', 'concat', 'mid', 'add', 'find_in_set', 'format', 'char_length', 'substr', 'avg', 'update', 'desc', 'join', 'by', 'round', 'drop', 'strcmp', 'concat_ws', 'substring_index', 'trim', 'database', 'limit', 'rtrim', 'lpad', 'substring', 'rpad', 'count', 'locate', 'asc', 'log10', 'field', 'rownum', 'alter', 'unique', 'constraint', 'column', 'not', 'truncate', 'backup', 'table', 'where', 'all', 'position', 'ucase', 'repeat', 'lower', 'order', 'sum', 'or', 'in', 'into', 'right', 'ascii', 'distinct', 'from', 'null', 'floor', 'least', 'exec', 'default', 'if', 'else', 'end', 'convert', 'cast', 'information_schema', 'table_schema', 'column_name', 'all_tables', 'all_col_comments', 'sleep', 'pg_sleep' ]
# HTML / DOM related keywords, counted in the 'HTMLKeyword' column.
# NOTE(review): 'document' is listed twice and some entries are mixed-case
# ('fromCharCode', 'contentWindow') - confirm how they are meant to be matched.
QUERY_HTML_KEYWORD_TRANSFORMATION = ['script', 'document','location','document','cookie','history','body','img','onerror','onkeypress','console','fromCharCode','eval','svg','onload','div','javascript','prompt','contentWindow','img-src','autofocus',]
# JavaScript event handlers and calls, counted in the 'JavaScript' column.
# NOTE(review): several entries also appear in the HTML list above
# ('onerror', 'img', 'onload').
QUERY_JAVASCRIPT_TRANSFORMATION =['alert', 'onchange', 'onerror', 'img', 'onload', 'print', 'onmouseover', 'onfocus', 'onclick', 'onresize', 'javascript:', 'throw',  'script-src', 'script-src-elem',  'setTimeout', 'document.cookie',  'location.href','herf.iframe',]

# Shell command names, counted in the 'Command' column.
# NOTE(review): the list starts with 'alert', which is also in the JavaScript
# list - confirm it is intended to be treated as an OS command.
QUERY_OS_COMMAND_TRANSFORMATION = ['alert','useradd', 'snap', 'hash', 'history', 'shasum', 'shutdown', 'chown', 'whatis', 'source', 'ps', 'shred', 'tar', 'echo', 'set', 'pwd', 'test', 'service', 'man', 'type', 'zip', 'netstat', 'ping', 'readarray', 'sudo', 'stat', 'sha1sum', 'userdel', 'exit', 'rm', 'who', 'apt', 'rmdir', 'top', 'vi', 'wc', 'which', 'until', 'locale', 'patch', 'times', 'export', 'scp', 'awk', 'base64', 'dpkg', 'alias', 'nano', 'printf', 'pushd', 'pacman', 'systemctl', 'neofetch', 'sha256sum', 'paste', 'timedatectl', 'dir', 'cd', 'nc', 'sh', 'unalias', 'tail', 'chsh', 'ssh', 'ss', 'touch', 'bash', 'grep', 'less', 'whoami', 'chmod', 'wget', 'curl', 'du', 'mv', 'unzip', 'perl', 'time', 'unset', 'sha512sum', 'batch', 'cp', 'hostnamectl', 'df', 'systemd', 'kill', 'wait', 'head', 'uname', 'popd', 'apt-get', 'telnet', 'hostname', 'tee', 'passwd', 'mkdir', 'read', 'python3', 'find', 'umask', 'variables', 'htop', 'host', 'su', 'more', 'cat', 'ls', 'sed', 'yum', 'python', 'vim']
# Words associated with local file inclusion ('passwd' is also a command above).
QUERY_LFI_TRANSFORMATION =["etc", "htpasswd", "passwd", "system"]

# URL scheme prefixes associated with remote file inclusion.
QUERY_RFI_TRANSFORMATION =['http://', 'https://']
# Encoded and literal CR/LF sequences associated with CRLF injection.
QUERY_CRLF_TRANSFORMATION =['%0d%0a',  '%0A ', '%0D ', '\r\n', '|0d 0a|']
# QUERY_XSS_TRANSFORMATION =['alert', 'onchange', 'onerror', 'img', 'onload', 'print', 'onmouseover', 'onfocus', 'onclick', 'location.href', 'herf.iframe', 'onresize', 'javascript:', 'throw', 'autofocus', 'script-src', 'script-src-elem', 'img-src', 'setTimeout', 'document.cookie', 'document.', 'contentWindow',]
# Server-side-include directives; not referenced by the code visible in this notebook.
QUERY_SSI_TRANSFORMATION =['#exec', '#config', '#echo', '#fsize', '#include ','cmd=','#flastmod']
# QUERY_PATH_TRANSFORMATION =["../", '"..\"', ""]
# XML external entity markers; not referenced by the code visible in this notebook.
QUERY_XXE_TRANSFORMATION =['!DOCTYPE', '!ENTITY', ' SYSTEM', 'xmlns']
# File extensions of interest; not referenced by the code visible in this notebook.
EXTENSION_TRANSFORMATION = ['.exe','.cgi','.py','.c','.php','.sh','.bin','.tar','.dll','.zip','.stm','.shtml','.shtm']
# TODO: re-check the RFI and LFI datasets / drop XSS in favour of JS + HTML /
# drop SSI / add file extensions / drop invisible characters.



# Dataset features and paths

In [182]:
# Column names of the generated feature matrix, in output order.
# Every entry except the last is a per-request counter; the last one
# ('Label') receives DATASET_LABELS for each parsed request.
DATASET_FEATURES = [
    # punctuation sequences and single characters
    './', '.\\', '<!--', '-->', '/*', '*/', '--', "'", '/\\',
    '&', ';', '=', ',', '\\', '"', '()', '<>', '*', '!', '#', '|', '~', '.', ':',
    'OtherSpecialChar',
    # keyword classes
    'SQLKeyword', 'HTMLKeyword', 'JavaScript', 'Command',
    # token classes
    'Number', 'PureString', 'HexString', 'UnicodeString', 'MixString',
    # attack-family counters
    'LFI', 'RFI', 'CRLF',
    # class label
    'Label',
]
print(*DATASET_FEATURES, sep='\n')

# --- Active dataset -------------------------------------------------------
DATASET_LABELS = 1
DATASET_PATH = "/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/OK_fwaf-badqueries.txt"
PARSED_DATASET_PATH = '/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/matrix1/normal/OK_fwaf-badqueries.csv'

# --- Alternative datasets (uncomment one group instead of the active one) --
# fwaf, good queries
# DATASET_PATH = "/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/OK_fwaf-goodqueries.txt"
# PARSED_DATASET_PATH = '/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/matrix1/normal/OK_fwaf-goodqueries.csv'

# CSIC 2010, normal
# DATASET_PATH = "/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/OK_csic2010-normal.txt"
# PARSED_DATASET_PATH = '/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/matrix1/normal/OK_csic2010-normal.csv'
# CSIC 2010, anomalous
# DATASET_PATH = "/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/OK_csic2010-anomalous.txt"
# PARSED_DATASET_PATH = '/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/matrix1/normal/OK_csic2010-anomalous.csv'
# DATASET_LABELS = 1

# http-params, normal
# DATASET_PATH = "/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/OK_http-params-normal.txt"
# PARSED_DATASET_PATH = '/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/matrix1/normal/OK_http-params-normal.csv'
# DATASET_LABELS = 1

# http-params, anomalous
# DATASET_PATH = "/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/OK_http-params-anomalous.txt"
# PARSED_DATASET_PATH = '/content/drive/MyDrive/UIT/HocKy8/KLTN/datasets/dataset1/matrix1/normal/OK_http-params-anomalous.csv'

./
.\
<!--
-->
/*
*/
--
'
/\
&
;
=
,
\
"
()
<>
*
!
#
|
~
.
:
OtherSpecialChar
SQLKeyword
HTMLKeyword
JavaScript
Command
Number
PureString
HexString
UnicodeString
MixString
LFI
RFI
CRLF
Label


# parse_data_from_request

In [183]:
# (token, feature column) pairs applied in order by parse_data_from_request.
# Each token is counted and then replaced by a space, so multi-character
# sequences must come before the single characters they contain
# ('<!--' before '--' and '<', './' before '.', '/\' before '\', ...).
_PUNCTUATION_RULES = (
    ('./', './'),
    ('.\\', '.\\'),
    ('<!--', '<!--'),
    ('-->', '-->'),
    ('/*', '/*'),
    ('*/', '*/'),
    ('--', '--'),
    ('/\\', '/\\'),   # slash followed by backslash
    ('\\', '\\'),     # a lone backslash has its own column
    ('+', 'OtherSpecialChar'),
    ('?', 'OtherSpecialChar'),
    ('&', '&'),
    (';', ';'),
    ('=', '='),
    (',', ','),
    ("'", "'"),       # every single quote counts, not only runs of three
    ('"', '"'),
    ('(', '()'),
    (')', '()'),
    ('<', '<>'),
    ('>', '<>'),
    ('*', '*'),
    ('!', '!'),
    ('$', 'OtherSpecialChar'),
    ('#', '#'),
    ('|', '|'),
    ('^', 'OtherSpecialChar'),
    ('{', 'OtherSpecialChar'),
    ('}', 'OtherSpecialChar'),
    ('%', 'OtherSpecialChar'),
    ('~', '~'),
    ('@', 'OtherSpecialChar'),
    ('.', '.'),
    ('`', 'OtherSpecialChar'),
    ('[', 'OtherSpecialChar'),
    (']', 'OtherSpecialChar'),
    (':', ':'),
)


def parse_data_from_request(request):
    """Turn one request into a row of feature counters.

    Only the first whitespace-delimited element of the first line of
    ``request`` is analysed. It is parsed as a URL, its path and query are
    lower-cased, and then:

    1. RFI and CRLF patterns are counted as substrings (they contain
       punctuation, so they can only be seen before step 2);
    2. every punctuation token of ``_PUNCTUATION_RULES`` is counted and
       replaced by a space;
    3. each remaining word is classified as SQL / HTML / OS-command / LFI /
       JavaScript keyword, or as Number / PureString / HexString /
       UnicodeString / MixString.

    Args:
        request: raw request text (str).

    Returns:
        A list of ``len(DATASET_FEATURES) - 1`` integers, ordered like
        ``DATASET_FEATURES`` without its last (label) entry and capped at 255,
        or ``[]`` when the request has no content to analyse.
    """
    # First index of every feature column (the trailing label is excluded).
    column = {}
    for position, name in enumerate(DATASET_FEATURES[:-1]):
        column.setdefault(name, position)
    counts = [0] * (len(DATASET_FEATURES) - 1)

    request = request.replace('\r', '').strip('\r\n')
    elements = request.split('\n')[0].split()
    if not elements:
        print('Invalid request: %r' % (request,))
        return []
    # NOTE(review): only the first element is kept, so anything after the
    # first space of the line is ignored - confirm the input files never
    # contain unencoded spaces inside a request.
    data = elements[0]

    ### Get URL path and query
    # NOTE(review): urlparse also exposes .params and .fragment, which are
    # not used here, so text after '#' never reaches the counters.
    pieces = [data]
    try:
        url = urlparse(data)
        pieces = [url.path, url.query]
    except ValueError:
        print('Invalid URL: %s' % (data))
    pieces = [piece.lower() for piece in pieces]

    ### Substring features, counted per piece so that the separator inserted
    ### between path and query below cannot create a match.
    substring_features = (
        ('RFI', QUERY_RFI_TRANSFORMATION),
        ('CRLF', QUERY_CRLF_TRANSFORMATION),
    )
    for feature, patterns in substring_features:
        for pattern in patterns:
            needle = pattern.lower()
            counts[column[feature]] += sum(piece.count(needle) for piece in pieces)

    data = ' '.join(pieces)

    ### Punctuation features
    for token, feature in _PUNCTUATION_RULES:
        counts[column[feature]] += data.count(token)
        data = data.replace(token, ' ')

    ### Word features. The data is lower-cased, so the keyword lists are
    ### lower-cased as well (otherwise e.g. 'fromCharCode' could never match).
    ### Lists are tested in this order; a word found in several lists is
    ### counted once, for the first list containing it (e.g. 'passwd' is an
    ### OS command before it is an LFI word).
    keyword_features = (
        ('SQLKeyword', {word.lower() for word in QUERY_SQL_KEYWORD_TRANSFORMATION}),
        ('HTMLKeyword', {word.lower() for word in QUERY_HTML_KEYWORD_TRANSFORMATION}),
        ('Command', {word.lower() for word in QUERY_OS_COMMAND_TRANSFORMATION}),
        ('LFI', {word.lower() for word in QUERY_LFI_TRANSFORMATION}),
        ('JavaScript', {word.lower() for word in QUERY_JAVASCRIPT_TRANSFORMATION}),
    )
    pattern_features = (
        ('Number', QUERY_NUMBER_TRANSFORMATION),
        ('PureString', QUERY_PURE_STR_TRANSFORMATION),
        ('HexString', QUERY_HEX_STR_TRANSFORMATION),
        ('UnicodeString', QUERY_UNICODE_STR_TRANSFORMATION),
    )
    for word in data.split():
        feature = 'MixString'
        for name, keywords in keyword_features:
            if word in keywords:
                feature = name
                break
        else:
            for name, pattern in pattern_features:
                if re.fullmatch(pattern, word):
                    feature = name
                    break
        counts[column[feature]] += 1

    # Every counter is capped at 255.
    return [min(value, 255) for value in counts]
  

# Read request from file

> Normal version



In [184]:
def read_requests_from_file(file, output_path=None, label=None):
    """Parse every request line of ``file`` and save the feature matrix as CSV.

    Each non-blank line is passed to ``parse_data_from_request``; the
    resulting counters plus the label form one row of the matrix. Every line
    is parsed exactly once (including the first and the last one), and blank
    lines or lines rejected by the parser are skipped.

    Args:
        file: path of the text file holding one request per line.
        output_path: destination CSV path; defaults to ``PARSED_DATASET_PATH``.
        label: value written to the last column of every row; defaults to
            ``DATASET_LABELS``.

    Returns:
        The number of rows written (requests successfully parsed).
    """
    if output_path is None:
        output_path = PARSED_DATASET_PATH
    if label is None:
        label = DATASET_LABELS

    expected_width = len(DATASET_FEATURES) - 1
    rows = []
    with open(file, "r") as fi:
        for ln in fi:
            request = ln.strip('\r\n')
            if not request.strip():
                # Blank line: nothing to parse.
                continue
            features = parse_data_from_request(request)
            if len(features) != expected_width:
                # The parser returns [] for requests it cannot analyse.
                continue
            rows.append(list(features) + [label])

    # Build the frame once from all rows; columns follow DATASET_FEATURES.
    df = pd.DataFrame(rows, columns=DATASET_FEATURES)
    df.to_csv(output_path, index=False)

    return len(rows)

# Result

In [185]:
# Parse the configured dataset, write the CSV and report the number of rows.
parsed_lines = read_requests_from_file(DATASET_PATH)
print('%s requests are parsed.' %parsed_lines)

1
47069 requests are parsed.
