# SQLi Tokenizer

In this notebook we train a tokenizer and save it to file.

In [4]:
import os, datetime
import numpy as np
import joblib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence

## Collect HTML files

Load the HTML files

In [6]:
# Raw text of every page found in the ./html directory, one string per file.
files = []

# Resolve the input directory once instead of rebuilding the path on every use.
html_dir = os.path.join(os.getcwd(),'html')

# os.listdir returns entries in arbitrary order and later cells index files[0],
# so sort the listing to make the order (and the notebook) reproducible.
for f in sorted(os.listdir(html_dir)):
    path = os.path.join(html_dir,f)
    print(path)
    # The context manager closes the handle even if read() raises.
    with open(path,'r') as fd:
        content = fd.read()
    files.append(content)

D:\Studies\PhD\papers\Capture_The_Flag_ML\CTF-RepresentationLearning\html\correctescape.html
D:\Studies\PhD\papers\Capture_The_Flag_ML\CTF-RepresentationLearning\html\flag.html
D:\Studies\PhD\papers\Capture_The_Flag_ML\CTF-RepresentationLearning\html\jabbahome.html
D:\Studies\PhD\papers\Capture_The_Flag_ML\CTF-RepresentationLearning\html\wronglogin.html


Show a sample file

In [7]:
# Display the raw text of the first loaded page.
files[0]

'\n<!-- saved from url=(0042)http://jabba.hackingarena.no:806/index.php -->\n<html><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8"></head><body>Login as admin for the flag!\n<br>some content, some content\n<form action="http://jabba.hackingarena.no:806/index.php" method="post">\n<table width="100">\n<tbody><tr><td>Name:</td>\n<td><input type="text" name="username" value=""></td></tr>\n<tr><td>Password:</td>\n<td><input type="text" name="passwd" value=""></td></tr>\n\n<tr><td><input type="submit" value="Submit"></td></tr>\n</tbody></table>\n</form>\n\n</body></html>\n'

Add a mock file built by concatenating ten copies of the first file

In [8]:
# Append a synthetic page: the text of the first page repeated 10 times.
# Note: re-running this cell appends one more synthetic page each time.
files.append(files[0]*10)

## Defining the tokenizer

Define the tokenization, padding and trimming parameters

In [9]:
# Passed as the `lower` option of the tokenizer calls below; False keeps the
# original casing (the sample output retains e.g. 'Content-Type' and 'Submit').
lower = False
# Characters removed from the text during tokenization; the sample output in
# this notebook shows them acting as token boundaries (e.g. the dots of a host
# name). Note that <, >, =, /, :, - and ! are not listed here, so they stay
# inside the tokens (e.g. '<form', 'url=', 'text/html').
filters = '"#$%&()*+,.;?@[\\]^_`{|}~\t\n'

Show an example of tokenization

In [10]:
# Preview how the first page is split into tokens with the `lower` and
# `filters` settings chosen for the tokenizer.
text_to_word_sequence(files[0],lower=lower, filters=filters)

['<!--',
 'saved',
 'from',
 'url=',
 '0042',
 'http://jabba',
 'hackingarena',
 'no:806/index',
 'php',
 '-->',
 '<html><head><meta',
 'http-equiv=',
 'Content-Type',
 'content=',
 'text/html',
 'charset=UTF-8',
 '></head><body>Login',
 'as',
 'admin',
 'for',
 'the',
 'flag!',
 '<br>some',
 'content',
 'some',
 'content',
 '<form',
 'action=',
 'http://jabba',
 'hackingarena',
 'no:806/index',
 'php',
 'method=',
 'post',
 '>',
 '<table',
 'width=',
 '100',
 '>',
 '<tbody><tr><td>Name:</td>',
 '<td><input',
 'type=',
 'text',
 'name=',
 'username',
 'value=',
 '></td></tr>',
 '<tr><td>Password:</td>',
 '<td><input',
 'type=',
 'text',
 'name=',
 'passwd',
 'value=',
 '></td></tr>',
 '<tr><td><input',
 'type=',
 'submit',
 'value=',
 'Submit',
 '></td></tr>',
 '</tbody></table>',
 '</form>',
 '</body></html>']

## Fit the tokenizer

Learn a tokenization dictionary (vocabulary) from the pages. Note that this cell only fits the tokenizer; the pages are not converted into token sequences here.

In [11]:
# Build the tokenizer with the casing and filter settings chosen for this
# notebook, then learn its vocabulary from every collected page.
SQLi_tokenizer = Tokenizer(
    filters=filters,
    lower=lower,
)
SQLi_tokenizer.fit_on_texts(files)

## Saving the tokenizer

Save the fitted tokenizer with joblib to a file whose name carries the current timestamp

In [12]:
# Timestamp down to the microsecond so each run writes to a distinct file name.
timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S%f")
# joblib.dump returns the list of file names it wrote, which the cell displays.
joblib.dump(SQLi_tokenizer, f'ignore_tokenizer_{timestamp}')

['ignore_tokenizer_20210318153153509112']