In [1]:
import pandas as pd
df = pd.read_csv("./data/sqli.csv", encoding='utf-16')

In [3]:
df['Sentence'] = df['Sentence'].apply(lambda x: str(x).lower())

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))

In [7]:
X = vectorizer.fit_transform(df['Sentence'].values.astype('U')).toarray()
y = df['Label']

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
from sklearn.naive_bayes import GaussianNB
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

In [10]:
from sklearn.metrics import accuracy_score
y_pred = nb_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.9773809523809524

In [21]:
import nltk
from nltk.tokenize import RegexpTokenizer

def RETokenizer(query):
    pattern = r'\$?\w+'
    tokenizer = RegexpTokenizer(pattern)
    tokens = tokenizer.tokenize(query)
    return tokens

In [None]:
queries = []

vulnerable = []
for q in queries:
    tokens = RETokenizer(q)
    variables = [token for token in tokens if token.startswith("$") and q.index("WHERE") and q.index(token) > q.index("WHERE")]
    if len(variables) > 0:
        vulnerable.append((q, variables))

In [11]:
query = "SELECT * FROM orders WHERE order_date >= '$start_date' AND order_total < $max_total AND order_status = $status AND order_id IN ($order_ids) AND order_amount BETWEEN :min_amount AND $max_amount AND customer_id = $customer_id AND order_is_active = $is_active"
query_features = vectorizer.transform([query]).toarray()
prediction = nb_clf.predict(query_features)[0]

In [15]:
tokens = RETokenizer(query)

In [16]:
def sanitize(token):
   return token.replace("'", "''")

In [17]:
variables = []
for token in tokens:
    if token.startswith("$") and query.index(token) > query.index("WHERE"):
        datatype = "int"
        if "date" in token:
            datatype = "date"
        if token.startswith("$is"):
            datatype = "bool"
        if query[query.index(token) - 1] in ["'", "\""]:
            datatype = "str"
        variables.append((token, datatype))


In [18]:
pstmt = query
for name, datatype in variables:
    placeholder = "?"
    if datatype == "date":
        placeholder = "TO_DATE(?, 'YYYY-MM-DD')"
    pstmt = pstmt.replace(name, placeholder, 1)

In [19]:
print("Statement type", tokens[0])
print(pstmt)
for i, var in enumerate(variables):
    print("bind(" + var[0] + ", " + str(i) + ")")

Statement type SELECT
SELECT * FROM orders WHERE order_date >= '?' AND order_total < ? AND order_status = ? AND order_id IN (?) AND order_amount BETWEEN :min_amount AND ? AND customer_id = ? AND order_is_active = ?
bind($start_date, 0)
bind($max_total, 1)
bind($status, 2)
bind($order_ids, 3)
bind($max_amount, 4)
bind($customer_id, 5)
bind($is_active, 6)
