In [1]:
import numpy as np
import pandas as pd
import cv2
import tensorflow as tf 
import requests

from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPooling2D,Flatten,Dropout,MaxPool2D, BatchNormalization
from sklearn import metrics

In [2]:
df=pd.read_csv('../datasets/sql-injection.csv', encoding='utf-8-sig')

df.head(5)

Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1


In [3]:
df=df[df.columns[-2:]]

df.head(5)

Unnamed: 0,Query,Label
0,""" or pg_sleep ( __TIME__ ) --",1
1,create user name identified by pass123 tempora...,1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1


In [4]:
sentences=df['Query'].values

In [5]:
def convert_to_ascii(sentence):
    sentence_ascii = []

    for i in sentence:
        if ord(i) < 8222:
            sentence_ascii.append(134)
        if ord(i) == 8221: 
            sentence_ascii.append(129)
        if ord(i) == 8220: 
            sentence_ascii.append(130)
        if ord(i) == 8216:
            sentence_ascii.append(131)
        if ord(i) == 8217: 
            sentence_ascii.append(132)
        if ord(i) == 8211: 
            sentence_ascii.append(133)

        if ord(i) <= 128:
            sentence_ascii.append(ord(i))

    if len(sentence_ascii) > 10000:
        sentence_ascii = sentence_ascii[:10000]
    else:
        sentence_ascii += [0] * (10000 - len(sentence_ascii))
    
    zer = np.array(sentence_ascii).reshape((100, 100))
    return zer

In [6]:
arr=np.zeros((len(sentences),100,100))

for i in range(len(sentences)):
    
    image=convert_to_ascii(sentences[i])

    x=np.asarray(image,dtype='float')
    image =  cv2.resize(x, dsize=(100,100), interpolation=cv2.INTER_CUBIC)
    image/=128

    arr[i]=image

In [7]:
data = arr.reshape(arr.shape[0], 100, 100, 1)

In [8]:
data.shape

(30919, 100, 100, 1)

In [9]:
y=df['Label'].values

In [10]:
trainX, testX, trainY, testY = train_test_split(data,y, test_size=0.2, random_state=42)
trainX, validX, trainY, validY = train_test_split(trainX,trainY, test_size=0.2, random_state=42)

In [11]:
model=tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(64,(3,3), activation=tf.nn.relu, input_shape=(100,100,1)),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Conv2D(128,(3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Conv2D(256,(3,3), activation='relu'),
    tf.keras.layers.MaxPooling2D(2,2),
    
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [12]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 98, 98, 64)        640       
                                                                 
 max_pooling2d (MaxPooling2  (None, 49, 49, 64)        0         
 D)                                                              
                                                                 
 conv2d_1 (Conv2D)           (None, 47, 47, 128)       73856     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 23, 23, 128)       0         
 g2D)                                                            
                                                                 
 conv2d_2 (Conv2D)           (None, 21, 21, 256)       295168    
                                                                 
 max_pooling2d_2 (MaxPoolin  (None, 10, 10, 256)       0

In [13]:
batch_size = 128
num_epoch = 10

mc = ModelCheckpoint('../models/sql-injection.h5', save_best_only=True, save_weights_only=True)
model_log = model.fit(trainX, trainY,
          batch_size=batch_size,
          epochs=num_epoch,
          verbose=1,
          validation_data=( validX,  validY),
          callbacks=[mc]
                     )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
model.load_weights('../models/sql-injection.h5')

y_prob = model.predict(testX)
threshold = 0.5
y_pred = (y_prob > threshold).astype(int)

clf_report = metrics.classification_report(testY, y_pred, digits=4)
cnf_matrix = metrics.confusion_matrix(testY, y_pred)
TN, FP, FN, TP = cnf_matrix.ravel()
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)
FNR = FN / (FN + TP)

other_metrics = pd.DataFrame({'TPR': '%.4f' % TPR, 
                              'FPR': '%.4f' % FPR, 
                              'FNR': '%.4f' % FNR
                              }, index=[0]).to_string(col_space=9, index=False)
print(clf_report)
print(cnf_matrix)
print(other_metrics)

              precision    recall  f1-score   support

           0     0.9956    0.9995    0.9976      3893
           1     0.9991    0.9926    0.9958      2291

    accuracy                         0.9969      6184
   macro avg     0.9974    0.9960    0.9967      6184
weighted avg     0.9969    0.9969    0.9969      6184

[[3891    2]
 [  17 2274]]
      TPR       FPR       FNR
   0.9926    0.0005    0.0074


In [15]:
def sql_predict(model, sentence):
    preprocessed_sentence = convert_to_ascii(sentence)
    x = np.asarray(preprocessed_sentence, dtype='float')
    x = cv2.resize(x, dsize=(100, 100), interpolation=cv2.INTER_CUBIC)
    x /= 128
    x = np.expand_dims(x, axis=0) 
    x = np.expand_dims(x, axis=-1) 
    
    prediction = model.predict(x)
    
    return prediction[0][0]

def get_html_from_url(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print("Error while getting HTML", e)
        return None

test_url = "http://youtube.com"

your_sentence = get_html_from_url(test_url)
prediction = sql_predict(model, your_sentence)
if prediction > 0.5:
    print("Prediction: 1")
else:
    print("Prediction: 0")


Prediction: 0


In [19]:
def sql_predict(model, sentence):
    preprocessed_sentence = convert_to_ascii(sentence)
    x = np.asarray(preprocessed_sentence, dtype='float')
    x = cv2.resize(x, dsize=(100, 100), interpolation=cv2.INTER_CUBIC)
    x /= 128
    x = np.expand_dims(x, axis=0) 
    x = np.expand_dims(x, axis=-1) 
    
    prediction = model.predict(x)
    
    return prediction[0][0]

your_sentence = " select * from users where id  =  1 or ""  (  ,""  =  1 or 1  =  1 -- 1"
prediction = sql_predict(model, your_sentence)
if prediction > 0.5:
    print("Prediction: 1")
else:
    print("Prediction: 0")


Prediction: 1
