In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


dataset\sqli.csv
dataset\sqliv2.csv
dataset\SQLiV3.csv


In [2]:
df3 = pd.read_csv('dataset\SQLiV3.csv')

# Keep only the first two columns
df3 = df3.iloc[:, :2]
df3.columns = ['Sentence', 'Label']

# Filter out corrupted labels (keep only 0 and 1 as strings)
df3 = df3[df3['Label'].isin(['0', '1'])]

# Convert labels to integers
df3['Label'] = df3['Label'].astype(int)
df3['Label'].unique()

array([1, 0])

In [3]:
df2=pd.read_csv('dataset\sqliv2.csv' , encoding='utf-16')
df2['Label'].unique()

array([1, 0], dtype=int64)

In [4]:
df1=pd.read_csv('dataset\sqli.csv', encoding='utf-16')
df1['Label'].unique()

array([1, 0], dtype=int64)

Unnamed: 0,Sentence,Label
0,""" or pg_sleep ( __TIME__ ) --",1
2,AND 1 = utl_inaddr.get_host_address ( ...,1
3,select * from users where id = '1' or @ @1 ...,1
4,"select * from users where id = 1 or 1#"" ( ...",1
5,select name from syscolumns where id = ...,1
...,...,...
30914,DELETE FROM door WHERE grow = 'small',0
30915,DELETE FROM tomorrow,0
30916,SELECT wide ( s ) FROM west,0
30917,SELECT * FROM ( SELECT slide FROM breath ),0


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# Combine datasets
df = pd.concat([df1, df2, df3], ignore_index=True)

# Preprocessing
df['Sentence'] = df['Sentence'].astype(str)  # assuming 'Query' column has the SQL
df = df.dropna(subset=['Sentence', 'Label'])  # drop nulls
print(df.head())
df['Word_Count'] = df['Sentence'].apply(lambda x: len(x.split()))
print(df.head())
df = df[df['Word_Count'] > 2]  # remove short queries
print(df.head())


          Sentence  Label
0                a      1
1              a'       1
2            a' --      1
3  a' or 1 = 1; --      1
4                @      1
          Sentence  Label  Word_Count
0                a      1           1
1              a'       1           1
2            a' --      1           2
3  a' or 1 = 1; --      1           6
4                @      1           1
                    Sentence  Label  Word_Count
3            a' or 1 = 1; --      1           6
6   ' and 1 = 0 )  union all      1           8
7              ? or 1 = 1 --      1           6
8  x' and userid is NULL; --      1           6
9   x' and email is NULL; --      1           6


In [9]:
df['Label'].value_counts()

Label
1    23437
0    20504
Name: count, dtype: int64

In [10]:

# TF-IDF Vectorization (max 544 features)
vectorizer = TfidfVectorizer(max_features=544)
X = vectorizer.fit_transform(df['Sentence']).toarray()

# Labels
y = df['Label']

print(X.shape)  # Should be (46392, 544)
print(y.shape)



(43941, 544)
(43941,)


In [28]:
import joblib
vectorizer = TfidfVectorizer(max_features=544)
vectorizer.fit(df['Sentence'])
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [13]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the input layer
input_layer = layers.Input(shape=(544,))

# Encoder
encoder_dense1 = layers.Dense(128, activation='relu')(input_layer)
latent = layers.Dense(64, activation='relu')(encoder_dense1)

# Decoder
decoder_dense1 = layers.Dense(128, activation='relu')(latent)
output_layer = layers.Dense(544, activation='sigmoid')(decoder_dense1)

# Build the model
autoencoder = models.Model(inputs=input_layer, outputs=output_layer)

# Compile the model
autoencoder.compile(optimizer='adam', loss='mse')

# Model summary
autoencoder.summary()


In [14]:
from sklearn.model_selection import train_test_split
import numpy as np

# Example: let's assume your data is stored in a NumPy array called `X`
# X = your input data of shape (num_samples, 544)

# Split the data
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)
y_train,y_test=train_test_split(y,test_size=0.2,random_state=42)
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")
print(y_train.shape)

Training set shape: (35152, 544)
Testing set shape: (8789, 544)
(35152,)


In [15]:
# Train the model
autoencoder.fit(X_train, X_train, 
                epochs=50, 
                batch_size=32, 
                validation_data=(X_test, X_test))


Epoch 1/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 0.0276 - val_loss: 0.0016
Epoch 2/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0015 - val_loss: 0.0012
Epoch 3/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.0011 - val_loss: 9.0451e-04
Epoch 4/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 8.5958e-04 - val_loss: 7.2219e-04
Epoch 5/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 6.7642e-04 - val_loss: 5.8153e-04
Epoch 6/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 5.3604e-04 - val_loss: 4.6489e-04
Epoch 7/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 4.3678e-04 - val_loss: 4.0021e-04
Epoch 8/50
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 3.8198e-04 - v

<keras.src.callbacks.history.History at 0x1f44111c810>

In [20]:
# 6. Create Encoder Model
encoder = models.Model(inputs=input_layer, outputs=latent)

# 7. Extract Deep Features
deep_features = encoder.predict(X)

print("Deep Features shape:", deep_features.shape)

[1m1374/1374[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Deep Features shape: (43941, 64)


In [None]:
from tensorflow.keras import layers, models

# Reshape deep features for CNN
X_reshaped = deep_features.reshape(deep_features.shape[0], 64, 1)

# Input: Deep Features
input_layer = layers.Input(shape=(64, 1))  # 64 deep features

# Elastic CNN
x = layers.Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(input_layer)
x = layers.MaxPooling1D(pool_size=2)(x)

x = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(x)
x = layers.MaxPooling1D(pool_size=2)(x)

x = layers.Flatten()(x)
x = layers.Dense(64, activation='relu')(x)
output_layer = layers.Dense(1, activation='sigmoid')(x)

# Build the model
elastic_cnn_model = models.Model(inputs=input_layer, outputs=output_layer)

# Compile
elastic_cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])



# Train
elastic_cnn_model.fit(X_reshaped, y, epochs=30, batch_size=32, validation_split=0.2)




Epoch 1/30
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 8ms/step - accuracy: 0.9347 - loss: 0.1555 - val_accuracy: 0.9821 - val_loss: 0.0539
Epoch 2/30
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9917 - loss: 0.0292 - val_accuracy: 0.9965 - val_loss: 0.0120
Epoch 3/30
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9938 - loss: 0.0211 - val_accuracy: 0.9895 - val_loss: 0.0371
Epoch 4/30
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9947 - loss: 0.0188 - val_accuracy: 0.9926 - val_loss: 0.0280
Epoch 5/30
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9957 - loss: 0.0169 - val_accuracy: 0.9925 - val_loss: 0.0261
Epoch 6/30
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9966 - loss: 0.0139 - val_accuracy: 0.9964 - val_loss: 0.0112
Epoch 7/30
[1m

<keras.src.callbacks.history.History at 0x1f42cf7b910>

In [22]:
elastic_cnn_model.summary()

In [16]:
# Option | Pipeline | Comments
# 1. End-to-End DL only | SQL ➔ AE ➔ Elastic CNN ➔ Dense ➔ Attack/Benign | Simple, fast, pure deep learning
# 2. Feature Extract ➔ XGBoost | SQL ➔ AE ➔ Elastic CNN ➔ Feature vector ➔ XGBoost ➔ Attack/Benign | Often better performance (XGBoost is very strong!)

In [23]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train/Test split for XGBoost
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(deep_features, y, test_size=0.2, random_state=42)

# XGBoost Model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_xgb, y_train_xgb)

# Predict
y_pred_xgb = xgb_model.predict(X_test_xgb)

# Evaluation
print(f"XGBoost Accuracy: {accuracy_score(y_test_xgb, y_pred_xgb):.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test_xgb, y_pred_xgb))
print("Classification Report:")
print(classification_report(y_test_xgb, y_pred_xgb))



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.9975
Confusion Matrix:
[[4151    2]
 [  20 4616]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4153
           1       1.00      1.00      1.00      4636

    accuracy                           1.00      8789
   macro avg       1.00      1.00      1.00      8789
weighted avg       1.00      1.00      1.00      8789



In [25]:
# Save Autoencoder
autoencoder.save('autoencoder.h5')

# Save Encoder
encoder.save('encoder_feature_extractor.h5')

# Save CNN
elastic_cnn_model.save('elastic_cnn.h5')

# Save XGBoost
import joblib
joblib.dump(xgb_model, 'xgboost_model.pkl')




['xgboost_model.pkl']

In [34]:
# Load Autoencoder with custom loss function
autoencoder = tf.keras.models.load_model('autoencoder.h5', compile=False)
autoencoder.compile(optimizer='adam', loss=tf.keras.losses.MeanSquaredError())

# Load Encoder Feature Extractor
feature_extractor = tf.keras.models.load_model('encoder_feature_extractor.h5', compile=False)

# Load TfidfVectorizer
vectorizer = joblib.load('tfidf_vectorizer.pkl')  # Ensure you have saved the vectorizer as 'tfidf_vectorizer.pkl'

# Load XGBoost model
xgb_model = joblib.load('xgboost_model.pkl')

In [35]:

def text_to_vector(text):
    # No fit_transform here — model is already trained
    vector = vectorizer.transform([text]).toarray()
    #vector should have 544 features (or whatever your model expects)
    print("Input vector shape:", vector.shape)
    
    return vector

In [36]:
def is_sql_injection(input_text):
    # Vectorize
    input_vector = text_to_vector(input_text)  # shape (1, 544)

    # Extract Deep Features
    deep_feature = feature_extractor.predict(input_vector)  # output shape (1, 64)

    # Predict using XGBoost
    pred = xgb_model.predict(deep_feature)
    
    return pred[0] == 1

In [44]:
is_sql_injection("admin' OR '1'='1' --")  # Example SQL Injection

Input vector shape: (1, 544)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step


True