In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score

# load the data

In [None]:
# Read the primary dataset; the file is decoded as UTF-16 text.
df = pd.read_csv(
    './dataset/sqliv2.csv',
    encoding='utf-16',
)

# EDA for the dataset

In [None]:
# Dimensions of the loaded dataset as (rows, columns).
df.shape

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Show the rows whose 'Sentence' value is missing.
df[df['Sentence'].isna()]

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Renumber the rows 0..n-1 and throw away the old index.
df = df.reset_index(drop=True)

In [None]:
# Summary of columns, non-null counts and dtypes at this point.
df.info()

In [None]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each distinct row, then renumber the index.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Re-count the duplicated rows that remain.
df.duplicated().sum()

In [None]:
# Summary of columns, non-null counts and dtypes at this point.
df.info()

# Check the class distribution

In [None]:
# Tabulate how many rows carry each label, as a one-column frame.
class_distribution = df['Label'].value_counts().to_frame()

In [None]:
# Display the per-label row counts.
class_distribution

# Bar plot that shows the classes distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning from here on (applies to the whole session).
warnings.filterwarnings("ignore")

# Bar chart with one bar per label value.
plt.figure(figsize=(6, 6))
ax = sns.barplot(
    data=class_distribution,
    x='Label',
    y='count',
    palette=['blue', 'red'],
)

# Replace the tick text at positions 0 and 1 with readable class names.
plt.xticks(ticks=[0, 1], labels=['Clean', 'SQLi'])

# Write each bar's height as an integer, nudged 5 points above the bar top.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_height)}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 5),
        textcoords='offset points',
    )

plt.title('Count of Labels')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()


# Plot to check whether the labels are ordered

In [None]:
import matplotlib.pyplot as plt

# One marker per row (no connecting line): label value against row index.
# Long runs of a single value would show that the rows are grouped by label.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df['Label'], linestyle='', marker='o', color='b')
ax.set_title('Sequence of Labels')
ax.set_xlabel('Row Index')
ax.set_ylabel('Label')
ax.set_yticks([0, 1])
ax.set_yticklabels(["Clean", "SQLi"])
ax.grid(True)
plt.show()


In [None]:
# Display the label column.
df['Label']

# Data Augmentation

In [None]:
# Read the second dataset, used below to add more rows to df.
df2 = pd.read_csv(
    './dataset/payload_full.csv',
)


In [None]:
# Display the second dataset as loaded.
df2

In [None]:
# Dimensions of the second dataset as (rows, columns).
df2.shape

In [None]:
# Preview the first rows of the second dataset.
df2.head()

In [None]:
# Remove the 'length' and 'label' columns, which are not used further.
df2 = df2.drop(columns=['length', 'label'])

In [None]:
# Preview the second dataset after the column removal.
df2.head()

In [None]:
# List the distinct values found in the 'attack_type' column.
df2['attack_type'].unique()

In [None]:
# Show the remaining column names and their order.
df2.columns

In [None]:
# Rename the columns by position to the 'Sentence'/'Label' names used by df.
# NOTE(review): this assumes exactly two columns remain, with the payload text
# first and 'attack_type' second -- confirm against the df2.columns output.
df2.columns=['Sentence','Label']

In [None]:
# Count the rows per value of the renamed 'Label' column.
df2['Label'].value_counts()

In [None]:
# Keep only the rows labelled 'sqli'. The explicit .copy() makes df2 an
# independent frame rather than a slice of the unfiltered data, so later
# column assignments on df2 are unambiguous and do not raise pandas'
# SettingWithCopyWarning.
df2 = df2[df2['Label'] == 'sqli'].copy()

In [None]:
# Count the missing values in each column of the second dataset.
df2.isna().sum()

In [None]:
# Number of rows in the second dataset that exactly repeat an earlier row.
df2.duplicated().sum()

### Change the label from SQLi to 1, to match the first dataset structure

In [None]:
# Overwrite the whole 'Label' column with the integer 1 so these rows use the
# same numeric label as the first dataset.
df2['Label']=1

In [None]:
# Confirm that every row of the second dataset now has the same label.
df2['Label'].value_counts()

In [None]:
# Summary of columns, non-null counts and dtypes for the second dataset.
df2.info()

In [None]:
# Append the second dataset's rows to df and renumber the index 0..n-1.
df = pd.concat([df, df2], ignore_index=True)

In [None]:
# Dimensions of the combined dataset as (rows, columns).
df.shape

In [None]:
# Tabulate the per-label row counts of the combined dataset.
after_redistribution = df['Label'].value_counts().to_frame()

In [None]:
# Display the per-label row counts of the combined dataset.
after_redistribution

In [None]:
# Number of rows in the combined dataset that exactly repeat an earlier row.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each distinct row, then renumber the index.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Suppress warnings
warnings.filterwarnings("ignore")

# Recompute the label counts from the current df. `after_redistribution` was
# built before drop_duplicates() ran on the combined data, so plotting it
# would show counts that still include the rows removed since then.
label_counts_current = pd.DataFrame(df['Label'].value_counts())

# Customized bar plot
plt.figure(figsize=(6, 6))
ax = sns.barplot(x='Label', y='count', data=label_counts_current, palette=['blue', 'red'])

# Write each bar's height as an integer, nudged 5 points above the bar top.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                xytext=(0, 5),
                textcoords='offset points')

# Replace the tick text at positions 0 and 1 with readable class names.
plt.xticks(ticks=[0, 1], labels=['Clean', 'SQLi'])
plt.title('Count of Labels')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()

In [None]:
# One marker per row (no connecting line): label value against row index,
# drawn for the combined dataset.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df['Label'], linestyle='', marker='o', color='b')
ax.set_title('Sequence of Labels')
ax.set_xlabel('Row Index')
ax.set_ylabel('Label')
ax.set_yticks([0, 1])
ax.grid(True)
plt.show()

In [None]:
# Shuffle all rows with a fixed seed (frac=1 keeps every row), then renumber
# the index 0..n-1 so it no longer reflects the pre-shuffle order.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

In [None]:
# Preview the first rows after shuffling.
df.head()

# train test split

In [None]:
# Features are the raw 'Sentence' text; the target is the 'Label' column.
x, y = df['Sentence'], df['Label']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Grid Search to find best vectorizer parameters

In [None]:
# Dimensions of the full dataset that was split above, as (rows, columns).
df.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer.
# - token_pattern keeps words of two or more word characters, the `--`
#   sequence, and the single characters " ' ; # - + ( ) / * as tokens.
# - ngram_range=(1, 2) builds features from single tokens and token pairs.
# - min_df=2 / max_df=0.9 drop terms seen in fewer than 2 documents or in
#   more than 90% of them; max_features caps the vocabulary at 5000 terms.
tfidf_vctorizer = TfidfVectorizer(
    max_features=5000,
    analyzer='word',
    token_pattern=r'(?u)\b\w\w+\b|--|[\"\';#\-+()/\*\*/]',
    max_df=0.9,
    min_df=2,
    norm='l2',
    ngram_range=(1, 2),
)

# Learn the vocabulary and IDF weights from the training split only.
x_train_vectorized = tfidf_vctorizer.fit_transform(x_train)

# Project the test split onto the vocabulary learned from the training data.
# This must be transform(), not fit_transform(): refitting on x_test would
# build a different vocabulary, so the test columns would no longer match the
# features the model is trained on, and test data would leak into the
# feature extraction.
x_test_vectorized = tfidf_vctorizer.transform(x_test)

### Train RandomForestClassifier

In [None]:
# Random forest classifier. class_weight gives label 1 five times the weight
# of label 0; n_jobs=-1 uses all available cores; the seed fixes the result.
model = RandomForestClassifier(
    class_weight={0: 1, 1: 5},
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)

# Fit on the vectorized training split.
model.fit(x_train_vectorized, y_train)

### Random Search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint,uniform

# Search space for the randomized search.
# scipy.stats.randint(low, high) draws integers from low up to high - 1,
# so the upper bound itself is never sampled.
param_dist = {
    'n_estimators': randint(100, 300),       # integers 100..299
    'max_depth': [None, 10, 20],             # fixed choices
    'min_samples_split': randint(2, 10),     # integers 2..9
    'min_samples_leaf': randint(1, 4),       # integers 1..3
    'class_weight': [                        # fixed choices
        {0: 1, 1: 2},
        {0: 1, 1: 3},
        {0: 1, 1: 5},
        'balanced',
    ],
}

# Randomized search: 10 sampled configurations, each scored by recall with
# 5-fold cross-validation. The estimator is the random forest defined above.
rand_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    random_state=42,
)

# Run the search on the vectorized training split.
rand_search.fit(x_train_vectorized, y_train)

# Report the best configuration and its mean cross-validated recall.
print("Best Parameters:", rand_search.best_params_)
print("Best Recall Score:", rand_search.best_score_)

### Grid Search To find best class_weight

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid over the forest hyper-parameters and the class weights:
# 3 * 3 * 3 * 3 * 4 = 324 combinations, each evaluated with 5-fold CV
# (1,620 model fits in total).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [
        {0: 1, 1: 2},
        {0: 1, 1: 3},
        {0: 1, 1: 5},
        'balanced',
    ],
}

# Grid search scored by recall; the estimator is the random forest defined
# above.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='recall',
    n_jobs=-1,
)

# Run the search on the vectorized training split.
grid_search.fit(x_train_vectorized, y_train)

# Report the best configuration and its mean cross-validated recall.
print("Best Parameters:", grid_search.best_params_)
print("Best Recall Score:", grid_search.best_score_)