In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')


In [2]:
data = pd.read_csv(r'Modified_SQL_Dataset.csv')


In [3]:
print(data.shape)

(30919, 2)


In [4]:
print(data.columns)

Index(['Query', 'Label'], dtype='object')


In [5]:
print(data.dtypes)

Query    object
Label     int64
dtype: object


In [6]:
print(data.info)

<bound method DataFrame.info of                                                    Query  Label
0                      " or pg_sleep  (  __TIME__  )  --      1
1      create user name identified by pass123 tempora...      1
2       AND 1  =  utl_inaddr.get_host_address   (    ...      1
3       select * from users where id  =  '1' or @ @1 ...      1
4       select * from users where id  =  1 or 1#"  ( ...      1
...                                                  ...    ...
30914              DELETE FROM door WHERE grow = 'small'      0
30915                               DELETE FROM tomorrow      0
30916                       SELECT wide ( s )  FROM west      0
30917       SELECT * FROM  ( SELECT slide FROM breath )       0
30918                           SELECT TOP 3 * FROM race      0

[30919 rows x 2 columns]>


In [7]:
encoder = OneHotEncoder(handle_unknown='ignore')
X_encoded = encoder.fit_transform(data.drop('Label', axis=1))


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, data['Label'], test_size=0.2, random_state=42)


In [9]:
random_forest_model = RandomForestClassifier(n_estimators=70, random_state=42)
random_forest_model.fit(X_train, y_train)
y_pred_rf = random_forest_model.predict(X_test)

In [10]:
print(X_encoded)

  (0, 2952)	1.0
  (1, 26088)	1.0
  (2, 1955)	1.0
  (3, 2566)	1.0
  (4, 2787)	1.0
  (5, 2904)	1.0
  (6, 29552)	1.0
  (7, 12823)	1.0
  (8, 2575)	1.0
  (9, 2570)	1.0
  (10, 14970)	1.0
  (11, 1903)	1.0
  (12, 24396)	1.0
  (13, 2671)	1.0
  (14, 28375)	1.0
  (15, 1939)	1.0
  (16, 2535)	1.0
  (17, 2700)	1.0
  (18, 2745)	1.0
  (19, 26323)	1.0
  (20, 29511)	1.0
  (21, 6623)	1.0
  (22, 29573)	1.0
  (23, 24372)	1.0
  (24, 2699)	1.0
  :	:
  (30894, 18631)	1.0
  (30895, 16250)	1.0
  (30896, 16903)	1.0
  (30897, 17159)	1.0
  (30898, 17743)	1.0
  (30899, 16476)	1.0
  (30900, 19450)	1.0
  (30901, 17468)	1.0
  (30902, 16503)	1.0
  (30903, 17892)	1.0
  (30904, 19459)	1.0
  (30905, 1985)	1.0
  (30906, 2184)	1.0
  (30907, 15346)	1.0
  (30908, 15336)	1.0
  (30909, 617)	1.0
  (30910, 524)	1.0
  (30911, 1820)	1.0
  (30912, 2402)	1.0
  (30913, 18678)	1.0
  (30914, 15034)	1.0
  (30915, 15178)	1.0
  (30916, 24150)	1.0
  (30917, 15712)	1.0
  (30918, 22120)	1.0


In [11]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f_score_rf = f1_score(y_test, y_pred_rf, average='weighted')
classification_rep_rf = classification_report(y_test, y_pred_rf)


In [12]:
print(f'Random Forest - Accuracy: {accuracy_rf}')
print(f'Random Forest - Precision: {precision_rf}')
print(f'Random Forest - Recall: {recall_rf}')
print(f'Random Forest - F-score: {f_score_rf}')
print("----------")
print(f'Random Forest - Classification Report:\n{classification_rep_rf}')


Random Forest - Accuracy: 0.630012936610608
Random Forest - Precision: 0.7669698045987862
Random Forest - Recall: 0.630012936610608
Random Forest - F-score: 0.4875188635435504
----------
Random Forest - Classification Report:
              precision    recall  f1-score   support

           0       0.63      1.00      0.77      3893
           1       1.00      0.00      0.00      2291

    accuracy                           0.63      6184
   macro avg       0.81      0.50      0.39      6184
weighted avg       0.77      0.63      0.49      6184

