In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.sparse import hstack
import joblib


# Stage 1/5: read the labelled query dataset into a DataFrame.
print("[1/5] Loading dataset...")

DATASET_PATH = '/content/Modified_SQL_Dataset.csv'
df = pd.read_csv(DATASET_PATH)

# NOTE(review): the "âœ“" prefix looks like a mis-decoded UTF-8 check mark — confirm the file encoding.
print(f"âœ“ Dataset loaded: {len(df)} samples")
# Per-class row counts; the messages below treat Label 0 as normal and Label 1 as malicious.
for _class_name, _class_id in (("Normal", 0), ("Malicious", 1)):
    print(f"  {_class_name}: {(df['Label'] == _class_id).sum()}")

# Stage 2/5: derive simple per-query statistics and store each one as a new df column.
print("\n[2/5] Extracting features...")

# Column name -> function computing that statistic from one query string.
# Dict order fixes the order in which the columns are added to df.
_numeric_feature_fns = {
    'length': len,
    # Characters that are neither alphanumeric nor whitespace.
    'special_chars': lambda text: sum(not (ch.isalnum() or ch.isspace()) for ch in text),
    'single_quotes': lambda text: text.count("'"),
    'equals_count': lambda text: text.count('='),
    'dash_count': lambda text: text.count('-'),
}
for _column, _compute in _numeric_feature_fns.items():
    df[_column] = df['Query'].apply(_compute)

print("âœ“ Features extracted")

# Stage 3/5: hold out a test set.
print("\n[3/5] Splitting data...")

# The raw query text is the model input; the Label column is the target.
X, y = df['Query'], df['Label']

# 80/20 split with a fixed seed; stratifying on y keeps the class ratio in both parts.
_split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

print(f"âœ“ Train: {len(X_train)} | Test: {len(X_test)}")

# Stage 4/5: turn the queries into a single feature matrix per split.
print("\n[4/5] Creating TF-IDF features...")

# Character-level TF-IDF over 1- to 3-character n-grams, capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    analyzer='char',
    lowercase=True
)

# Fit on the training queries only, then reuse the fitted vocabulary for the test queries.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Hand-crafted columns, in the order the model will see them.
_numeric_columns = ['length', 'special_chars', 'single_quotes',
                    'equals_count', 'dash_count']
numerical_features = df[_numeric_columns].values

# Select rows by index *label* with .loc. Indexing the NumPy array with
# X_train.index would treat labels as positions, which is only correct while
# df still has its default 0..n-1 index (it misaligns or raises once rows
# have been filtered). Assumes df's index labels are unique.
X_train_numerical = df.loc[X_train.index, _numeric_columns].values
X_test_numerical = df.loc[X_test.index, _numeric_columns].values

# Append the numeric columns to the right of the sparse TF-IDF matrix.
X_train_combined = hstack([X_train_vec, X_train_numerical])
X_test_combined = hstack([X_test_vec, X_test_numerical])

print("âœ“ Features combined")

# Stage 5/5: fit the classifier and score it on the held-out split.
print("\n[5/5] Training model...")

# Random-forest hyperparameters, gathered in one place.
_forest_params = dict(
    n_estimators=200,
    max_depth=30,
    min_samples_split=5,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**_forest_params)
model.fit(X_train_combined, y_train)

# Predictions and accuracy on the test split; both are reused by later reporting/saving code.
y_pred = model.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_pred)

print("âœ“ Training completed!")

# Print the evaluation summary for the test split.
_banner = '=' * 60

print(f"\n{_banner}")
print("EVALUATION RESULTS")
print(_banner)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Per-class precision/recall/F1; target_names maps label 0 -> Normal, 1 -> Malicious.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Normal', 'Malicious']))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Persist the trained artefacts to disk.
_rule = '=' * 60

print(f"\n{_rule}")
print("SAVING MODEL")
print(_rule)

# Bundle the fitted classifier with the fitted vectorizer, plus a version tag
# and the test accuracy, so they are stored and loaded as one object.
model_package = dict(
    model=model,
    vectorizer=vectorizer,
    version='1.0',
    accuracy=accuracy,
)
joblib.dump(model_package, 'sqli_detector.pkl')

print("Model saved: sqli_detector.pkl")
print("TRAINING COMPLETED SUCCESSFULLY!")
print(f"\nModel Accuracy: {accuracy*100:.2f}%")
print("Saved file: sqli_detector.pkl")

[1/5] Loading dataset...
âœ“ Dataset loaded: 30919 samples
  Normal: 19537
  Malicious: 11382

[2/5] Extracting features...
âœ“ Features extracted

[3/5] Splitting data...
âœ“ Train: 24735 | Test: 6184

[4/5] Creating TF-IDF features...
âœ“ Features combined

[5/5] Training model...
âœ“ Training completed!

EVALUATION RESULTS

Accuracy: 0.9956 (99.56%)

Classification Report:
              precision    recall  f1-score   support

      Normal       0.99      1.00      1.00      3908
   Malicious       1.00      0.99      0.99      2276

    accuracy                           1.00      6184
   macro avg       1.00      0.99      1.00      6184
weighted avg       1.00      1.00      1.00      6184


Confusion Matrix:
[[3908    0]
 [  27 2249]]

SAVING MODEL
Model saved: sqli_detector.pkl
TRAINING COMPLETED SUCCESSFULLY!

Model Accuracy: 99.56%
Saved file: sqli_detector.pkl


In [9]:
import joblib
import numpy as np
from scipy.sparse import hstack
import sys

# Restore the saved package and expose its classifier and vectorizer as globals.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load files from a trusted source.
try:
    model_package = joblib.load('sqli_detector.pkl')
except FileNotFoundError:
    # Nothing to run the detector with; report and stop.
    print("ERROR: Model file 'sqli_detector.pkl' not found!")
    sys.exit(1)
else:
    model, vectorizer = model_package['model'], model_package['vectorizer']
    print(f"Model loaded (Accuracy: {model_package['accuracy']*100:.2f}%)\n")

def detect(query):
    """Classify a single query string with the loaded model.

    Builds the same inputs the training cell used: the TF-IDF vector of the
    query followed by five counts (length, special characters, single quotes,
    '=' and '-'), in that order.

    Args:
        query: The query text to score (a ``str``).

    Returns:
        dict with:
            'is_malicious' (bool): True when the predicted class is 1.
            'score': probability the model assigns to class 1.
            'safe_score': probability the model assigns to class 0.
        A class the model does not know gets probability 0.0.
    """
    query_vec = vectorizer.transform([query])

    # One row of hand-crafted counts; column order must match training.
    features = np.array([[
        len(query),
        sum(1 for c in query if not c.isalnum() and not c.isspace()),
        query.count("'"),
        query.count('='),
        query.count('-')
    ]])

    query_combined = hstack([query_vec, features])

    # Run the forest once. The predicted class is the one with the highest
    # mean probability, so it can be derived from predict_proba directly
    # instead of paying for a second pass through predict().
    proba = model.predict_proba(query_combined)[0]
    prediction = model.classes_[np.argmax(proba)]

    # Look probabilities up by class label rather than by column position,
    # so the result does not depend on the order of model.classes_.
    proba_by_class = dict(zip(model.classes_.tolist(), proba))

    return {
        'is_malicious': bool(prediction == 1),
        'score': proba_by_class.get(1, 0.0),
        'safe_score': proba_by_class.get(0, 0.0)
    }


print("SQL Injection Detector")
print("Enter queries to test (type 'exit' to quit)\n")

# Interactive prompt: read a query, classify it, print the verdict.
# Ctrl-C anywhere in the loop ends the session with a goodbye message.
try:
    while True:
        # Reading input is handled separately from detection: if input()
        # itself fails, retrying cannot succeed, and reporting the error and
        # looping again would spin forever.
        try:
            query = input("Query: ").strip()
        except EOFError:
            # stdin was closed or exhausted (e.g. piped input ran out).
            print("\n\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}\n")
            break

        if query.lower() in ['exit', 'quit', 'q']:
            print("Goodbye!")
            break

        if not query:
            continue

        # A failure while scoring one query is reported and the prompt continues.
        try:
            result = detect(query)

            if result['is_malicious']:
                print(f"  ðŸš¨ MALICIOUS ({result['score']*100:.2f}%)")
            else:
                print(f"  âœ… SAFE ({result['safe_score']*100:.2f}%)")
            print()
        except Exception as e:
            print(f"Error: {e}\n")
except KeyboardInterrupt:
    print("\n\nGoodbye!")

Model loaded (Accuracy: 99.56%)

SQL Injection Detector
Enter queries to test (type 'exit' to quit)

Query: ' oR 1=1 --
  ðŸš¨ MALICIOUS (98.99%)



Goodbye!
