# Loding Dataset & Spliting the dataset

In [1]:
import pandas as pd

# Load the data
df = pd.read_csv('../NoteBooks/cleaned_data.csv')
df.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,IncidentGrade,EvidenceRole,DeviceId,Sha256,...,Category_Impact,Category_InitialAccess,Category_Other,Category_SuspiciousActivity,EntityType_CloudLogonRequest,EntityType_Ip,EntityType_MailMessage,EntityType_Mailbox,EntityType_Other,EntityType_User
0,455266534868,88,326,210035,58,43,0,0,98799,138268,...,False,False,False,False,False,False,False,False,False,True
1,1056561957389,809,58352,712507,423,298,0,1,98799,138268,...,False,True,False,False,False,False,False,False,True,False
2,214748368522,148,4359,188041,9,74,2,0,98799,138268,...,False,False,True,False,False,False,False,False,False,True
3,1073741827836,72,70,831157,4,3,2,0,98799,138268,...,False,True,False,False,False,False,False,False,False,True
4,223338299440,6,2472,1148,17,284,0,1,98799,138268,...,True,False,False,False,False,True,False,False,False,False


In [2]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['IncidentGrade'])  # Features
y = df['IncidentGrade']                 # Target variable   

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Check the shape of the splits
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_test.shape[0]}")

# Optional: Verify class distribution (use only if stratify=y is set)
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in validation set:")
print(y_test.value_counts(normalize=True))

Training set size: 3437166
Validation set size: 1473072

Class distribution in training set:
IncidentGrade
2    0.403098
1    0.388515
0    0.208387
Name: proportion, dtype: float64

Class distribution in validation set:
IncidentGrade
2    0.403098
1    0.388515
0    0.208387
Name: proportion, dtype: float64


In [3]:
# Reducing the dataset size
sample_fraction = 0.05  # Adjust as needed (e.g., 10% of the original dataset)
df_sampled, _ = train_test_split(
    df, 
    stratify=df['IncidentGrade'], 
    test_size=1-sample_fraction, 
    random_state=42
)

# Separate features and target
X = df_sampled.drop(columns=['IncidentGrade'])
y = df_sampled['IncidentGrade']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Check the shape of the splits
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_test.shape[0]}")

# Optional: Verify class distribution (use only if stratify=y is set)
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in validation set:")
print(y_test.value_counts(normalize=True))

Training set size: 171857
Validation set size: 73654

Class distribution in training set:
IncidentGrade
2    0.403097
1    0.388515
0    0.208388
Name: proportion, dtype: float64

Class distribution in validation set:
IncidentGrade
2    0.403101
1    0.388519
0    0.208380
Name: proportion, dtype: float64


# Balancing Target Class

In [None]:
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

# Function to apply balancing in chunks
def balance_data(X, y, chunk_size=50000, random_state=42):
    balanced_chunks = []  # To store balanced chunks
    smote = SMOTE(random_state=random_state, n_jobs=-1)  # Initialize SMOTE
    smote_enn = SMOTEENN(random_state=random_state, n_jobs=-1)  # Initialize SMOTE-ENN

    for i in range(0, len(X), chunk_size):  # Process data in chunks
        X_chunk, y_chunk = X[i:i + chunk_size], y[i:i + chunk_size]

        # Step 1: Apply SMOTE to oversample minority classes
        X_smote, y_smote = smote.fit_resample(X_chunk, y_chunk)

        # Step 2: Apply SMOTEENN to clean noisy samples and further balance
        X_res, y_res = smote_enn.fit_resample(X_smote, y_smote)

        # Store balanced chunk
        balanced_chunks.append((pd.DataFrame(X_res), pd.Series(y_res)))

    # Combine all balanced chunks into a single dataset
    X_resampled = pd.concat([chunk[0] for chunk in balanced_chunks], ignore_index=True)
    y_resampled = pd.concat([chunk[1] for chunk in balanced_chunks], ignore_index=True)

    return X_resampled, y_resampled

# Applying the function to the dataset
X_resampled, y_resampled = balance_data(X_train, y_train, chunk_size=50000)

In [5]:
# Print class distributions before and after
print("Original class distribution:", Counter(y_train))
print("Class distribution after undersampling and SMOTEENN:", Counter(y_resampled))

Original class distribution: Counter({2: 69275, 1: 66769, 0: 35813})
Class distribution after undersampling and SMOTEENN: Counter({2: 41238, 0: 36672, 1: 20121})


In [None]:
X_resampled_test, y_resampled_test = balance_data(X_test, y_test, chunk_size=50000)

In [7]:
# Print class distributions before and after
print("Original class distribution:", Counter(y_test))
print("Class distribution after undersampling and SMOTEENN:", Counter(y_resampled_test))

Original class distribution: Counter({2: 29690, 1: 28616, 0: 15348})
Class distribution after undersampling and SMOTEENN: Counter({2: 17403, 0: 15360, 1: 8315})


# Scaling Numerical Columns

In [9]:
# import scaling 4
from sklearn.preprocessing import StandardScaler

# Initialize the scalers
scaler = StandardScaler()

# Define numerical features
numerical_features = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'DetectorId', 'AlertTitle', 'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId', 'ApplicationId', 'ApplicationName', 'FileName', 'FolderPath', 'ResourceIdName', 'CountryCode', 'State', 'City']

# Fit the scaler on the training data and transform both train and test datasets
X_resampled = scaler.fit_transform(X_resampled[numerical_features])  # Compute and apply scaling on training data
X_resampled_test = scaler.transform(X_resampled_test[numerical_features])       # Apply the same transformation on test data


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

# Model Selection & Traning 

Building Model with cross validation

In [None]:
import xgboost as xgb
import lightgbm as lgm
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import (
    accuracy_score, classification_report, f1_score, 
    precision_score, recall_score, confusion_matrix
)

# Define models
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    }

# Cross-validation setup
cv = 5
kf = KFold(n_splits=cv, shuffle=True, random_state=42)

# Train and evaluate models
for model_name, model in models.items():
    print(f"Training {model_name}...")
    
    # Perform cross-validation predictions
    y_pred_cv = cross_val_predict(model, X_resampled, y_resampled, cv=kf)
    
    # Train on full training data
    model.fit(X_resampled, y_resampled)
    
    # Predictions on test data
    y_val_pred = model.predict(X_resampled_test)
    
    # Evaluation metrics on test data
    f1 = f1_score(y_resampled_test, y_val_pred, average="macro")
    precision = precision_score(y_resampled_test, y_val_pred, average="macro")
    recall = recall_score(y_resampled_test, y_val_pred, average="macro")
    accuracy = accuracy_score(y_resampled_test, y_val_pred)
    
    print(f"{model_name} Performance on Test Set:")
    print(f"Macro-F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_resampled_test, y_val_pred))
    
    # Confusion matrix and analysis on cross-validation predictions
    cm = confusion_matrix(y_resampled, y_pred_cv)
    print("Confusion Matrix (Cross-Validation):")
    print(cm)
    
    for i, class_label in enumerate(np.unique(y_resampled)):
        tp = cm[i, i]
        fp = cm[:, i].sum() - tp
        fn = cm[i, :].sum() - tp
        tn = cm.sum() - (tp + fp + fn)
        print(f"Class {class_label}: TP={tp}, FP={fp}, FN={fn}, TN={tn}")
    
    print("-" * 50)

Training Logistic Regression...
Logistic Regression Performance on Test Set:
Macro-F1 Score: 0.6531
Precision: 0.7168
Recall: 0.6542
Accuracy: 0.7312
Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.83      0.72     15360
           1       0.67      0.27      0.38      8315
           2       0.84      0.87      0.85     17403

    accuracy                           0.73     41078
   macro avg       0.72      0.65      0.65     41078
weighted avg       0.73      0.73      0.71     41078

Confusion Matrix (Cross-Validation):
[[30219  2510  3943]
 [11827  5280  3014]
 [ 5328   328 35582]]
Class 0: TP=30219, FP=17155, FN=6453, TN=44204
Class 1: TP=5280, FP=2838, FN=14841, TN=75072
Class 2: TP=35582, FP=6957, FN=5656, TN=49836
--------------------------------------------------
Training Decision Tree...
Decision Tree Performance on Test Set:
Macro-F1 Score: 0.9404
Precision: 0.9383
Recall: 0.9430
Accuracy: 0.9488
Classification Re

In [16]:
# Xgboost
model = xgb.XGBClassifier(random_state=42, n_jobs=-1)
model.fit(X_resampled, y_resampled)
y_val_pred = model.predict(X_resampled_test)

# Cross-validation setup
cv = 5
kf = KFold(n_splits=cv, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(model, X_resampled, y_resampled, cv=kf)   # perform cross-validation predictions

# Confusion matrix and analysis on cross-validation predictions
cm = confusion_matrix(y_resampled, y_pred_cv)

In [17]:
print("XGBoost Performance on Test Set:")
print(f"Macro-F1 Score: {f1_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Accuracy: {accuracy_score(y_resampled_test, y_val_pred):.4f}")
print(f"Confusion Matrix (Cross-Validation): {cm}")
print("Classification Report:")
print(classification_report(y_resampled_test, y_val_pred))

XGBoost Performance on Test Set:
Macro-F1 Score: 0.9553
Precision: 0.9532
Recall: 0.9575
Accuracy: 0.9621
Confusion Matrix (Cross-Validation): [[35590   942   140]
 [ 1047 18931   143]
 [  321   408 40509]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96     15360
           1       0.91      0.94      0.92      8315
           2       0.99      0.98      0.99     17403

    accuracy                           0.96     41078
   macro avg       0.95      0.96      0.96     41078
weighted avg       0.96      0.96      0.96     41078



In [None]:
# LightGBM
model = lgm.LGBMClassifier(random_state=42, n_jobs=-1)
model.fit(X_resampled, y_resampled)
y_val_pred = model.predict(X_resampled_test)

# Cross-validation setup
cv = 5
kf = KFold(n_splits=cv, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(model, X_resampled, y_resampled, cv=kf)   # perform cross-validation predictions

# Confusion matrix and analysis on cross-validation predictions
cm = confusion_matrix(y_resampled, y_pred_cv)

In [21]:
print("LightGBM Performance on Test Set:")
print(f"Macro-F1 Score: {f1_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Accuracy: {accuracy_score(y_resampled_test, y_val_pred):.4f}")
print(f"Confusion Matrix (Cross-Validation): \n{cm}")
print("\nClassification Report:")
print(classification_report(y_resampled_test, y_val_pred))

LightGBM Performance on Test Set:
Macro-F1 Score: 0.9497
Precision: 0.9475
Recall: 0.9522
Accuracy: 0.9573
Confusion Matrix (Cross-Validation): 
[[35325  1212   135]
 [ 1297 18680   144]
 [  387   470 40381]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     15360
           1       0.90      0.93      0.91      8315
           2       0.99      0.98      0.98     17403

    accuracy                           0.96     41078
   macro avg       0.95      0.95      0.95     41078
weighted avg       0.96      0.96      0.96     41078



# Model Evaluation and Tuning

1. Hyperparameter Tuning with RandomizedSearchCV (for Random Forest)

In [23]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Define the model and parameters for tuning
rf = RandomForestClassifier(random_state=42)
param_dist = {
    "n_estimators": [50, 100, 200],
    "max_depth": [10, 20, 30],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
    "bootstrap": [True, False]
}

# Use StratifiedKFold for balanced splits
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Perform RandomizedSearchCV
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_dist, 
    n_iter=30,  # Reduced iterations
    cv=cv,  # Stratified cross-validation
    verbose=1, 
    random_state=42, 
    n_jobs=-1
)
rf_random.fit(X_resampled, y_resampled)

# Best parameters
print("Best Parameters:", rf_random.best_params_)


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30, 'bootstrap': False}


After Hypertuning Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Define the model and parameters for tuning
rf = RandomForestClassifier(random_state=42)
param_dist = {
    "n_estimators": [200],
    "max_depth": [30],
    "min_samples_split": [2],
    "min_samples_leaf": [1],
    "bootstrap": [False]
}

# Use StratifiedKFold for balanced splits
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Perform RandomizedSearchCV
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_dist, 
    n_iter=30,  # Reduced iterations
    cv=cv,  # Stratified cross-validation
    verbose=1, 
    random_state=42, 
    n_jobs=-1
)
rf_random.fit(X_resampled, y_resampled)
y_val_pred = rf_random.predict(X_resampled_test)

In [31]:
print("Random Forest Performance on Test Set:")
print(f"Macro-F1 Score: {f1_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Accuracy: {accuracy_score(y_resampled_test, y_val_pred):.4f}")
print("Classification Report:")
print(classification_report(y_resampled_test, y_val_pred))

Random Forest Performance on Test Set:
Macro-F1 Score: 0.9602
Precision: 0.9572
Recall: 0.9635
Accuracy: 0.9662
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96     15360
           1       0.91      0.95      0.93      8315
           2       0.99      0.98      0.99     17403

    accuracy                           0.97     41078
   macro avg       0.96      0.96      0.96     41078
weighted avg       0.97      0.97      0.97     41078



In [32]:
# Saving the model in pickle file
import pickle

# Save the model
with open('model.pkl', 'wb') as model_file:
    pickle.dump(rf_random, model_file)

# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)