# Loding Dataset & Spliting the dataset

In [1]:
import pandas as pd

# Load the data
df = pd.read_csv('../Resources/cleaned_data.csv')
df.head()

Unnamed: 0,IncidentGrade,Year,Month,Day,Hour,Minute,Second,EvidenceRole_Others,EvidenceRole_Related,DeviceId_98799,...,CountryCode_242,CountryCode_Others,State_0,State_1,State_1445,State_Others,City_0,City_1,City_10630,City_Others
0,1,2024,6,4,6,5,15,False,True,True,...,False,True,False,False,False,True,False,False,False,True
1,2,2024,6,14,3,1,25,True,False,True,...,True,False,False,False,True,False,False,False,True,False
2,2,2024,6,13,4,52,55,False,True,True,...,True,False,False,False,True,False,False,False,True,False
3,0,2024,6,10,16,39,36,False,True,True,...,True,False,False,False,True,False,False,False,True,False
4,1,2024,6,15,1,8,7,True,False,True,...,True,False,False,False,True,False,False,False,True,False


Dropping unnecessary columns which are not present in the test set

In [2]:
df.drop(columns=['Id_1', 'Id_1751', 'Id_1783', 'Id_Others', 'IncidentId_2', 'IncidentId_7', 'IncidentId_9', 'IncidentId_Others', 'AlertId_0', 'AlertId_2', 'AlertId_3', 'AlertId_Others', 'Url_0', 'Url_1', 'Url_160396', 'Url_Others'], inplace=True)

In [3]:
df.shape

(8922805, 89)

In [4]:
from sklearn.model_selection import train_test_split 

# Reducing the dataset size
sample_fraction = 0.10  # Use only 10% of the dataset
df_sampled, _ = train_test_split(
    df, 
    stratify=df['IncidentGrade'], 
    test_size=1-sample_fraction, 
    random_state=42
)

# Separate features and target
X = df_sampled.drop(columns=['IncidentGrade'])
y = df_sampled['IncidentGrade']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Check the shape of the splits
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_test.shape[0]}")

# Optional: Verify class distribution (use only if stratify=y is set)
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in validation set:")
print(y_test.value_counts(normalize=True))

Training set size: 624596
Validation set size: 267684

Class distribution in training set:
IncidentGrade
0    0.428991
1    0.351920
2    0.219089
Name: proportion, dtype: float64

Class distribution in validation set:
IncidentGrade
0    0.428991
1    0.351922
2    0.219087
Name: proportion, dtype: float64


# Balancing Target Class

In [5]:
from collections import Counter
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler

# Function to apply balancing in chunks
def balance_data(X, y, chunk_size):
    balanced_chunks = []  # To store balanced chunks
    rus = RandomUnderSampler(random_state=42)
    smote_enn = SMOTEENN(random_state=42, n_jobs=-1)

    for i in range(0, len(X), chunk_size):  # Process data in chunks
        X_chunk, y_chunk = X[i:i + chunk_size], y[i:i + chunk_size]

        # Step 1: Apply undersampling
        X_rus, y_rus = rus.fit_resample(X_chunk, y_chunk)

        # Step 2: Apply SMOTEENN to clean noisy samples and further balance
        X_res, y_res = smote_enn.fit_resample(X_rus, y_rus)

        # Store balanced chunk
        balanced_chunks.append((pd.DataFrame(X_res), pd.Series(y_res)))

    # Combine all balanced chunks into a single dataset
    X_resampled = pd.concat([chunk[0] for chunk in balanced_chunks], ignore_index=True)
    y_resampled = pd.concat([chunk[1] for chunk in balanced_chunks], ignore_index=True)

    return X_resampled, y_resampled

# Applying the function to train dataset
X_resampled, y_resampled = balance_data(X_train, y_train, chunk_size=50000)

In [6]:
# Print class distributions before and after
print("Original class distribution:", Counter(y_train))
print("Class distribution after undersampling and SMOTEENN:", Counter(y_resampled))

Original class distribution: Counter({0: 267946, 1: 219808, 2: 136842})
Class distribution after undersampling and SMOTEENN: Counter({2: 29802, 1: 24719, 0: 16925})


In [7]:
# Applying the function to train dataset
X_resampled_test, y_resampled_test = balance_data(X_test, y_test, chunk_size=50000)

In [8]:
# Print class distributions before and after
print("Original class distribution:", Counter(y_test))
print("Class distribution after undersampling and SMOTEENN:", Counter(y_resampled_test))

Original class distribution: Counter({0: 114834, 1: 94204, 2: 58646})
Class distribution after undersampling and SMOTEENN: Counter({2: 12723, 1: 10603, 0: 7036})


# Model Selection & Traning 

Building Model with cross validation

In [9]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import (
    accuracy_score, classification_report, f1_score,
    precision_score, recall_score, confusion_matrix)

def evaluate_and_predict(model, X_train, y_train, X_test, y_test):
    
    # Train on full training data
    model.fit(X_train, y_train)
    # Predict on test data
    y_val_pred = model.predict(X_test)
    
    return y_val_pred

In [10]:
def model_performance(model_name, y_resampled_test, y_val_pred):
    print(f"{model_name} Performance on Test Set:")
    print(f"Macro-F1 Score: {f1_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
    print(f"Precision: {precision_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
    print(f"Recall: {recall_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
    print(f"Accuracy: {accuracy_score(y_resampled_test, y_val_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_resampled_test, y_val_pred))

In [None]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42, max_iter=2000)
y_val_pred = evaluate_and_predict(lr, X_resampled, y_resampled, X_resampled_test, y_resampled_test)

# model_performance('Logistic Regression', y_resampled_test, y_val_pred)

In [12]:
model_performance('Logistic Regression', y_resampled_test, y_val_pred)

Logistic Regression Performance on Test Set:
Macro-F1 Score: 0.6795
Precision: 0.7153
Recall: 0.6667
Accuracy: 0.6908
Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.55      0.60      7036
           1       0.86      0.64      0.73     10603
           2       0.62      0.81      0.70     12723

    accuracy                           0.69     30362
   macro avg       0.72      0.67      0.68     30362
weighted avg       0.71      0.69      0.69     30362



In [13]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(random_state=42)
y_val_pred = evaluate_and_predict(dt, X_resampled, y_resampled, X_resampled_test, y_resampled_test)

model_performance('Decision Tree', y_resampled_test, y_val_pred)

Decision Tree Performance on Test Set:
Macro-F1 Score: 0.8741
Precision: 0.8741
Recall: 0.8741
Accuracy: 0.8825
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.81      0.81      7036
           1       0.92      0.91      0.92     10603
           2       0.89      0.89      0.89     12723

    accuracy                           0.88     30362
   macro avg       0.87      0.87      0.87     30362
weighted avg       0.88      0.88      0.88     30362



In [14]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42, n_estimators=100)
y_val_pred = evaluate_and_predict(rf, X_resampled, y_resampled, X_resampled_test, y_resampled_test)

model_performance('RandomForestClassifier', y_resampled_test, y_val_pred)

RandomForestClassifier Performance on Test Set:
Macro-F1 Score: 0.8969
Precision: 0.8942
Recall: 0.9002
Accuracy: 0.9036
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      7036
           1       0.94      0.93      0.94     10603
           2       0.92      0.90      0.91     12723

    accuracy                           0.90     30362
   macro avg       0.89      0.90      0.90     30362
weighted avg       0.91      0.90      0.90     30362



In [None]:
import xgboost as xgb

xg = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
y_val_pred = evaluate_and_predict(xg, X_resampled, y_resampled, X_resampled_test, y_resampled_test)

In [16]:
model_performance('Xgboost', y_resampled_test, y_val_pred)

Xgboost Performance on Test Set:
Macro-F1 Score: 0.8799
Precision: 0.8793
Recall: 0.8807
Accuracy: 0.8879
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82      7036
           1       0.93      0.92      0.93     10603
           2       0.89      0.90      0.90     12723

    accuracy                           0.89     30362
   macro avg       0.88      0.88      0.88     30362
weighted avg       0.89      0.89      0.89     30362



In [None]:
import lightgbm as lgm

lightgbm =  lgm.LGBMClassifier(random_state=42)
y_val_pred = evaluate_and_predict(xg, X_resampled, y_resampled, X_resampled_test, y_resampled_test)

In [18]:
model_performance('lightgbm', y_resampled_test, y_val_pred)

lightgbm Performance on Test Set:
Macro-F1 Score: 0.8799
Precision: 0.8793
Recall: 0.8807
Accuracy: 0.8879
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.83      0.82      7036
           1       0.93      0.92      0.93     10603
           2       0.89      0.90      0.90     12723

    accuracy                           0.89     30362
   macro avg       0.88      0.88      0.88     30362
weighted avg       0.89      0.89      0.89     30362



# Model Evaluation and Tuning

1. Hyperparameter Tuning with RandomizedSearchCV (for Random Forest)

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],         # Number of trees
    'max_features': ['sqrt', 'log2'],        # Features considered for splitting
    'max_depth': [10, 20, None],             # Depth of trees
    'min_samples_split': [2, 5],             # Minimum samples to split a node
    'min_samples_leaf': [1, 2],              # Minimum samples at a leaf node
    'bootstrap': [True]                      # Use of bootstrap samples
}

# Initialize Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=50,           # Number of iterations to search
    scoring='accuracy',  # Evaluation metric
    cv=3,                # Cross-validation folds
    verbose=1,
    random_state=42,
    n_jobs=-1            # Use all available cores
)

# Fit RandomizedSearchCV
random_search.fit(X_resampled, y_resampled)

# Best parameters and score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

After Hypertuning Random Forest

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Define the model and parameters for tuning
rf = RandomForestClassifier(random_state=42)
param_dist = {
    "n_estimators": [200],
    'max_features': ['log2'],
    "max_depth": [None],
    "min_samples_split": [5],
    "min_samples_leaf": [1],
    "bootstrap": [True]
}

# Initialize RandomizedSearchCV
rf_best_model = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,           # Number of iterations to search
    scoring='accuracy',  # Evaluation metric
    verbose=1,
    random_state=42,
    n_jobs=-1            # Use all available cores
)

rf_best_model.fit(X_resampled, y_resampled)
y_val_pred = rf_best_model.predict(X_resampled_test)

In [21]:
print("Random Forest Performance after hypertuning on Test Set:")
print(f"Macro-F1 Score: {f1_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Precision: {precision_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Recall: {recall_score(y_resampled_test, y_val_pred, average='macro'):.4f}")
print(f"Accuracy: {accuracy_score(y_resampled_test, y_val_pred):.4f}")
print("Classification Report:")
print(classification_report(y_resampled_test, y_val_pred))

Random Forest Performance after hypertuning on Test Set:
Macro-F1 Score: 0.8979
Precision: 0.8950
Recall: 0.9017
Accuracy: 0.9043
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.88      0.85      7036
           1       0.94      0.93      0.94     10603
           2       0.92      0.90      0.91     12723

    accuracy                           0.90     30362
   macro avg       0.89      0.90      0.90     30362
weighted avg       0.91      0.90      0.90     30362



In [22]:
# Saving the model in pickle file
import pickle

# Save the model
with open('../Resources/best_model.pkl', 'wb') as model_file:
    pickle.dump(rf_best_model, model_file)