In [3]:
pip install lightgbm

Collecting lightgbmNote: you may need to restart the kernel to use updated packages.

  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.4 MB 487.6 kB/s eta 0:00:03
   - -------------------------------------- 0.1/1.4 MB 465.5 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.4 MB 654.9 kB/s eta 0:00:03
   --- ------------------------------------ 0.1/1.4 MB 653.6 kB/s eta 0:00:03
   ------ --------------------------------- 0.2/1.4 MB 801.7 kB/s eta 0:00:02
   ------- -------------------------------- 0.3/1.4 MB 744.2 kB/s eta 0:00:02
   -------- ------------------------------- 0.3/1.4 MB 764.6 kB/s eta 0:00:02
   ---------- ----------------------------- 0.4/1.4 MB 849.3 kB/s eta 0:00:02
   ------------- -------

In [5]:
import joblib
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

In [9]:
# Loading the encoded train and test data
# Load the data into memory (no mmap_mode)
train_data = joblib.load('encoded_train_data1.joblib')
test_data = joblib.load('encoded_test_data1.joblib')


# Separateing features (X) and target variable (y)
X = train_data.drop(columns=['IncidentGrade'])
y = train_data['IncidentGrade']

# Spliting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### evaluating best model for training data

In [11]:
X_train_subsample = X_train.sample(frac=0.1, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_jobs=-1, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(n_jobs=-1, random_state=42),
    'LightGBM': LGBMClassifier(n_jobs=-1, random_state=42),
}

for model_name, model in models.items():
    print(f'Model: {model_name}')
    
    model.fit(X_train_subsample, y_train_subsample)
    
    y_pred = model.predict(X_val)
    
    # Evaluateing the models
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    # Displaying the results of the modles
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)
    print('Confusion Matrix:')
    print(cm)
    print('-' * 50)

Model: Logistic Regression
Accuracy: 0.5616696298351134
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.87      0.66    416110
           1       0.47      0.07      0.12    203156
           2       0.69      0.47      0.56    332418

    accuracy                           0.56    951684
   macro avg       0.56      0.47      0.44    951684
weighted avg       0.57      0.56      0.51    951684

Confusion Matrix:
[[363085   8387  44638]
 [161550  13874  27732]
 [167529   7316 157573]]
--------------------------------------------------
Model: Random Forest
Accuracy: 0.9336323821772773
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94    416110
           1       0.95      0.88      0.91    203156
           2       0.96      0.92      0.94    332418

    accuracy                           0.93    951684
   macro avg       0.94      0.92      0.93    951684

In [17]:
# Createing a report data
report = {
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM', 'Gradient Boosting'],
    'Accuracy': [0.56,0.93,0.96,0.92,0.89,0.8],
    'Macro-F1 Score': [0.44,0.93,0.95,0.91,0.88,0.78],
    'Precision': [0.56,0.94,0.93,0.92,0.91,0.84],
    'Recall': [0.47,0.92,0.95,0.90,0.87,0.76]
}

df = pd.DataFrame(report)

print("Comparison Table:")
print(df.to_string(index=False))

best_models_with_max_f1 = df[df['Macro-F1 Score'] == df['Macro-F1 Score'].max()]

if len(best_models_with_max_f1) > 1:
    best_model = best_models_with_max_f1.loc[best_models_with_max_f1['Accuracy'].idxmax()]
else:
    best_model = df.loc[df['Macro-F1 Score'].idxmax()]

print("\nBest Model Based on Macro-F1 Score (and Accuracy in case of a tie):")
print(best_model)

Comparison Table:
              Model  Accuracy  Macro-F1 Score  Precision  Recall
Logistic Regression      0.56            0.44       0.56    0.47
      Decision Tree      0.93            0.93       0.94    0.92
      Random Forest      0.96            0.95       0.93    0.95
            XGBoost      0.92            0.91       0.92    0.90
           LightGBM      0.89            0.88       0.91    0.87
  Gradient Boosting      0.80            0.78       0.84    0.76

Best Model Based on Macro-F1 Score (and Accuracy in case of a tie):
Model             Random Forest
Accuracy                   0.96
Macro-F1 Score             0.95
Precision                  0.93
Recall                     0.95
Name: 2, dtype: object


### Applying SMOTE to the training data for class imbalance and doing hyperparameter tuning for best result

In [21]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data1.joblib')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

if X_train_sampled.select_dtypes(include=['bool']).shape[1] > 0:
    X_train_sampled = X_train_sampled.astype(int)

# Applying SMOTE for multi-class classification (default strategy balances all classes equally)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_sampled, y_train_sampled)

# Hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 75],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search with resampled training data
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model
best_rf = random_search.best_estimator_

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", random_search.best_params_)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_tuned_model.joblib")
print("Model saved as rf_smote_tuned_model.joblib")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'n_estimators': 75, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90    416110
           1       0.87      0.84      0.85    203156
           2       0.93      0.88      0.91    332418

    accuracy                           0.89    951684
   macro avg       0.89      0.88      0.89    951684
weighted avg       0.90      0.89      0.89    951684

Confusion Matrix:
[[387516  15783  12811]
 [ 24330 170452   8374]
 [ 29291  10034 293093]]
Model saved as rf_smote_tuned_model.joblib


In [25]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data1.joblib')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

# Random Forest without SMOTE
rf_no_smote = RandomForestClassifier(random_state=42, n_jobs=-1)

# Training the model
rf_no_smote.fit(X_train_sampled, y_train_sampled)
y_pred_no_smote = rf_no_smote.predict(X_val)

print("Classification Report Without SMOTE:")
print(classification_report(y_val, y_pred_no_smote))

print("Confusion Matrix Without SMOTE:")
print(confusion_matrix(y_val, y_pred_no_smote))

import joblib
joblib.dump(rf_no_smote, "rf_no_smote_model.joblib")

Classification Report Without SMOTE:
              precision    recall  f1-score   support

           0       0.85      0.96      0.90    416110
           1       0.93      0.80      0.86    203156
           2       0.94      0.87      0.91    332418

    accuracy                           0.89    951684
   macro avg       0.91      0.88      0.89    951684
weighted avg       0.90      0.89      0.89    951684

Confusion Matrix Without SMOTE:
[[398387   7472  10251]
 [ 33405 161865   7886]
 [ 36507   5527 290384]]


['rf_no_smote_model.joblib']

### Evaluation of Best Random Forest Model on Test Data

In [28]:
# Loading the saved Random Forest model
best_rf = joblib.load("rf_smote_tuned_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data1.joblib')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.90      0.76      0.82   1630942
           1       0.66      0.88      0.75    868897
           2       0.89      0.87      0.88   1422856

    accuracy                           0.83   3922695
   macro avg       0.82      0.84      0.82   3922695
weighted avg       0.84      0.83      0.83   3922695


Macro-F1 Score: 0.82
Macro Precision: 0.82
Macro Recall: 0.84

Confusion Matrix on Test Data:
[[1238955  285289  106698]
 [  65844  761594   41459]
 [  69744  109961 1243151]]


### Applying SMOTE-ENN to the training data for class imbalance and doing hyperparameter tuning for best result
(SMOTE + Edited Nearest Neighbors)
#### SMOTE: Adds synthetic samples to balance the classes.
#### SMOTE-ENN: Adds synthetic samples and then removes noisy or ambiguous samples for better data quality.

In [32]:
# Loading the encoded train data
train_data = joblib.load('encoded_train_data1.joblib')

# Separating the features (X) and target variable (y)
X = train_data.drop('IncidentGrade', axis=1)
y = train_data['IncidentGrade']

X = X.apply(pd.to_numeric, errors='coerce')
X = X.dropna(axis=1)

# Splitting the data (80:20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Downsampling the training data to 2% for quicker processing
X_train_sampled, _, y_train_sampled, _ = train_test_split(X_train, y_train, train_size=0.02, stratify=y_train, random_state=42)

if X_train_sampled.select_dtypes(include=['bool']).shape[1] > 0:
    X_train_sampled = X_train_sampled.astype(int)

# Applying SMOTE for multi-class classification (default strategy balances all classes equally)
smote_enn = SMOTEENN(random_state=42)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_sampled, y_train_sampled)

# Hyperparameters for RandomizedSearchCV
param_dist = {
    'n_estimators': [50, 75],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}

# Random Forest Classifier
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=5,
                                   cv=3, verbose=1, random_state=42, n_jobs=-1)

# Fitting the Randomized Search with resampled training data
random_search.fit(X_train_resampled, y_train_resampled)

# Best parameters and model
best_rf = random_search.best_estimator_

# Evaluating on validation data
y_pred = best_rf.predict(X_val)

# Printing the results
print("Best Hyperparameters:", random_search.best_params_)
print("Classification Report:")
print(classification_report(y_val, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

# Saving the tuned model
joblib.dump(best_rf, "rf_smote_enn_tuned_model.joblib")
print("Model saved as rf_smote_enn_tuned_model.joblib")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best Hyperparameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None, 'bootstrap': False}
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.87      0.84    416110
           1       0.70      0.81      0.76    203156
           2       0.94      0.76      0.84    332418

    accuracy                           0.82    951684
   macro avg       0.82      0.81      0.81    951684
weighted avg       0.83      0.82      0.82    951684

Confusion Matrix:
[[361764  43289  11057]
 [ 31838 165559   5759]
 [ 54794  26006 251618]]
Model saved as rf_smote_enn_tuned_model.joblib


In [38]:
#finally predicting on test data using 
# Loading the saved Random Forest model
best_rf = joblib.load("rf_smote_tuned_model.joblib")

# Loading the test dataset
test_data = joblib.load('encoded_test_data1.joblib')

# Separateing the features and target from test data
X_test = test_data.drop('IncidentGrade', axis=1)  
y_test = test_data['IncidentGrade']

# Makeing predictions on the test data
y_test_pred = best_rf.predict(X_test)

# Evaluateing the saved model on the test data
print("\nClassification Report on Test Data:")
report = classification_report(y_test, y_test_pred, output_dict=True)
print(classification_report(y_test, y_test_pred))

macro_f1 = report['macro avg']['f1-score']
macro_precision = report['macro avg']['precision']
macro_recall = report['macro avg']['recall']

print("\nMacro-F1 Score: {:.2f}".format(macro_f1))
print("Macro Precision: {:.2f}".format(macro_precision))
print("Macro Recall: {:.2f}".format(macro_recall))
print("\nConfusion Matrix on Test Data:")
print(confusion_matrix(y_test, y_test_pred))


Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.90      0.76      0.82   1630942
           1       0.66      0.88      0.75    868897
           2       0.89      0.87      0.88   1422856

    accuracy                           0.83   3922695
   macro avg       0.82      0.84      0.82   3922695
weighted avg       0.84      0.83      0.83   3922695


Macro-F1 Score: 0.82
Macro Precision: 0.82
Macro Recall: 0.84

Confusion Matrix on Test Data:
[[1238950  285295  106697]
 [  65849  761590   41458]
 [  69747  109959 1243150]]
