## Building a Classification Model

In [97]:
# importing required packages

import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

In [98]:
# reading data 

train_data = pd.read_csv('GUIDE_Train_cl.csv')
train_data = train_data.iloc[:, 1:]
train_data.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,ResourceIdName,OSFamily,OSVersion,CountryCode,State,City,year,month,date,hour
0,249108107265,108,144844,118616,6,5,10,2,20,1,...,3586,5,66,242,1445,10630,2024,6,13,15
1,755914248188,24,71757,48212,1,1,10,2,20,1,...,3586,5,66,242,1445,10630,2024,6,7,0
2,927712937744,45,29568,358611,6,5,10,1,20,1,...,3586,5,66,242,1445,10630,2024,6,11,15
3,1022202220114,3,176549,237522,6,5,10,0,20,1,...,3586,5,66,242,1445,10630,2024,6,5,13
4,300647710795,67,7601,13711,1,1,10,0,20,1,...,3586,5,66,242,1445,10630,2024,6,9,1


In [99]:
test_data = pd.read_csv('GUIDE_Test_cl.csv')
test_data = test_data.iloc[:, 1:]
test_data

Unnamed: 0,Id,OrgId,IncidentId,AlertId,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,...,OSFamily,OSVersion,CountryCode,State,City,Usage,year,month,date,hour
0,541165881269,39,435,20106,6,5,10,0,16,1,...,5,66,242,1445,10630,1,2024,6,14,22
1,1013612285007,228,172088,237774,1,1,10,0,16,1,...,5,66,242,1445,10630,0,2024,6,15,22
2,558345751016,28,5647,41104,1,1,10,2,16,1,...,5,66,242,1445,10630,1,2024,6,6,2
3,446676600633,45,34400,80112,6,5,10,1,16,1,...,5,66,242,1445,10630,1,2024,6,11,15
4,1391569404687,107,94343,224156,1,1,10,2,16,1,...,5,66,242,1445,10630,0,2024,6,16,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34825,1657857379583,211,61605,24672,6,5,10,1,16,1,...,5,66,242,1445,10630,1,2024,6,3,15
34826,1125281436185,105,109077,448231,6,5,10,1,16,1,...,5,66,242,1445,10630,1,2024,6,11,19
34827,386547061342,69,51943,226293,1,1,10,0,16,1,...,5,66,242,1445,10630,1,2024,6,8,22
34828,1211180777817,379,694,3411,28,19,10,1,16,1,...,5,66,242,1445,10630,1,2024,6,5,10


In [100]:
train_data['IncidentGrade'].value_counts()

IncidentGrade
0    44274
2    21495
1    13510
Name: count, dtype: int64

In [101]:
# train and test 
x = train_data['IncidentGrade']
y = train_data.drop('IncidentGrade', axis=1)

In [102]:
train_x, test_x, train_y, test_y = train_test_split(y, x, test_size=0.2, random_state=42)

In [103]:
# model building 

logistic_model = LogisticRegression()

logistic_model.fit(train_x, train_y)

In [104]:
# prediction 

train_pred = logistic_model.predict(train_x)
test_pred = logistic_model.predict(test_x)

In [105]:
print(classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72     35489
           1       0.00      0.00      0.00     10760
           2       0.00      0.00      0.00     17174

    accuracy                           0.56     63423
   macro avg       0.19      0.33      0.24     63423
weighted avg       0.31      0.56      0.40     63423



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [106]:
print(classification_report(test_y, test_pred))

              precision    recall  f1-score   support

           0       0.55      1.00      0.71      8785
           1       0.00      0.00      0.00      2750
           2       0.00      0.00      0.00      4321

    accuracy                           0.55     15856
   macro avg       0.18      0.33      0.24     15856
weighted avg       0.31      0.55      0.40     15856



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Random Forest Classifier

In [108]:
## Random Forest Classifier model

ensemble_model = RandomForestClassifier(n_estimators=500, max_depth=5)

ensemble_model.fit(train_x, train_y)

In [109]:
## predictions

train_pred = ensemble_model.predict(train_x)
test_pred = ensemble_model.predict(test_x)

In [110]:
# train perfromance 

print(classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.77     35489
           1       1.00      0.20      0.33     10760
           2       0.94      0.23      0.37     17174

    accuracy                           0.65     63423
   macro avg       0.85      0.48      0.49     63423
weighted avg       0.77      0.65      0.58     63423



In [111]:
# test performance 

print(classification_report(test_y, test_pred))

              precision    recall  f1-score   support

           0       0.62      1.00      0.76      8785
           1       1.00      0.19      0.33      2750
           2       0.93      0.24      0.38      4321

    accuracy                           0.65     15856
   macro avg       0.85      0.48      0.49     15856
weighted avg       0.77      0.65      0.58     15856



## KNN classification model

In [113]:
neighbor_model = KNeighborsClassifier(n_neighbors=3, n_jobs =-1)

neighbor_model.fit(train_x, train_y)


In [114]:
# predictions 

train_pred_n = neighbor_model.predict(train_x)

print(classification_report(train_y, train_pred_n))


              precision    recall  f1-score   support

           0       0.72      0.87      0.79     35489
           1       0.70      0.48      0.57     10760
           2       0.69      0.53      0.60     17174

    accuracy                           0.71     63423
   macro avg       0.70      0.63      0.65     63423
weighted avg       0.71      0.71      0.70     63423



In [115]:
# predictions
test_pred_n = neighbor_model.predict(test_x)

print(classification_report(test_y, test_pred_n))


              precision    recall  f1-score   support

           0       0.64      0.78      0.70      8785
           1       0.49      0.33      0.39      2750
           2       0.47      0.36      0.40      4321

    accuracy                           0.58     15856
   macro avg       0.53      0.49      0.50     15856
weighted avg       0.56      0.58      0.57     15856



## Decision Tree Classifeir

In [117]:
## Decision Tree Classifer

tree = DecisionTreeClassifier( max_depth=5, criterion='entropy')
tree.fit(train_x, train_y)

In [118]:
# predictions 

train_pred = tree.predict(train_x)
test_pred = tree.predict(test_x)

In [119]:
# train_score
print(classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.66      0.97      0.79     35489
           1       0.89      0.39      0.55     10760
           2       0.82      0.32      0.46     17174

    accuracy                           0.70     63423
   macro avg       0.79      0.56      0.60     63423
weighted avg       0.74      0.70      0.66     63423



In [120]:
# test score

print(classification_report(test_y, test_pred))

              precision    recall  f1-score   support

           0       0.66      0.97      0.79      8785
           1       0.90      0.39      0.55      2750
           2       0.83      0.34      0.48      4321

    accuracy                           0.70     15856
   macro avg       0.80      0.57      0.60     15856
weighted avg       0.75      0.70      0.66     15856



## Hyperparameter Tuning

In [122]:
param_grid = {
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf' : [1, 5, 10],
    'max_features':['sqrt', 'log2', None]
}

tree = DecisionTreeClassifier(criterion='entropy', random_state=42)
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(train_x, train_y)

print(f'best parameters: {grid_search.best_params_}')
print(f'best accuracy: {grid_search.best_score_}')

best parameters: {'max_depth': 10, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
best accuracy: 0.809785044476407


In [123]:
# prediction

train_pred = grid_search.predict(train_x)
test_pred = grid_search.predict(test_x)

In [124]:
# train scores

print(classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.78      0.97      0.87     35489
           1       0.92      0.60      0.72     10760
           2       0.91      0.64      0.75     17174

    accuracy                           0.82     63423
   macro avg       0.87      0.74      0.78     63423
weighted avg       0.84      0.82      0.81     63423



In [125]:
# test scores

print(classification_report(test_y, test_pred))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86      8785
           1       0.92      0.59      0.72      2750
           2       0.91      0.64      0.75      4321

    accuracy                           0.82     15856
   macro avg       0.87      0.73      0.78     15856
weighted avg       0.84      0.82      0.81     15856



## XGBoost Algorithm

In [127]:
import xgboost as xgb
from xgboost import XGBClassifier

In [128]:
xgb_model = XGBClassifier(n_estimators=300, max_depth=5, learning_rate=0.1, subsample=0.8, random_state=42, eval_metric='logloss')

xgb_model.fit(train_x, train_y)

In [129]:
# predictions

train_pred = xgb_model.predict(train_x)

print(classification_report(train_y, train_pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95     35489
           1       0.91      0.83      0.87     10760
           2       0.92      0.89      0.90     17174

    accuracy                           0.92     63423
   macro avg       0.92      0.90      0.91     63423
weighted avg       0.92      0.92      0.92     63423



In [130]:
test_pred = xgb_model.predict(test_x)
print(classification_report(test_y, test_pred))

              precision    recall  f1-score   support

           0       0.91      0.96      0.94      8785
           1       0.91      0.80      0.85      2750
           2       0.90      0.87      0.88      4321

    accuracy                           0.91     15856
   macro avg       0.91      0.88      0.89     15856
weighted avg       0.91      0.91      0.91     15856



## testing the model in test data

In [131]:
test_val = test_data.drop(['IncidentGrade', 'Usage'],axis=1)

In [132]:
test_predictions = xgb_model.predict(test_val)

In [133]:
print(classification_report(test_predictions, test_data['IncidentGrade']))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91     19741
           1       0.68      0.81      0.74      5214
           2       0.84      0.81      0.83      9875

    accuracy                           0.86     34830
   macro avg       0.81      0.84      0.82     34830
weighted avg       0.86      0.86      0.86     34830



## Scores 

Logistic Regression:
* train score = 56%
* test score = 55%

Random forest classifier:
* train score = 65%
* test score = 65%

Decision tree classifier:
* train score = 70%
* test score = 70%

Kneighbors classifier:
* train score = 71%
* test score = 58%

Grid Search CV
* train score = 82%
* test score = 82%

XGBoost Classifer:
* train score = 92%
* test score = 91%

## The score of test data when xgboost used is = 86%