In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report

In [3]:
train_data = pd.read_csv('train_encoded.csv')

In [5]:
test_data = pd.read_csv('test_encoded.csv')

In [7]:
# Target and Value data

value_train = train_data.drop('IncidentGrade', axis = 1)
tar_train = train_data['IncidentGrade']

value_test = test_data.drop('IncidentGrade', axis = 1)
tar_test = test_data['IncidentGrade']

# Logistic Regression

In [9]:
model1 = LogisticRegression()

In [11]:
model1.fit(value_train, tar_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
train_pred = model1.predict(value_train)
test_pred = model1.predict(value_test)

In [45]:
# Performance Supporting Function

def get_perform(actual, pred):
    acc = accuracy_score(actual, pred) * 100
    prec = precision_score(actual, pred, average='weighted', zero_division=0.0) * 100 
    rec = recall_score(actual, pred, average='weighted') * 100 
    f1 = f1_score(actual, pred, average='weighted') * 100
    return pd.DataFrame({
        "accuracy": [acc],
        "precision": [prec],
        "recall": [rec],
        "F1 score" : [f1],
    })

In [47]:
get_perform(tar_train, train_pred)

Unnamed: 0,accuracy,precision,recall,F1 score
0,47.572044,37.001051,47.572044,40.818378


In [49]:
get_perform(tar_test, test_pred)

Unnamed: 0,accuracy,precision,recall,F1 score
0,47.862701,37.421913,47.862701,41.247446


# Decision Tree Classifier

In [53]:
model2 = DecisionTreeClassifier(max_depth= 25,criterion='gini',random_state= 56)

In [55]:
model2.fit(value_train, tar_train)

In [58]:
train_predr = model2.predict(value_train)

In [60]:
test_predr = model2.predict(value_test)

In [62]:
get_perform(tar_train, train_predr)

Unnamed: 0,accuracy,precision,recall,F1 score
0,95.576658,95.588718,95.576658,95.568643


In [64]:
get_perform(tar_test, test_predr)

Unnamed: 0,accuracy,precision,recall,F1 score
0,88.174254,88.207181,88.174254,88.183742


# XGBoost classifier

In [66]:
model3 = XGBClassifier(n_estimators=200, random_state=56, max_depth=10)

In [68]:
model3.fit(value_train, tar_train)

In [70]:
train_predx = model3.predict(value_train)
test_predx = model3.predict(value_test)

In [73]:
get_perform(tar_train, train_predx)

Unnamed: 0,accuracy,precision,recall,F1 score
0,97.492775,97.493725,97.492775,97.490989


In [75]:
get_perform(tar_test, test_predx)

Unnamed: 0,accuracy,precision,recall,F1 score
0,91.608687,91.579035,91.608687,91.587538


# Model Comparison

In [77]:
model1_perf = get_perform(tar_test, test_pred)
model2_perf = get_perform(tar_test, test_predr)
model3_perf = get_perform(tar_test, test_predx)

In [79]:
performance_df = pd.concat([model1_perf, model2_perf, model3_perf], 
                          keys=['Model 1', 'Model 2', 'Model 3'], 
                          names=['Model', None])

In [81]:
performance_df

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,precision,recall,F1 score
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Model 1,0,47.862701,37.421913,47.862701,41.247446
Model 2,0,88.174254,88.207181,88.174254,88.183742
Model 3,0,91.608687,91.579035,91.608687,91.587538
