# **Imports**

In [2]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, f1_score,roc_auc_score, roc_curve,recall_score,precision_score
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV,train_test_split
import os
from sklearn.tree import DecisionTreeClassifier
from google.colab import drive
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, LabelEncoder

# **Reading Data**

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
df = pd.read_csv("/content/gdrive/MyDrive/the projects/Acc/CleanACC.csv")

In [5]:
data = df.copy()

In [6]:
def normalize_data(df, train=True):
  numerical_columns = [col for col in df.columns if df[col].dtype != 'object']
  numerical_columns.remove("Casualty Severity")
  if train == True:
    for col in numerical_columns:
      df[col] = norm_scale.fit_transform(np.array(df[col]).reshape(-1,1))
    return df
  else: 
    for col in numerical_columns:
      df[col] = norm_scale.transform(np.array(df[col]).reshape(-1,1))
    return df

In [7]:
def standarize_data(df, train=True):
  numerical_columns = [col for col in df.columns if df[col].dtype != 'object']
  numerical_columns.remove("Casualty Severity")
  if train == True:
    for col in numerical_columns:
      df[col] = stand_scale.fit_transform(np.array(df[col]).reshape(-1,1))
    return df
  else : 
    for col in numerical_columns:
      df[col] = stand_scale.transform(np.array(df[col]).reshape(-1,1))
    return df

# **Modelling**

### **SPLITTING**

In [8]:
train_df, val_df = train_test_split(data,test_size=0.05,shuffle=True ,random_state=50 ,stratify=data["Casualty Severity"]) 

x_train = train_df.drop(columns=['Casualty Severity'])
y_train = train_df['Casualty Severity']

x_val = val_df.drop(columns=['Casualty Severity'])
y_val = val_df['Casualty Severity']

In [9]:
df_train = pd.concat([x_train,y_train], axis=1)

df_train["Casualty Severity"].value_counts()

2    2188
1     252
0      24
Name: Casualty Severity, dtype: int64

In [10]:
df_valid = pd.concat([x_val,y_val], axis=1)

df_valid["Casualty Severity"].value_counts()

2    116
1     13
0      1
Name: Casualty Severity, dtype: int64

In [11]:
global stand_scale
global norm_scale

In [12]:
stand_scale = StandardScaler()
norm_scale = MinMaxScaler()

In [13]:
df_norm_train = normalize_data(df_train, train=True)
df_norm_valid = normalize_data(df_valid, train=False)

In [14]:
xtrain= df_norm_train.drop(columns= ["Casualty Severity",'Accident Date'])
ytrain= df_norm_train["Casualty Severity"]
ytrain=ytrain.astype(int)
ytrain.value_counts()

2    2188
1     252
0      24
Name: Casualty Severity, dtype: int64

In [15]:
xtest= df_norm_valid.drop(columns=["Casualty Severity",'Accident Date' ] )
ytest= df_norm_valid["Casualty Severity"]
ytest=ytest.astype(int)
ytest.value_counts()

2    116
1     13
0      1
Name: Casualty Severity, dtype: int64

## **RandomForest**

In [16]:
modelRFC = RandomForestClassifier(n_estimators=200)

In [17]:
modelRFC.fit(xtrain,ytrain)
predsRFC = modelRFC.predict(xtest)

## **AdaBoostClassifier**

In [18]:
modelADA = AdaBoostClassifier(n_estimators=200)

In [19]:
modelADA.fit(xtrain,ytrain)
predsADA= modelADA.predict(xtest)

## **BaggingClassifier**

In [20]:
modelBagging= BaggingClassifier(n_estimators=200)

In [21]:
modelBagging.fit(xtrain,ytrain)
predsBag= modelBagging.predict(xtest)

## **LGBMClassifier**

In [22]:
modelLGBM= LGBMClassifier()

In [23]:
modelLGBM.fit(xtrain,ytrain)
predsLGBM= modelLGBM.predict(xtest)

## **SVC**

In [24]:
modelSVC= SVC()

In [25]:
modelSVC.fit(xtrain,ytrain)
predsSVC= modelSVC.predict(xtest)

In [26]:
print('recall_score for RandomForestClassifier', recall_score(ytest,predsRFC,average='micro'))
print('recall_score for AdaBoostClassifier', recall_score(ytest,predsADA,average='micro'))
print('recall_score for BaggingClassifier', recall_score(ytest,predsBag,average='micro'))
print('recall_score for LGBMClassifier', recall_score(ytest,predsLGBM,average='micro'))
print('recall_score for SVC', recall_score(ytest,predsSVC,average='micro'))

recall_score for RandomForestClassifier 0.8615384615384616
recall_score for AdaBoostClassifier 0.8384615384615385
recall_score for BaggingClassifier 0.8153846153846154
recall_score for LGBMClassifier 0.8923076923076924
recall_score for SVC 0.8923076923076924
