In [127]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, classification_report

In [67]:
#load dataset

df = pd.read_csv('accident.csv')
df.head()

Unnamed: 0,Age,Gender,Speed_of_Impact,Helmet_Used,Seatbelt_Used,Survived
0,56,Female,27.0,No,No,1
1,69,Female,46.0,No,Yes,1
2,46,Male,46.0,Yes,Yes,0
3,32,Male,117.0,No,Yes,0
4,60,Female,40.0,Yes,Yes,0


In [68]:
#clean dataset remove the data without data 'Gender'
df['Gender'].isna().sum()
df = df.dropna(subset=['Gender'])


In [69]:
#clean dataset fill the missing data with median 'Speed_of_Impact'
df['Speed_of_Impact'].isna().sum()
df['Speed_of_Impact'] = df['Speed_of_Impact'].fillna(df['Speed_of_Impact'].median())

In [70]:
#clean dataset encode categorical data 'Gender', 'Helmet_Used', 'Seatbelt_Used'

df['Helmet_Used'] = df['Helmet_Used'].map({'Yes': 1, 'No': 0}) #set 'Yes' as 1

df['Seatbelt_Used'] = df['Seatbelt_Used'].map({'Yes': 1, 'No': 0}) #set 'Yes' as 1
 
df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0}) #set 'Male' as 1

df.head()

Unnamed: 0,Age,Gender,Speed_of_Impact,Helmet_Used,Seatbelt_Used,Survived
0,56,0,27.0,0,0,1
1,69,0,46.0,0,1,1
2,46,1,46.0,1,1,0
3,32,1,117.0,0,1,0
4,60,0,40.0,1,1,0


In [118]:
#spliting the dataset

X = df[['Age', 'Gender', 'Speed_of_Impact', 'Helmet_Used', 'Seatbelt_Used']]
y = df['Survived']

#spliting the dataset into train and test with test_size=0.2
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [119]:
df.count()

Age                199
Gender             199
Speed_of_Impact    199
Helmet_Used        199
Seatbelt_Used      199
Survived           199
dtype: int64

In [120]:
#scale the data

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [129]:
#defining three models with three hyperparameters, and evaluation scores

RandomForest = RandomForestClassifier(random_state=42)
params_r= {'n_estimators': [50, 100, 200],
           'max_depth': [None, 10, 20],
           'min_samples_split': [2, 5, 10],
           'criterion': ['gini', 'entropy', 'log_loss']
          }

LogisticRegression = LogisticRegression()
params_l = {'C': [0.1, 1, 10],  
            'solver': ['lbfgs', 'liblinear', 'saga'], 
            'max_iter': [100, 500, 1000]
           }

KNN = KNeighborsClassifier()
params_k = {'n_neighbors': [3, 5, 10],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'cosine', 'manhattan']
           }

scorers = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1': make_scorer(f1_score),
           'mcc': make_scorer(matthews_corrcoef)
          }

In [150]:
#train the random forest model and predict results with the best parameters
grid_search_rf = GridSearchCV(estimator=RandomForest, param_grid=params_r, cv=5, n_jobs=-1, scoring=scorers, refit='accuracy')

# Fit GridSearchCV to the training data
y_pred_rf = grid_search_rf.fit(X_train, y_train)

# Best hyperparameters and performance
print("Best hyperparameters:", grid_search_rf.best_params_)

# Best model performance on the test set
best_rf = grid_search_rf.best_estimator_

y_pred_rf = best_rf.predict(X_test)

# Evaluate the best model on the test set using accuracy, precision, recall, f1, and mcc
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)
mcc = matthews_corrcoef(y_test, y_pred_rf)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")


Best hyperparameters: {'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Test Accuracy: 0.5000
Test Precision: 0.4783
Test Recall: 0.5789
Test F1 Score: 0.5238
Matthews Correlation Coefficient (MCC): 0.0076


In [152]:
#train the logistic regression model and predict results with the best parameters
grid_search_lr = GridSearchCV(estimator=LogisticRegression, param_grid=params_l, cv=5, n_jobs=-1, scoring=scorers, refit='accuracy')

# Fit GridSearchCV to the training data
grid_search_lr.fit(X_train, y_train)

# Best hyperparameters and performance
print("Best hyperparameters:", grid_search_lr.best_params_)

# Best model performance on the test set
best_lr = grid_search_lr.best_estimator_
y_pred_lr = best_lr.predict(X_test)

# Evaluate the best model on the test set using accuracy, precision, recall, f1, and mcc
accuracy = accuracy_score(y_test, y_pred_lr)
precision = precision_score(y_test, y_pred_lr)
recall = recall_score(y_test, y_pred_lr)
f1 = f1_score(y_test, y_pred_lr)
mcc = matthews_corrcoef(y_test, y_pred_lr)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")



Best hyperparameters: {'C': 0.1, 'max_iter': 100, 'solver': 'liblinear'}
Test Accuracy: 0.5250
Test Precision: 0.5000
Test Recall: 0.5789
Test F1 Score: 0.5366
Matthews Correlation Coefficient (MCC): 0.0553




In [153]:
#train the KNN model and predict results with the best parameters
grid_search_k = GridSearchCV(estimator=KNN, param_grid=params_k, cv=5, n_jobs=-1, scoring=scorers, refit='accuracy')

# Fit GridSearchCV to the training data
y_pred = grid_search_k.fit(X_train, y_train)

# Best hyperparameters and performance
print("Best hyperparameters:", grid_search_k.best_params_)

# Best model performance on the test set
best_k = grid_search_k.best_estimator_
y_pred_k = best_k.predict(X_test)

# Evaluate the best model on the test set using accuracy, precision, recall, f1, and mcc
accuracy = accuracy_score(y_test, y_pred_k)
precision = precision_score(y_test, y_pred_k)
recall = recall_score(y_test, y_pred_k)
f1 = f1_score(y_test, y_pred_k)
mcc = matthews_corrcoef(y_test, y_pred_k)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")

Best hyperparameters: {'metric': 'cosine', 'n_neighbors': 5, 'weights': 'uniform'}
Test Accuracy: 0.5750
Test Precision: 0.5625
Test Recall: 0.4737
Test F1 Score: 0.5143
Matthews Correlation Coefficient (MCC): 0.1431
