In [None]:
import numpy as np
import pandas as pd # data processing (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


%matplotlib inline


In [None]:
import requests
import os
from tqdm import tqdm

In [None]:
def download_dataset(file_url, folder_path, name, chunk_size=1024, timeout=60):
    """Stream a remote file to ``folder_path/name`` with a tqdm progress counter.

    Parameters
    ----------
    file_url : str
        URL handed to ``requests.get``.
    folder_path : str
        Destination folder; it is created when it does not exist yet.
    name : str
        File name to write inside ``folder_path``.
    chunk_size : int, optional
        Number of bytes requested per streamed chunk (default 1024).
    timeout : float, optional
        Seconds ``requests`` waits on the server before raising (default 60).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never written to disk in place of the dataset.
    """
    file_path = os.path.join(folder_path, name)
    print(folder_path, file_path)

    # open() does not create parent folders; an empty folder_path means "cwd".
    if folder_path:
        os.makedirs(folder_path, exist_ok=True)

    # The context manager releases the connection even if writing fails.
    with requests.get(file_url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(file_path, "wb") as file:
            for chunk in tqdm(r.iter_content(chunk_size=chunk_size)):
                # Skip empty chunks so nothing zero-length is written.
                if chunk:
                    file.write(chunk)

    print(f'Download complete. File saved to: {file_path}')

In [None]:
# The data folder is the 'data' directory inside the parent of the current
# working directory.
backend_dir = os.path.split(os.getcwd())[0]
data_dir = os.path.join(backend_dir, 'data')
print(data_dir)

In [None]:
# (remote URL, local file name) pairs; all three URLs point at the same commit.
dataset_sources = [
    ("https://bitbucket.org/abdulwahed11314/accidents-data/raw/b7add9860d310171bca48bcaefeae37fe5157ac3/CasualtiesBig.csv", 'casualties.csv'),
    ("https://bitbucket.org/abdulwahed11314/accidents-data/raw/b7add9860d310171bca48bcaefeae37fe5157ac3/AccidentsBig.csv", 'accidents.csv'),
    ("https://bitbucket.org/abdulwahed11314/accidents-data/raw/b7add9860d310171bca48bcaefeae37fe5157ac3/VehiclesBig.csv", 'vehicles.csv'),
]

# Download each file into data_dir, in the order listed above.
for source_url, local_name in dataset_sources:
    download_dataset(source_url, data_dir, local_name)

In [None]:
# Locations of the three downloaded CSV files inside data_dir.
accidents_csv_file_path, vehicles_csv_file_path, casualties_csv_file_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ('accidents.csv', 'vehicles.csv', 'casualties.csv')
)

In [None]:
# Read the three tables with Accident_Index as the index; lines pandas cannot
# parse are skipped instead of raising.
accidents, vehicles, casualties = (
    pd.read_csv(csv_path, index_col='Accident_Index', on_bad_lines='skip')
    for csv_path in (accidents_csv_file_path, vehicles_csv_file_path, casualties_csv_file_path)
)

In [None]:
# Join the tables on Accident_Index (pandas' default inner join): accidents
# with casualties first, then that result with vehicles.
# NOTE(review): an accident with several casualties and several vehicles
# produces one row per casualty/vehicle pairing - confirm this is intended.
first_df = accidents.merge(casualties, on='Accident_Index')
df = first_df.merge(vehicles, on='Accident_Index')

In [None]:
# Print pandas' summary of the merged frame (columns, dtypes, memory usage).
df.info()

In [None]:
# Report how many rows the merged frame contains.
print("Number of rows:", len(df))

In [None]:
# Columns kept for modelling: ten predictor fields followed by the
# Accident_Severity label as the last entry.
input_columns = [
    'Age_Band_of_Driver',
    'Sex_of_Driver',
    'Vehicle_Type',
    'Road_Type',
    'Speed_limit',
    'Junction_Control',
    'Light_Conditions',
    'Weather_Conditions',
    'Road_Surface_Conditions',
    'Urban_or_Rural_Area',
    'Accident_Severity',
]

# Narrow the merged frame down to just those columns.
accident_ml = df[input_columns]

In [None]:
# Print pandas' summary of the modelling subset (columns, dtypes, memory usage).
accident_ml.info()

In [None]:
# Predictor columns fed to the model and the label column it learns.
input_columns = ['Age_Band_of_Driver', 'Sex_of_Driver', 'Vehicle_Type',
                 'Road_Type', 'Speed_limit', 'Junction_Control', 'Light_Conditions',
                 'Weather_Conditions', 'Road_Surface_Conditions', 'Urban_or_Rural_Area']
target_column = 'Accident_Severity'

# Flag every row in which at least one predictor equals -1, then drop them.
# NOTE(review): -1 is presumably the dataset's "missing / unknown" code - confirm.
mask = (accident_ml[input_columns] == -1).any(axis=1)
cleaned_accident_ml = accident_ml[~mask]

# Build the per-severity subsets from the CLEANED frame so rows carrying -1
# codes never reach the model (sampling from accident_ml would bypass the
# filter computed above).
# NOTE(review): head() keeps the first rows in file order rather than a random
# draw; consider .sample(n, random_state=...) if ordering bias matters.
severity_labels = cleaned_accident_ml[target_column]
sev1 = cleaned_accident_ml[severity_labels == 1].head(100000)
sev2 = cleaned_accident_ml[severity_labels == 2].head(40000)
sev3 = cleaned_accident_ml[severity_labels == 3].head(40000)

print(sev1.shape[0])
print(sev2.shape[0])
print(sev3.shape[0])

# Stack the subsets; ignore_index=True gives a fresh 0..n-1 index without
# mutating the frame in place afterwards.
combined_df = pd.concat([sev1, sev2, sev3], axis=0, ignore_index=True)

# Features / label used for training.
X = combined_df[input_columns]
y = combined_df[target_column]

# Null counts per predictor column and for the label; kept for inspection.
missing_values1 = X.isnull().sum()
missing_values2 = y.isnull().sum()
print(accident_ml.shape[0])
print(X.shape[0])
print(y.shape[0])

In [None]:
# Row counts of the feature frame and the label series.
print(len(X))
print(len(y))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=99)

# Seed the forest as well: without random_state every run grows different
# trees, so the saved model and the reported metrics would change run to run.
random_forest = RandomForestClassifier(n_estimators=200, random_state=99)

random_forest.fit(X_train, y_train)
Y_pred = random_forest.predict(X_test)

# Accuracy in percent, computed from the predictions already made above
# (same value as random_forest.score(X_test, y_test), without re-predicting).
acc_random_forest1 = round(accuracy_score(y_test, Y_pred) * 100, 2)

sk_report = classification_report(
    digits=6,
    y_true=y_test,
    y_pred=Y_pred)
print("Accuracy" , acc_random_forest1)
print(sk_report)
print("done")

# Actual-vs-predicted counts with totals. Kept as the cell's last expression
# so the notebook actually renders the table.
pd.crosstab(y_test, Y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

### Save the trained model to the file 'accident_prediction_model.pkl'

In [None]:
import joblib

# Persist the trained forest to <backend_dir>/models/accident_prediction_model.pkl.
models_dir = os.path.join(backend_dir, 'models')
# joblib.dump does not create missing folders, so make sure the target exists.
os.makedirs(models_dir, exist_ok=True)
file_name = 'accident_prediction_model.pkl'
file_path = os.path.join(models_dir, file_name)
joblib.dump(random_forest, file_path)

# Test the model with sample input data

In [None]:
# One hand-written scenario; values are listed in the order of input_columns:
#   Age_Band_of_Driver, Sex_of_Driver, Vehicle_Type,
#   Road_Type, Speed_limit, Junction_Control, Light_Conditions,
#   Weather_Conditions, Road_Surface_Conditions, Urban_or_Rural_Area
# Alternative scenarios tried earlier:
#   [4, 1, 11, 2, 30, 2, 1, 1, 1, 1]
#   [8, 1, 11, 7, 30, 2, 5, 2, 5, 1]
sample = [7, 1, 11,
          2, 30, 4, 1,
          1, 1, 1]

# Reshape the flat list into a single-row 2-D array (one sample, ten features).
sample = np.array(sample).reshape(1, -1)

print(sample)

# The forest was fitted on a DataFrame, so predict on a frame carrying the
# same column names. A bare array makes scikit-learn warn that X has no valid
# feature names and gives no protection against a wrong column order.
sample_frame = pd.DataFrame(sample, columns=input_columns)
result = random_forest.predict(sample_frame)
print("done")

# predict returns one label per row; there is only one row here.
predicted_class = result[0]

print("Predicted Class:", predicted_class)


In [None]:
# One row of feature values, ordered like input_columns. Replace with your actual sample input.
sample_input = [[7, 1, 11, 2, 30, 4, 1, 1, 1, 1]]

# Calculate class probabilities for the sample input. A named frame is used
# because the forest was fitted on a DataFrame with these column names.
class_probabilities = random_forest.predict_proba(
    pd.DataFrame(sample_input, columns=input_columns)
)

# predict_proba orders its columns like random_forest.classes_, so label each
# probability with the real class value instead of assuming "position + 1".
for class_label, probability in zip(random_forest.classes_, class_probabilities[0]):
    print(f"Probability of being in Class {class_label}: {probability:.6f}")

In [None]:
# Map each severity label to its predicted probability. predict_proba columns
# follow random_forest.classes_, so looking values up by label stays correct
# even if a class is absent from the training data (it then counts as 0.0).
probability_by_class = {
    int(label): probability
    for label, probability in zip(random_forest.classes_, class_probabilities[0])
}
probability_fatal = probability_by_class.get(1, 0.0)  # Probability for fatal accident
probability_serious = probability_by_class.get(2, 0.0)  # Probability for serious accident
probability_slight = probability_by_class.get(3, 0.0)  # Probability for slight accident

# Define weights for each severity class
w_fatal = 0.5  # Weight for fatal accidents
w_serious = 0.4  # Weight for serious accidents
w_slight = 0.3  # Weight for slight accidents

# Weighted sum of the three severity probabilities.
# NOTE(review): the class probabilities sum to 1 and the weights sum to 1.2,
# so this value always lies between the smallest and largest weight. It reads
# as a severity-weighted score rather than the chance that an accident
# happens - confirm the intended interpretation.
combined_probability = (w_fatal * probability_fatal +
                        w_serious * probability_serious +
                        w_slight * probability_slight)
print(f"Combined Probability of an Accident: {combined_probability:.6f}")
