In [1]:
import numpy as np
import pandas as pd # data processing (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix


%matplotlib inline


In [2]:
import requests
import os
from tqdm import tqdm

In [3]:
# function for input to tqdm
def download_dataset(file_url, folder_path, name):
    file_path = os.path.join(folder_path, name)
    print(folder_path, file_path)
    
    r = requests.get(file_url, stream=True)
    
    with open(file_path, "wb") as file:
        for chunk in tqdm(r.iter_content(chunk_size=1024)):
             if chunk: file.write(chunk)

    print(f'Download complete. File saved to: {file_path}')

In [4]:
backend_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(backend_dir, 'data')
print(data_dir)

d:\jman\jman-backend\data


In [5]:
download_dataset("https://bitbucket.org/abdulwahed11314/accidents-data/raw/b7add9860d310171bca48bcaefeae37fe5157ac3/CasualtiesBig.csv", data_dir, 'casualties.csv')
download_dataset("https://bitbucket.org/abdulwahed11314/accidents-data/raw/b7add9860d310171bca48bcaefeae37fe5157ac3/AccidentsBig.csv", data_dir, 'accidents.csv')
download_dataset("https://bitbucket.org/abdulwahed11314/accidents-data/raw/b7add9860d310171bca48bcaefeae37fe5157ac3/VehiclesBig.csv", data_dir, 'vehicles.csv')

d:\jman\jman-backend\data d:\jman\jman-backend\data\casualties.csv


103368it [00:23, 4353.74it/s]


Download complete. File saved to: d:\jman\jman-backend\data\casualties.csv
d:\jman\jman-backend\data d:\jman\jman-backend\data\accidents.csv


237031it [00:48, 4911.94it/s]


Download complete. File saved to: d:\jman\jman-backend\data\accidents.csv
d:\jman\jman-backend\data d:\jman\jman-backend\data\vehicles.csv


198729it [00:50, 3905.97it/s]

Download complete. File saved to: d:\jman\jman-backend\data\vehicles.csv





In [6]:
accidents_csv_file_path = os.path.join(data_dir, 'accidents.csv')
vehicles_csv_file_path = os.path.join(data_dir, 'vehicles.csv')
casualties_csv_file_path = os.path.join(data_dir, 'casualties.csv')

In [7]:
accidents  = pd.read_csv(accidents_csv_file_path,  index_col='Accident_Index', on_bad_lines='skip')
vehicles   = pd.read_csv(vehicles_csv_file_path,   index_col='Accident_Index', on_bad_lines='skip')
casualties = pd.read_csv(casualties_csv_file_path, index_col='Accident_Index', on_bad_lines='skip')

In [8]:
first_df = pd.merge(accidents, casualties, on='Accident_Index')
df = pd.merge(first_df, vehicles, on='Accident_Index')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4287593 entries, 200501BS00001 to 2014984139614
Data columns (total 66 columns):
 #   Column                                       Dtype  
---  ------                                       -----  
 0   Location_Easting_OSGR                        float64
 1   Location_Northing_OSGR                       float64
 2   Longitude                                    float64
 3   Latitude                                     float64
 4   Police_Force                                 int64  
 5   Accident_Severity                            int64  
 6   Number_of_Vehicles                           int64  
 7   Number_of_Casualties                         int64  
 8   Date                                         object 
 9   Day_of_Week                                  int64  
 10  Time                                         object 
 11  Local_Authority_(District)                   int64  
 12  Local_Authority_(Highway)                    object 
 13 

In [10]:
print("Number of rows:", df.shape[0])

Number of rows: 4287593


In [11]:
# filtered_df = df[df['Vehicle_Type'] == 11]
# Assuming you have the filtered_df DataFrame with the 'Accident_Severity' column

input_columns = ['Age_Band_of_Driver', 'Sex_of_Driver', 'Vehicle_Type',
                 'Road_Type', 'Speed_limit', 'Junction_Control', 'Light_Conditions',
                 'Weather_Conditions', 'Road_Surface_Conditions', 'Urban_or_Rural_Area','Accident_Severity']

accident_ml = df[input_columns]

In [12]:
accident_ml.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4287593 entries, 200501BS00001 to 2014984139614
Data columns (total 11 columns):
 #   Column                   Dtype
---  ------                   -----
 0   Age_Band_of_Driver       int64
 1   Sex_of_Driver            int64
 2   Vehicle_Type             int64
 3   Road_Type                int64
 4   Speed_limit              int64
 5   Junction_Control         int64
 6   Light_Conditions         int64
 7   Weather_Conditions       int64
 8   Road_Surface_Conditions  int64
 9   Urban_or_Rural_Area      int64
 10  Accident_Severity        int64
dtypes: int64(11)
memory usage: 392.5+ MB


In [13]:
input_columns = ['Age_Band_of_Driver', 'Sex_of_Driver', 'Vehicle_Type',
                 'Road_Type', 'Speed_limit', 'Junction_Control', 'Light_Conditions',
                 'Weather_Conditions', 'Road_Surface_Conditions', 'Urban_or_Rural_Area']
target_column = 'Accident_Severity'



mask = (accident_ml[input_columns] == -1).any(axis=1)

# Use the mask to drop rows with -1 values
cleaned_accident_ml = accident_ml[~mask]

sev1=accident_ml[accident_ml['Accident_Severity'] == 1].head(100000)
sev2=accident_ml[accident_ml['Accident_Severity'] == 2].head(40000)
sev3=accident_ml[accident_ml['Accident_Severity'] == 3].head(40000)

print(sev1.shape[0])
print(sev2.shape[0])
print(sev3.shape[0])

combined_df = pd.concat([sev1, sev2, sev3], axis=0)

# Reset the index if needed
combined_df.reset_index(drop=True, inplace=True)

# Prepare the data
#X = cleaned_accident_ml[input_columns]
#y = cleaned_accident_ml[target_column]

X = combined_df[input_columns]
y = combined_df[target_column]

missing_values1 = X.isnull().sum()
missing_values2 = y.isnull().sum()
print(accident_ml.shape[0])
print(X.shape[0])
print(y.shape[0])

83607
40000
40000
4287593
163607
163607


In [14]:
print(X.shape[0])
print(y.shape[0])

163607
163607


In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, shuffle=True, random_state=99)

random_forest = RandomForestClassifier(n_estimators=200)

random_forest.fit(X_train,y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_test, y_test)
acc_random_forest1 = round(random_forest.score(X_test, y_test) * 100, 2)

sk_report = classification_report(
    digits=6,
    y_true=y_test,
    y_pred=Y_pred)
print("Accuracy" , acc_random_forest1)
print(sk_report)
pd.crosstab(y_test, Y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

print("done")

Accuracy 69.42
              precision    recall  f1-score   support

           1   0.781931  0.837864  0.808932     16776
           2   0.549706  0.268841  0.361088      8001
           3   0.600665  0.819006  0.693045      7945

    accuracy                       0.694151     32722
   macro avg   0.644101  0.641904  0.621022     32722
weighted avg   0.681137  0.694151  0.671290     32722

done


### this saves the model into a file 'accident_prediction_model.pkl'

In [17]:
import joblib
models_dir = os.path.join(backend_dir, 'models')
file_name = 'accident_prediction_model.pkl'
file_path = os.path.join(models_dir, file_name)
joblib.dump(random_forest, file_path) 

['d:\\jman\\jman-backend\\models\\accident_prediction_model.pkl']

# testing the model with sample input data

In [18]:
#sample = [4,1,11,2,30,2,1,1,1,1]

#sample = [8, 1, 11, 7, 30, 2, 5, 2, 5,1]

sample = [7, 1, 11, 
          2 ,30, 4, 1, 
          1, 1,1]

'''
['Age_Band_of_Driver', 'Sex_of_Driver', 'Vehicle_Type',
'Road_Type', 'Speed_limit', 'Junction_Control', 'Light_Conditions',
'Weather_Conditions', 'Road_Surface_Conditions', 'Urban_or_Rural_Area']
'''

# Reshape the array
sample = np.array(sample).reshape(1, -1)

print(sample)

result = random_forest.predict(sample)
print("done")

predicted_class = result[0]

print("Predicted Class:", predicted_class)


[[ 7  1 11  2 30  4  1  1  1  1]]
done
Predicted Class: 3




In [19]:
sample_input = [[7, 1, 11, 2, 30, 4, 1, 1, 1, 1]]  # Replace with your actual sample input

# Calculate class probabilities for the sample input
class_probabilities = random_forest.predict_proba(sample_input)

# Print the probabilities for each class
for class_label, probability in enumerate(class_probabilities[0]):
    print(f"Probability of being in Class {class_label + 1}: {probability:.6f}")

Probability of being in Class 1: 0.294271
Probability of being in Class 2: 0.298233
Probability of being in Class 3: 0.407497




In [20]:
# Sample class probabilities (replace with actual class probabilities)
probability_fatal = class_probabilities[0][0]  # Probability for fatal accident
probability_serious = class_probabilities[0][1]  # Probability for serious accident
probability_slight = class_probabilities[0][2]  # Probability for slight accident

# Define weights for each severity class
w_fatal = 0.5  # Weight for fatal accidents
w_serious = 0.4  # Weight for serious accidents
w_slight = 0.3  # Weight for slight accidents

# Calculate the combined probability that an accident will happen
combined_probability = (w_fatal * probability_fatal +
                        w_serious * probability_serious +
                        w_slight * probability_slight)
print(f"Combined Probability of an Accident: {combined_probability:.6f}")


Combined Probability of an Accident: 0.388677
