# Importing Dataset and Libraries


In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.target import FeatureCorrelation


In [None]:
original_data_directory = "ELG-5142-Project-Group-6/"

if not(os.path.exists(original_data_directory.split("/")[0])):
  !git clone https://github.com/ANISHG-26/ELG-5142-Project-Group-6.git
os.chdir(original_data_directory)

In [3]:
# File name of the collision table, resolved against the current working directory.
data = "2019_Tabular_Transportation_Collision_Data.csv"

# Load the whole CSV into the working frame used by the rest of the notebook.
df = pd.read_csv(filepath_or_buffer=data)

In [4]:
# Display the column labels of the freshly loaded frame.
df.columns

Index(['Anom_ID', 'Accident_Date', 'Accident_Time', 'Location', 'Geo_ID',
       'Accident_Location', 'Classification_of_Accident',
       'Initial_Impact_Type', 'Environment_Condition', 'Light',
       'Road_Surface_Condition', 'Traffic_Control',
       'Traffic_Control_Condition', 'No__of_Vehicles', 'No__of_Bicycles',
       'No__of_Motorcycles', 'No__of_Pedestrians', 'Max_Injury',
       'No__of_Injuries', 'No__of_Minimal', 'No__of_Minor', 'No__of_Major',
       'No__of_Fatal', 'X', 'Y', 'Latitude', 'Longitude', 'ObjectId'],
      dtype='object')

In [None]:
# Print the frame summary (column dtypes and non-null counts).
df.info()

In [None]:
# First rows, transposed so each column of the wide frame becomes a row.
df.head().transpose()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Number of missing values per column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# How many rows share each Geo_ID value, most frequent first.
df['Geo_ID'].value_counts()

__3ZA2Y4    61
2237        58
__3ZA2R8    57
12421       48
2707        43
            ..
3716         1
__3ZBNEB     1
__3ZA3KQ     1
11540        1
__3ZA599     1
Name: Geo_ID, Length: 6072, dtype: int64

# Dropping Unrelated Columns and adding Injured Column


In [5]:
# Remove the injury-breakdown columns and the row identifiers.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second run no longer raises KeyError for columns
# that are already gone.
df = df.drop(
    columns=['Max_Injury', 'No__of_Minimal', 'No__of_Minor', 'No__of_Major',
             'No__of_Fatal', 'Anom_ID', 'ObjectId'],
    errors='ignore',
)

In [6]:
# Remove the date/time and location-identifier columns.
# errors='ignore' and rebinding make the cell safe to run more than once.
df = df.drop(
    columns=['Accident_Date', 'Accident_Time', 'Location', 'Geo_ID'],
    errors='ignore',
)

In [7]:
# Binary target: 1 when the collision has at least one injury, 0 otherwise.
# Vectorised comparison instead of a per-row lambda; missing counts compare as
# False and therefore map to 0, exactly as before.
df['Injured'] = (df['No__of_Injuries'] > 0).astype('int64')

In [8]:
# Class balance of the binary target.
df['Injured'].value_counts()

0    13678
1     2721
Name: Injured, dtype: int64

In [None]:
# Rows with more than one injury, transposed for readability.
multi_injury_mask = df['No__of_Injuries'] > 1
df.loc[multi_injury_mask].transpose()

In [None]:
# Correlate numeric columns only. At this point the frame still holds the raw
# text categories (they are encoded further down), and DataFrame.corr() on
# non-numeric columns is deprecated in pandas 1.5 and an error in pandas 2.x.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr)

# Exploratory Data Analysis

In [None]:
# Accident_Date is dropped from `df` in the column-dropping section above, so
# fall back to re-reading just that column from the CSV. Only columns have been
# removed from `df` so far, so its rows still line up with the file by index.
if 'Accident_Date' in df.columns:
    accident_dates = pd.to_datetime(df['Accident_Date'])
else:
    accident_dates = pd.to_datetime(
        pd.read_csv(data, usecols=['Accident_Date'])['Accident_Date']
    )
assert len(accident_dates) == len(df), "date column no longer matches df row-for-row"

# Group by accident date and sum the 'Injured' flag, i.e. count the collisions
# with at least one injury on each date
injuries_by_date = df['Injured'].groupby(accident_dates).sum()
dates_with_more_than_14_injuries = injuries_by_date[injuries_by_date > 14]
print(dates_with_more_than_14_injuries)

# Plot the graph
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(injuries_by_date.index, injuries_by_date.values, color='blue', marker='o', linestyle='-')
ax.set_title('Number of Injuries Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Injuries')
ax.grid(True)
ax.tick_params(axis='x', rotation=45)
# One tick per integer count, from 0 up to the busiest day
ax.set_yticks(range(0, int(injuries_by_date.max()) + 1))
fig.tight_layout()
plt.show()

In [None]:
# Accident_Date is dropped from `df` in the column-dropping section, so take it
# from `df` when it is still there and otherwise re-read it from the CSV.
# Only columns have been removed from `df`, so rows still align by index.
if 'Accident_Date' in df.columns:
    accident_date_column = pd.to_datetime(df['Accident_Date'])
else:
    accident_date_column = pd.to_datetime(
        pd.read_csv(data, usecols=['Accident_Date'])['Accident_Date']
    )

# Keep only the collisions that happened on the high-injury dates
busy_day_mask = accident_date_column.isin(dates_with_more_than_14_injuries.index)
filtered_df = df.loc[busy_day_mask].assign(Accident_Date=accident_date_column[busy_day_mask])

# Select the desired columns for tabulation
selected_columns = ['Accident_Date', 'No__of_Vehicles', 'No__of_Bicycles', 'No__of_Motorcycles', 'No__of_Pedestrians']

# Tabulate the selected data
tabulated_data = filtered_df[selected_columns]

# Print the tabulated data
print(tabulated_data)

In [None]:
# Number of injury collisions for each road surface condition.
df['Injured'].groupby(df['Road_Surface_Condition']).sum()

In [None]:
# Restrict to collisions recorded on a wet road surface.
is_wet = df['Road_Surface_Condition'] == '02 - Wet'
wet_condition_df = df.loc[is_wet]

# Count the wet-surface rows without any injury and with at least one injury.
wet_injuries = wet_condition_df['No__of_Injuries']
zero_injuries_count = wet_injuries.eq(0).sum()
positive_injuries_count = wet_injuries.gt(0).sum()
print(zero_injuries_count, positive_injuries_count)

2336 452


In [None]:

# Per road surface condition: how many collisions had no injury, how many had
# at least one, and the ratio between the two.
injury_flags = pd.DataFrame({
    'Zero_Injuries_Count': (df['No__of_Injuries'] == 0).astype(int),
    'Positive_Injuries_Count': (df['No__of_Injuries'] > 0).astype(int),
})

# A single grouped sum replaces the per-condition rescans of the frame.
# dropna=False keeps rows whose condition is missing as their own group; the
# earlier `== condition` filter could never match NaN and reported 0 / 0.
# Groups come out sorted by label rather than in first-seen order.
condition_counts = injury_flags.groupby(df['Road_Surface_Condition'], dropna=False).sum()

# Leave the ratio undefined (NaN) when a condition has no injury-free
# collisions, instead of dividing by zero.
zero_counts = condition_counts['Zero_Injuries_Count']
condition_counts['Ratio'] = condition_counts['Positive_Injuries_Count'] / zero_counts.where(zero_counts > 0)

# Create a new DataFrame to store the tabular data
tabular_data = condition_counts.reset_index()

# Print the tabular data
print(tabular_data)

# Label Encoding

In [None]:
df.info()
# Encode categorical variables
label_encoders = {}
categorical_columns = ['Accident_Location', 'Classification_of_Accident', 'Initial_Impact_Type', 'Environment_Condition', 'Light',
                        'Road_Surface_Condition', 'Traffic_Control', 'Traffic_Control_Condition']

# Write the integer codes into a copy. `df` keeps its raw text labels, which
# the one-hot encoding cell below and the label comparisons in the EDA cells
# (e.g. '02 - Wet') rely on.
df_label_encoded = df.copy()
for column in categorical_columns:
    le = LabelEncoder()
    df_label_encoded[column] = le.fit_transform(df[column])
    # Keep the fitted encoder so codes can be mapped back to labels later
    label_encoders[column] = le

# One Hot Encoding

In [9]:
categorical_columns = ['Accident_Location', 'Classification_of_Accident', 'Initial_Impact_Type', 'Environment_Condition', 'Light',
                        'Road_Surface_Condition', 'Traffic_Control', 'Traffic_Control_Condition']

# Encode only the columns that are still present. After the first run they have
# been replaced by their dummy columns, and asking get_dummies for them again
# would raise KeyError, so a re-run is now a no-op.
columns_to_encode = [col for col in categorical_columns if col in df.columns]
if columns_to_encode:
    # drop_first=True omits one dummy per variable (the first category)
    df = pd.get_dummies(df, columns=columns_to_encode, drop_first=True)


# Feature Selection

In [None]:
# TODO: feature selection is not implemented yet (candidates: RFE, SelectKBest)

# Dataset Split

In [19]:
# Split the data into training and testing sets
X = df.drop(columns=['Injured','No__of_Injuries','ObjectId'])
y = df['Injured']

print(y.value_counts())
# Oversamplinh
from imblearn.over_sampling import SMOTE
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

0    13678
1     2721
Name: Injured, dtype: int64


In [27]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit and transform the data
X_scaled = scaler.fit_transform(X)

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=69
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=69
)

In [29]:
# Report the shape of every split, features before labels: train, test, validation.
for split_part in (X_train, y_train, X_test, y_test, X_val, y_val):
    print(split_part.shape)

(19149, 59)
(19149,)
(4103, 59)
(4103,)
(4104, 59)
(4104,)


# KNN Model Training

In [30]:
# Fit a k-nearest-neighbours classifier with default settings and predict both
# held-out splits.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)
knn_val_predictions = knn_model.predict(X_val)

# Evaluate model: same three metrics for the test split, then the validation split
for knn_heading, knn_truth, knn_predicted in (
    ("KNN Model:", y_test, knn_predictions),
    ("Validation Set Results:", y_val, knn_val_predictions),
):
    print(knn_heading)
    print("Accuracy:", accuracy_score(knn_truth, knn_predicted))
    print("Classification Report:\n", classification_report(knn_truth, knn_predicted))
    print("Confusion Matrix:\n", confusion_matrix(knn_truth, knn_predicted))


KNN Model:
Accuracy: 0.9992688276870583
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2061
           1       1.00      1.00      1.00      2042

    accuracy                           1.00      4103
   macro avg       1.00      1.00      1.00      4103
weighted avg       1.00      1.00      1.00      4103

Confusion Matrix:
 [[2061    0]
 [   3 2039]]
Validation Set Results:
Accuracy: 0.9975633528265108
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2058
           1       1.00      1.00      1.00      2046

    accuracy                           1.00      4104
   macro avg       1.00      1.00      1.00      4104
weighted avg       1.00      1.00      1.00      4104

Confusion Matrix:
 [[2057    1]
 [   9 2037]]


# Decision Tree Model Training

In [26]:
# Decision Tree Model: depth-limited tree with a fixed seed, predicted on both
# held-out splits.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10)
dt_model.fit(X_train, y_train)
dt_predictions = dt_model.predict(X_test)
dt_val_predictions = dt_model.predict(X_val)

# Evaluate model: same three metrics for the test split, then the validation split
for dt_heading, dt_truth, dt_predicted in (
    ("\nDecision Tree Model:", y_test, dt_predictions),
    ("Validation Set Results:", y_val, dt_val_predictions),
):
    print(dt_heading)
    print("Accuracy:", accuracy_score(dt_truth, dt_predicted))
    print("Classification Report:\n", classification_report(dt_truth, dt_predicted))
    print("Confusion Matrix:\n", confusion_matrix(dt_truth, dt_predicted))



Decision Tree Model:
Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2061
           1       1.00      1.00      1.00      2042

    accuracy                           1.00      4103
   macro avg       1.00      1.00      1.00      4103
weighted avg       1.00      1.00      1.00      4103

Confusion Matrix:
 [[2061    0]
 [   0 2042]]
Validation Set Results:
Accuracy: 0.9997563352826511
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2058
           1       1.00      1.00      1.00      2046

    accuracy                           1.00      4104
   macro avg       1.00      1.00      1.00      4104
weighted avg       1.00      1.00      1.00      4104

Confusion Matrix:
 [[2057    1]
 [   0 2046]]


In [17]:
# Print the summary (columns, non-null counts, dtypes) of the feature matrix X.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27356 entries, 0 to 27355
Data columns (total 60 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   No__of_Vehicles                                    27356 non-null  int64  
 1   No__of_Bicycles                                    27356 non-null  int64  
 2   No__of_Motorcycles                                 27356 non-null  int64  
 3   No__of_Pedestrians                                 27356 non-null  int64  
 4   X                                                  27356 non-null  float64
 5   Y                                                  27356 non-null  float64
 6   Latitude                                           27356 non-null  float64
 7   Longitude                                          27356 non-null  float64
 8   ObjectId                                           27356 non-null  int64  
 9   Accide