In [1]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
event_df = pd.read_csv("/Users/armaansidhu/Documents/Uni/year-2/EODP/code/proj_2/code/event_df.csv")

# ---------------------------
# Step 1: Handle missing values
# ---------------------------
event_df['DEG_URBAN_NAME'].fillna('Unknown', inplace=True)
event_df['ROAD_TYPE'].fillna('Unknown', inplace=True)

# ---------------------------
# Step 2: Convert string lists back to Python lists
# ---------------------------
event_df["ATMOSPH_COND"] = event_df["ATMOSPH_COND"].apply(ast.literal_eval)
event_df["SURFACE_COND"] = event_df["SURFACE_COND"].apply(ast.literal_eval)

# ---------------------------
# Step 3: Multi-hot encode list-based columns
# ---------------------------
for val in sorted({v for sub in event_df["ATMOSPH_COND"] for v in sub}):
    event_df[f"ATMOSPH_COND_{val}"] = event_df["ATMOSPH_COND"].apply(lambda x: int(val in x))

for val in sorted({v for sub in event_df["SURFACE_COND"] for v in sub}):
    event_df[f"SURFACE_COND_{val}"] = event_df["SURFACE_COND"].apply(lambda x: int(val in x))

# Drop unnecessary columns
event_df.drop(columns=["ATMOSPH_COND", "SURFACE_COND", "ACCIDENT_NO", "ACCIDENT_TIME", "POSTCODE_CRASH"], inplace=True)

# ---------------------------
# Step 4: One-hot encode categorical columns
# ---------------------------
categorical_cols = ["NODE_TYPE", "ROAD_TYPE", "DEG_URBAN_NAME"]
event_df = pd.get_dummies(event_df, columns=categorical_cols, drop_first=True)

# ---------------------------
# Step 5: Define features (X) and target (y)
# ---------------------------
X = event_df.drop(columns=["SEVERITY"])
y = event_df["SEVERITY"]

# ---------------------------
# Step 6: Feature scaling
# ---------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# Step 7: Train-test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# ---------------------------
# Step 8: Train Decision Tree Classifier
# ---------------------------
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# ---------------------------
# Step 9: Make predictions and evaluate the model
# ---------------------------
y_pred = clf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)

# Classification report
target_names = [str(i) for i in y.unique()]  # Dynamic target names to handle all classes
report = classification_report(y_test, y_pred, target_names=target_names)

# Output
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  event_df['DEG_URBAN_NAME'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  event_df['ROAD_TYPE'].fillna('Unknown', inplace=True)


Accuracy: 0.5823022227198925

Classification Report:
               precision    recall  f1-score   support

           3       0.03      0.01      0.01       589
           2       0.41      0.30      0.35     12867
           1       0.65      0.76      0.70     22265
           4       0.00      0.00      0.00         1

    accuracy                           0.58     35722
   macro avg       0.27      0.27      0.27     35722
weighted avg       0.55      0.58      0.56     35722



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
