In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('model_data.csv')
data.head()

Unnamed: 0,accident_reference,vehicle_reference,casualty_reference,casualty_class,sex_of_casualty,age_of_casualty,age_band_of_casualty,casualty_severity,casualty_type,casualty_home_area_type,casualty_imd_decile
0,70151244,2,1,Driver/Rider,Female,46.0,46 - 55,Slight,Car occupant,Urban area,9
1,70152668,1,1,Driver/Rider,Male,30.0,26 - 35,Slight,Car occupant,Urban area,2
2,70154696,1,1,Driver/Rider,Female,58.0,56 - 65,Slight,Car occupant,Urban area,10
3,70154696,2,3,Driver/Rider,Female,78.0,Over 75,Slight,Car occupant,Small town,10
4,70154696,3,2,Driver/Rider,Male,63.0,56 - 65,Slight,Car occupant,Rural,7


In [3]:
X = data.drop(columns=['accident_reference', 'vehicle_reference', 'casualty_reference',
               'casualty_severity'])

y = data[['casualty_severity']]

In [4]:
X = pd.get_dummies(X)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234)

In [6]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [7]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       Fatal       0.01      0.01      0.01       151
     Serious       0.28      0.22      0.25      2566
      Slight       0.82      0.86      0.84     10746

    accuracy                           0.73     13463
   macro avg       0.37      0.36      0.36     13463
weighted avg       0.71      0.73      0.72     13463



In [8]:
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier()
rfc_model.fit(X_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier()

In [9]:
print(classification_report(y_test, rfc_model.predict(X_test)))

              precision    recall  f1-score   support

       Fatal       0.00      0.00      0.00       151
     Serious       0.31      0.16      0.21      2566
      Slight       0.81      0.91      0.86     10746

    accuracy                           0.76     13463
   macro avg       0.37      0.36      0.36     13463
weighted avg       0.71      0.76      0.73     13463



In [12]:
! pip install imbalanced-learn



In [13]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)


In [14]:
dtc_model_resampled = DecisionTreeClassifier()
dtc_model_resampled.fit(X_resampled, y_resampled)
print(classification_report(y_test, dtc_model_resampled.predict(X_test)))

              precision    recall  f1-score   support

       Fatal       0.01      0.05      0.02       151
     Serious       0.23      0.36      0.28      2566
      Slight       0.82      0.65      0.73     10746

    accuracy                           0.59     13463
   macro avg       0.35      0.36      0.34     13463
weighted avg       0.70      0.59      0.63     13463



In [15]:
print(classification_report(y_train, dtc_model_resampled.predict(X_train)))

              precision    recall  f1-score   support

       Fatal       0.21      1.00      0.35       489
     Serious       0.47      0.81      0.60      7722
      Slight       0.95      0.74      0.83     32178

    accuracy                           0.75     40389
   macro avg       0.55      0.85      0.59     40389
weighted avg       0.85      0.75      0.78     40389



In [16]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_resampled, y_resampled)
print(classification_report(y_test, svc_model.predict(X_test)))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

       Fatal       0.02      0.41      0.05       151
     Serious       0.22      0.39      0.28      2566
      Slight       0.87      0.53      0.66     10746

    accuracy                           0.50     13463
   macro avg       0.37      0.44      0.33     13463
weighted avg       0.74      0.50      0.58     13463



In [17]:
print(classification_report(y_train, svc_model.predict(X_train)))

KeyboardInterrupt: 

In [19]:
import pickle
data_to_save = {
    'X': X,
    'y': y,
    'X_resampled': X_resampled,
    'y_resampled': y_resampled,
    'model': model,
    'rfc_model': rfc_model,
    # 'ros': ros,
    'dtc_model_resampled': dtc_model_resampled,
    'svc_model': svc_model,
}
with open('modeling_data.pkl', 'wb') as f:
    pickle.dump(data_to_save, f)

In [21]:
import joblib
joblib.dump(ros, 'over_sampler_model.pkl')

SyntaxError: invalid syntax (_metadata_requests.py, line 1492)

In [22]:
! pip install dill

Collecting dill
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m379.4 kB/s[0m eta [36m0:00:00[0m1m490.2 kB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.7


In [23]:
import dill

with open('oversampler.dill', 'wb') as file:
    dill.dump(ros, file)

SyntaxError: invalid syntax (_metadata_requests.py, line 1492)

In [25]:
! python --version

Python 3.7.0
