In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from google.colab import drive

In [None]:
# Make the Capstone folder on Google Drive visible to this runtime.
drive.mount('/content/drive')

# Load the NEISS CSV export into a DataFrame.
path = "/content/drive/MyDrive/Capstone/neiss_2022.csv"
neiss = pd.read_csv(filepath_or_buffer=path)

Mounted at /content/drive


In [None]:
# Columns removed from both feature matrices; each matrix additionally drops
# its own target column.
# NOTE(review): `Weight` is kept as a predictor. If it is the survey sampling
# weight (PSU and Stratum are dropped here), it may not belong among the
# features -- confirm.
shared_drop = [
    'CPSC_Case_Number', 'Other_Race', 'Other_Diagnosis', 'Body_Part_2',
    'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum', 'Narrative_1',
    'Treatment_Date',
]

# Targets.
y_diag = neiss['Diagnosis']
y_body = neiss['Body_Part']

# Features: the diagnosis model may see Body_Part and the body-part model may
# see Diagnosis, but neither sees its own target.
X_diag = neiss.drop(columns=shared_drop + ['Diagnosis'])
X_body = neiss.drop(columns=shared_drop + ['Body_Part'])

In [None]:
# Hold out 20% of the rows for evaluating the `body_part` models; the fixed
# seed makes the partition repeatable.
body_split = train_test_split(X_body, y_body, test_size=0.2, random_state=42)
X_train_body, X_test_body, y_train_body, y_test_body = body_split

In [None]:
# Hold out 20% of the rows for evaluating the `diagnosis` models; the fixed
# seed makes the partition repeatable.
diag_split = train_test_split(X_diag, y_diag, test_size=0.2, random_state=42)
X_train_diag, X_test_diag, y_train_diag, y_test_diag = diag_split

## Body Part - Random Forest

### Initial Model

In [None]:
# Baseline Random Forest for `body_part` with library-default hyperparameters.
# n_jobs=-1 builds the trees on all available cores; with random_state fixed
# the fitted forest is the same as the single-core one, it just trains faster.
rf_body_part = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_body_part.fit(X_train_body, y_train_body)

In [None]:
# Score the baseline `body_part` forest on the held-out rows.
rf_pred_body = rf_body_part.predict(X_test_body)
print(accuracy_score(y_test_body, rf_pred_body))
# zero_division=0 reports 0.0 for labels that are never predicted (the same
# number the default "warn" setting prints) without emitting the repeated
# UndefinedMetricWarning.
print(classification_report(y_test_body, rf_pred_body, zero_division=0))

0.41533037467720235
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       859
          30       0.19      0.17      0.18      2394
          31       0.19      0.21      0.20      3718
          32       0.23      0.18      0.20      1478
          33       0.18      0.15      0.17      2102
          34       0.16      0.14      0.15      2047
          35       0.18      0.17      0.17      3296
          36       0.14      0.10      0.12      2322
          37       0.33      0.40      0.36      3206
          38       0.12      0.06      0.08       375
          75       0.82      0.84      0.83     12280
          76       0.37      0.49      0.42      5704
          77       0.38      0.32      0.34       856
          79       0.30      0.35      0.32      4892
          80       0.12      0.09      0.10       703
          81       0.10      0.06      0.07       816
          82       0.26      0.22      0.23      2559
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Body Part - Feature Importances

In [None]:
# Per-feature importance scores exposed by the fitted baseline `body_part` forest.
importances = rf_body_part.feature_importances_

# Matching column labels, in training-frame order, as a NumPy array.
feature_names = X_train_body.columns.to_numpy(copy=True)

In [None]:
# Pair each feature name with its importance score and order the table from
# most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Print the `body_part` importance table (already sorted highest first).
print(feature_importance_df)

             Feature  Importance
0                Age    0.235984
4          Diagnosis    0.233073
8          Product_1    0.207853
13            Weight    0.117807
2               Race    0.051608
6           Location    0.040607
9          Product_2    0.034299
3           Hispanic    0.030464
1                Sex    0.017942
5        Disposition    0.014625
12              Drug    0.006290
10         Product_3    0.004365
11           Alcohol    0.003003
7   Fire_Involvement    0.002079


In [None]:
# Rebuild the `body_part` feature matrix without the features whose importance
# was below 0.01: Drug, Product_3, Alcohol and Fire_Involvement.
# NOTE: this rebinds X_body, so the wider feature matrix built earlier is no
# longer reachable under that name once this cell has run.
low_importance = ['Drug', 'Product_3', 'Alcohol', 'Fire_Involvement']
X_body = neiss.drop(
    columns=[
        'CPSC_Case_Number', 'Other_Race', 'Body_Part', 'Other_Diagnosis',
        'Body_Part_2', 'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum',
        'Narrative_1', 'Treatment_Date',
    ] + low_importance
)
y_body = neiss['Body_Part']

In [None]:
# Re-split for `body_part` using the reduced feature matrix (same 80/20 ratio
# and seed as before); this replaces the earlier *_body train/test variables.
body_split = train_test_split(X_body, y_body, test_size=0.2, random_state=42)
X_train_body, X_test_body, y_train_body, y_test_body = body_split

In [None]:
# Split for `diagnosis` (80/20, seeded).
# NOTE(review): X_diag and y_diag have not been modified since the first
# diagnosis split, so with the same seed this recreates the same partition --
# the cell looks redundant.
diag_split = train_test_split(X_diag, y_diag, test_size=0.2, random_state=42)
X_train_diag, X_test_diag, y_train_diag, y_test_diag = diag_split

### Body Part - Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter values explored by the grid search: 3 * 4 * 3 * 3 = 108
# combinations.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Unfitted, seeded base forest; it is the `estimator` passed to the
# GridSearchCV objects built in the following cells.
rf_b_gs = RandomForestClassifier(random_state=42)

In [None]:
# 3-fold cross-validated search over `param_grid`, ranked by accuracy and run
# on all available cores.
grid_search_fi = GridSearchCV(
    estimator=rf_b_gs,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the `body_part` training split
# (108 candidates x 3 folds = 324 fits, so this cell is slow).
grid_search_fi.fit(X_train_body, y_train_body)

Fitting 3 folds for each of 108 candidates, totalling 324 fits




In [None]:
# Hyperparameter combination with the best mean cross-validated accuracy
# in the `grid_search_fi` run.
best_params_fi = grid_search_fi.best_params_
print("Best parameters found: ", best_params_fi)

Best parameters found:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}


In [None]:
# Estimator refit by `grid_search_fi` with the winning hyperparameters
# (GridSearchCV refits on the whole training split by default).
best_model_fi = grid_search_fi.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Predict `body_part` for the held-out rows with the tuned forest.
y_pred_body_fi = best_model_fi.predict(X_test_body)

In [None]:
# Share of held-out rows whose `body_part` the tuned forest predicted correctly.
accuracy_body = accuracy_score(y_test_body, y_pred_body_fi)
print('Body Part Prediction Accuracy:', accuracy_body)

Body Part Prediction Accuracy: 0.4649059054570196


### Best Params with Feature Importance

In [None]:
# Second grid search for `body_part`.
# NOTE(review): estimator, grid, CV folds and scoring are identical to
# `grid_search_fi`, and it is fitted on the same X_train_body / y_train_body,
# so a clean top-to-bottom run repeats the same 324 fits. The recorded
# accuracies differ (0.4649 vs 0.4546), which suggests the two searches saw
# different feature sets in the original session -- confirm which feature set
# each one was meant to use.
grid_search = GridSearchCV(
    estimator=rf_b_gs,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the `body_part` training split
# (108 candidates x 3 folds = 324 fits, so this cell is slow).
grid_search.fit(X_train_body, y_train_body)

Fitting 3 folds for each of 108 candidates, totalling 324 fits




In [None]:
# Hyperparameter combination with the best mean cross-validated accuracy
# in the `grid_search` run; this dict is what gets saved to Drive later.
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

Best parameters found:  {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 300}


In [None]:
# Estimator refit by `grid_search` with the winning hyperparameters
# (GridSearchCV refits on the whole training split by default).
best_model = grid_search.best_estimator_

In [None]:
# Predict `body_part` for the held-out rows with the model from `grid_search`.
y_pred_body = best_model.predict(X_test_body)

In [None]:
# Held-out accuracy of the `grid_search` model.
# NOTE: this overwrites the `accuracy_body` value computed earlier for the
# `grid_search_fi` model.
accuracy_body = accuracy_score(y_test_body, y_pred_body)
print('Body Part Prediction Accuracy:', accuracy_body)

Body Part Prediction Accuracy: 0.45462277134330203


### Pickle the models

In [None]:
# Persist only the winning hyperparameter dict (not a fitted model) to Drive.
from joblib import dump
dump(best_params, '/content/drive/MyDrive/Capstone/grid_search_rf_best_params.pkl')

['/content/drive/MyDrive/Capstone/grid_search_rf_best_params.pkl']

In [None]:
# Persist the whole fitted GridSearchCV object for `body_part` to Drive so it
# can be reloaded later without repeating the search.
from joblib import dump
dump(grid_search, '/content/drive/MyDrive/Capstone/grid_search_rf_body.pkl')

['/content/drive/MyDrive/Capstone/grid_search_rf_body.pkl']

## Diagnosis - Random Forest

### Initial Model

In [None]:
# Feature matrix for the diagnosis models: everything except the target and
# the excluded columns listed below. Body_Part stays in as a predictor.
X_diag = neiss.drop(
    columns=[
        'CPSC_Case_Number', 'Other_Race', 'Diagnosis', 'Other_Diagnosis',
        'Body_Part_2', 'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum',
        'Narrative_1', 'Treatment_Date',
    ]
)

In [None]:
# Target vector for the diagnosis models.
y_diag = neiss['Diagnosis']

In [None]:
# Hold out 20% of the rows for evaluating the `diagnosis` models; the fixed
# seed makes the partition repeatable.
diag_split = train_test_split(X_diag, y_diag, test_size=0.2, random_state=42)
X_train_diag, X_test_diag, y_train_diag, y_test_diag = diag_split

In [None]:
# Baseline Random Forest for `diagnosis` with library-default hyperparameters.
# n_jobs=-1 builds the trees on all available cores; with random_state fixed
# the fitted forest is the same as the single-core one, it just trains faster.
rf_diag = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_diag.fit(X_train_diag, y_train_diag)

In [None]:
# Score the baseline `diagnosis` forest on the held-out rows.
rf_pred_diag = rf_diag.predict(X_test_diag)
print(accuracy_score(y_test_diag, rf_pred_diag))
# zero_division=0 reports 0.0 for labels that are never predicted (the same
# number the default "warn" setting prints) without emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test_diag, rf_pred_diag, zero_division=0))

0.542037142989686
              precision    recall  f1-score   support

          41       0.93      0.99      0.96       794
          42       0.60      0.09      0.16        65
          46       0.67      0.12      0.20        17
          47       0.00      0.00      0.00        22
          48       0.74      0.42      0.53       427
          49       0.50      0.14      0.22        91
          50       0.26      0.13      0.17       142
          51       0.73      0.52      0.60       481
          52       0.39      0.23      0.29      1307
          53       0.30      0.22      0.26      7022
          54       0.12      0.08      0.10       116
          55       0.37      0.23      0.29      1024
          56       0.72      0.58      0.64      1442
          57       0.52      0.59      0.56     10663
          58       0.11      0.03      0.05       896
          59       0.57      0.64      0.60     10174
          60       0.44      0.23      0.30       345
         

### Diagnosis - Feature Importances

In [None]:
# Per-feature importance scores exposed by the fitted baseline `diagnosis` forest.
importances_diag = rf_diag.feature_importances_

# Matching column labels, in training-frame order, as a NumPy array.
feature_names_diag = X_train_diag.columns.to_numpy(copy=True)

In [None]:
# Pair each feature name with its importance score and order the table from
# most to least important.
feature_importance_df_diag = (
    pd.DataFrame({'Feature': feature_names_diag, 'Importance': importances_diag})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
# Print the `diagnosis` importance table (already sorted highest first).
print(feature_importance_df_diag)

             Feature  Importance
4          Body_Part    0.272819
0                Age    0.216954
8          Product_1    0.207762
13            Weight    0.103233
6           Location    0.042431
2               Race    0.038745
9          Product_2    0.034072
3           Hispanic    0.026922
5        Disposition    0.021735
1                Sex    0.015290
12              Drug    0.007515
10         Product_3    0.004452
7   Fire_Involvement    0.004309
11           Alcohol    0.003760


In [None]:
# Diagnosis feature matrix without the features whose importance was below
# 0.01: Drug, Product_3, Alcohol and Fire_Involvement.
low_importance_diag = ['Drug', 'Product_3', 'Alcohol', 'Fire_Involvement']
X_diag_fi = neiss.drop(
    columns=[
        'CPSC_Case_Number', 'Other_Race', 'Diagnosis', 'Other_Diagnosis',
        'Body_Part_2', 'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum',
        'Narrative_1', 'Treatment_Date',
    ] + low_importance_diag
)
y_diag_fi = neiss['Diagnosis']

In [None]:
# 80/20 seeded split of the reduced `diagnosis` feature matrix.
diag_fi_split = train_test_split(X_diag_fi, y_diag_fi, test_size=0.2, random_state=42)
X_train_diag_fi, X_test_diag_fi, y_train_diag_fi, y_test_diag_fi = diag_fi_split

In [None]:
# Default-hyperparameter forest for the reduced `diagnosis` feature set.
# Despite the `_gs` suffix it is fitted directly in the next cell rather than
# through a grid search, so n_jobs=-1 is safe here: it only parallelises tree
# building and, with random_state fixed, leaves the fitted model unchanged.
rf_d_gs = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
# Fit the forest on the reduced (importance-filtered) `diagnosis` training split.
rf_d_gs.fit(X_train_diag_fi, y_train_diag_fi)

In [None]:
# Score the reduced-feature `diagnosis` forest on the held-out rows.
rf_pred_diag_fi = rf_d_gs.predict(X_test_diag_fi)
print(accuracy_score(y_test_diag_fi, rf_pred_diag_fi))
# zero_division=0 reports 0.0 for labels that are never predicted (the same
# number the default "warn" setting prints) without emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test_diag_fi, rf_pred_diag_fi, zero_division=0))

0.545872056162922
              precision    recall  f1-score   support

          41       0.93      0.99      0.96       794
          42       0.64      0.14      0.23        65
          46       0.67      0.12      0.20        17
          47       0.00      0.00      0.00        22
          48       0.75      0.49      0.59       427
          49       0.52      0.18      0.26        91
          50       0.30      0.14      0.19       142
          51       0.56      0.30      0.39       481
          52       0.39      0.22      0.28      1307
          53       0.31      0.24      0.27      7022
          54       0.14      0.08      0.10       116
          55       0.38      0.27      0.31      1024
          56       0.72      0.60      0.66      1442
          57       0.53      0.60      0.56     10663
          58       0.09      0.03      0.04       896
          59       0.59      0.65      0.62     10174
          60       0.42      0.28      0.34       345
         

### Diagnosis - Best Parameters

In [None]:
# Diagnosis forest with explicit hyperparameters.
# NOTE(review): these values equal the best parameters printed by the
# `body_part` grid searches; no search over Diagnosis is visible in this
# notebook, so they may not be the best choice for this target -- confirm or
# tune separately.
# n_jobs=-1 only parallelises tree building; with random_state fixed the
# fitted model is unchanged.
rf_d_bestparams = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the hand-configured forest on the reduced `diagnosis` training split.
rf_d_bestparams.fit(X_train_diag_fi, y_train_diag_fi)

In [None]:
# Score the hand-configured `diagnosis` forest on the held-out rows.
rf_pred_diag_bp = rf_d_bestparams.predict(X_test_diag_fi)
print(accuracy_score(y_test_diag_fi, rf_pred_diag_bp))
# zero_division=0 reports 0.0 for labels that are never predicted (the same
# number the default "warn" setting prints) without emitting the repeated
# UndefinedMetricWarning.
print(classification_report(y_test_diag_fi, rf_pred_diag_bp, zero_division=0))

0.5753761462215281
              precision    recall  f1-score   support

          41       0.93      1.00      0.96       794
          42       0.57      0.06      0.11        65
          46       0.00      0.00      0.00        17
          47       0.00      0.00      0.00        22
          48       0.80      0.45      0.58       427
          49       0.56      0.10      0.17        91
          50       0.49      0.13      0.20       142
          51       0.67      0.25      0.36       481
          52       0.53      0.18      0.27      1307
          53       0.39      0.18      0.25      7022
          54       0.33      0.04      0.08       116
          55       0.59      0.19      0.29      1024
          56       0.78      0.58      0.66      1442
          57       0.55      0.67      0.60     10663
          58       0.47      0.01      0.02       896
          59       0.59      0.69      0.64     10174
          60       0.53      0.16      0.24       345
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
