In [None]:
pip install catboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
from google.colab import drive

In [None]:
# Location of neiss_2022.csv inside the mounted Google Drive.
path = "/content/drive/MyDrive/Capstone/neiss_2022.csv"
# The Drive has to be mounted before anything under /content/drive is readable.
drive.mount('/content/drive')
neiss = pd.read_csv(filepath_or_buffer=path)

Mounted at /content/drive


## CatBoost

### Diagnosis

In [None]:
# Target: the 'Diagnosis' column.
y_diag_lgb = neiss['Diagnosis']
# Features: every column of `neiss` except the target and the columns listed here.
X_diag_lgb = neiss.drop(
    columns=[
        'CPSC_Case_Number', 'Other_Race', 'Diagnosis', 'Other_Diagnosis',
        'Body_Part_2', 'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum',
        'Narrative_1', 'Treatment_Date',
    ],
)

In [None]:
# Hold out 20% of the rows as the diagnosis test set; the fixed seed keeps the
# split repeatable across runs.
(
    X_train_diag_lgb,
    X_test_diag_lgb,
    y_train_diag_lgb,
    y_test_diag_lgb,
) = train_test_split(
    X_diag_lgb,
    y_diag_lgb,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline CatBoost classifier for the diagnosis target (training log silenced).
catboost_model_diag = CatBoostClassifier(
    depth=5,
    iterations=100,
    learning_rate=0.1,
    verbose=0,
)

In [None]:
# Train the baseline diagnosis model on the training split.
catboost_model_diag.fit(X=X_train_diag_lgb, y=y_train_diag_lgb)

<catboost.core.CatBoostClassifier at 0x7f25fdb85570>

In [None]:
# Score the baseline diagnosis model on the held-out split.
y_pred_diag_cat = catboost_model_diag.predict(X_test_diag_lgb)
accuracy_diag_cat = accuracy_score(
    y_true=y_test_diag_lgb,
    y_pred=y_pred_diag_cat,
)
print(f'CatBoost Model Accuracy: {accuracy_diag_cat}')

CatBoost Model Accuracy: 0.5464132737478544


### Hyperparameter Tuning

In [None]:
# Search space sampled by the randomized hyperparameter search.
catboost_param_grid = dict(
    learning_rate=np.linspace(0.01, 0.2, num=10),  # 10 evenly spaced values in [0.01, 0.2]
    depth=list(range(4, 11, 2)),                   # 4, 6, 8, 10
    l2_leaf_reg=list(range(1, 10, 2)),             # 1, 3, 5, 7, 9
)

In [None]:
# Randomized search over `catboost_param_grid` with 3-fold cross-validation.
# - iterations=100 matches the iteration budget of the CatBoost models that are
#   later built from the search result, so learning_rate / depth / l2_leaf_reg
#   are tuned for the configuration they are actually plugged into (left unset,
#   CatBoost trains its much larger default number of iterations per fit).
# - verbose=0 silences CatBoost's per-iteration training log for every fit.
# - scoring='accuracy' makes the selection metric explicit.
# - random_state fixes which 10 candidates are sampled, so the search is repeatable.
catboost_random_search = RandomizedSearchCV(
    CatBoostClassifier(iterations=100, verbose=0),
    catboost_param_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Time the randomized search. perf_counter() is a monotonic, high-resolution
# clock meant for measuring elapsed intervals, so the reported duration cannot
# be skewed by system clock adjustments during the long-running fit.
start_time = time.perf_counter()
catboost_random_search.fit(X_train_diag_lgb, y_train_diag_lgb)
print('Fit time : ', time.perf_counter() - start_time)

Fitting 3 folds for each of 10 candidates, totalling 30 fits




0:	learn: 2.4648521	total: 2.38s	remaining: 39m 40s
1:	learn: 2.2179171	total: 4.87s	remaining: 40m 31s
2:	learn: 2.0810451	total: 7.51s	remaining: 41m 35s
3:	learn: 1.9769448	total: 9.08s	remaining: 37m 40s
4:	learn: 1.9008774	total: 10.5s	remaining: 34m 52s
5:	learn: 1.8416932	total: 12s	remaining: 33m 3s
6:	learn: 1.7869711	total: 13.4s	remaining: 31m 40s
7:	learn: 1.7367359	total: 14.9s	remaining: 30m 41s
8:	learn: 1.6966546	total: 16.3s	remaining: 29m 55s
9:	learn: 1.6694193	total: 17.8s	remaining: 29m 19s
10:	learn: 1.6417362	total: 20.2s	remaining: 30m 16s
11:	learn: 1.6210571	total: 22.6s	remaining: 30m 58s
12:	learn: 1.6010175	total: 25s	remaining: 31m 34s
13:	learn: 1.5848683	total: 26.5s	remaining: 31m 6s
14:	learn: 1.5664666	total: 27.9s	remaining: 30m 33s
15:	learn: 1.5531168	total: 29.4s	remaining: 30m 6s
16:	learn: 1.5426969	total: 30.8s	remaining: 29m 41s
17:	learn: 1.5282077	total: 32.3s	remaining: 29m 20s
18:	learn: 1.5157012	total: 33.7s	remaining: 29m
19:	learn: 1.5

In [None]:
# Report the winning hyperparameter combination together with its score;
# best_score_ is the mean cross-validated score of that combination.
for _label, _value in (
    ("Best parameters found: ", catboost_random_search.best_params_),
    ("Accuracy of the best model: ", catboost_random_search.best_score_),
):
    print(_label, _value)

Best parameters found:  {'learning_rate': 0.1577777777777778, 'l2_leaf_reg': 7, 'depth': 6}
Accuracy of the best model:  0.594400679202144


### Feature Importance

In [None]:
# Target: the 'Diagnosis' column.
y_diag_fi = neiss['Diagnosis']
# Reduced feature set: the same exclusions as the first diagnosis model plus
# 'Drug', 'Product_3', 'Alcohol' and 'Fire_Involvement'.
X_diag_fi = neiss.drop(
    columns=[
        'CPSC_Case_Number', 'Other_Race', 'Diagnosis', 'Other_Diagnosis',
        'Body_Part_2', 'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum',
        'Narrative_1', 'Treatment_Date', 'Drug', 'Product_3', 'Alcohol',
        'Fire_Involvement',
    ],
)

In [None]:
# Hold out 20% of the reduced-feature diagnosis data for testing, using the
# same seed as the other splits.
(
    X_train_diag_fi,
    X_test_diag_fi,
    y_train_diag_fi,
    y_test_diag_fi,
) = train_test_split(
    X_diag_fi,
    y_diag_fi,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Diagnosis model configured with the hyperparameters reported by the
# randomized search (learning_rate, depth, l2_leaf_reg), hard-coded here.
catboost_model_diag_tuned = CatBoostClassifier(
    iterations=100,
    depth=6,
    l2_leaf_reg=7,
    learning_rate=0.1577777777777778,
    verbose=1,
)

In [None]:
# Train the tuned diagnosis model on the reduced-feature training split.
catboost_model_diag_tuned.fit(X=X_train_diag_fi, y=y_train_diag_fi)

0:	learn: 2.4676559	total: 1.55s	remaining: 2m 33s
1:	learn: 2.2104408	total: 2.76s	remaining: 2m 15s
2:	learn: 2.0779411	total: 3.95s	remaining: 2m 7s
3:	learn: 1.9823331	total: 5.16s	remaining: 2m 3s
4:	learn: 1.8977466	total: 6.38s	remaining: 2m 1s
5:	learn: 1.8244909	total: 7.58s	remaining: 1m 58s
6:	learn: 1.7742019	total: 8.79s	remaining: 1m 56s
7:	learn: 1.7336733	total: 10.4s	remaining: 1m 59s
8:	learn: 1.6986056	total: 12.5s	remaining: 2m 6s
9:	learn: 1.6686489	total: 14.5s	remaining: 2m 10s
10:	learn: 1.6385030	total: 16.4s	remaining: 2m 12s
11:	learn: 1.6161127	total: 17.7s	remaining: 2m 9s
12:	learn: 1.5958062	total: 18.9s	remaining: 2m 6s
13:	learn: 1.5806846	total: 20.1s	remaining: 2m 3s
14:	learn: 1.5634788	total: 21.3s	remaining: 2m
15:	learn: 1.5462572	total: 22.5s	remaining: 1m 57s
16:	learn: 1.5354413	total: 23.6s	remaining: 1m 55s
17:	learn: 1.5258833	total: 24.8s	remaining: 1m 53s
18:	learn: 1.5184532	total: 26s	remaining: 1m 50s
19:	learn: 1.5092684	total: 27.6s	r

<catboost.core.CatBoostClassifier at 0x7a3a701af9a0>

In [None]:
# Score the tuned diagnosis model on the reduced-feature held-out split.
y_pred_diag_cat_tuned_fi = catboost_model_diag_tuned.predict(X_test_diag_fi)
accuracy_diag_cat_tuned_fi = accuracy_score(
    y_true=y_test_diag_fi,
    y_pred=y_pred_diag_cat_tuned_fi,
)
print(f'CatBoost Model Accuracy: {accuracy_diag_cat_tuned_fi}')

CatBoost Model Accuracy: 0.5666393480647606


### Body Part

In [None]:
# Target: the 'Body_Part' column.
y_body_lgb = neiss['Body_Part']
# Features: every column of `neiss` except the target and the columns listed
# here ('Diagnosis' is kept as a feature for this model).
X_body_lgb = neiss.drop(
    columns=[
        'CPSC_Case_Number', 'Other_Race', 'Body_Part', 'Other_Diagnosis',
        'Body_Part_2', 'Diagnosis_2', 'Other_Diagnosis_2', 'PSU', 'Stratum',
        'Narrative_1', 'Treatment_Date',
    ],
)

In [None]:
# Hold out 20% of the rows as the body-part test set; the fixed seed keeps the
# split repeatable across runs.
(
    X_train_body_lgb,
    X_test_body_lgb,
    y_train_body_lgb,
    y_test_body_lgb,
) = train_test_split(
    X_body_lgb,
    y_body_lgb,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline CatBoost classifier for the body-part target (training log silenced).
catboost_model_body = CatBoostClassifier(
    depth=5,
    iterations=100,
    learning_rate=0.1,
    verbose=0,
)

In [None]:
# Train the baseline body-part model on the training split.
catboost_model_body.fit(X=X_train_body_lgb, y=y_train_body_lgb)

<catboost.core.CatBoostClassifier at 0x7f25d30f3df0>

In [None]:
# Score the baseline body-part model on the held-out split.
y_pred_body_cat = catboost_model_body.predict(X_test_body_lgb)
accuracy_body_cat = accuracy_score(
    y_true=y_test_body_lgb,
    y_pred=y_pred_body_cat,
)
print(f'CatBoost Model Accuracy: {accuracy_body_cat}')

CatBoost Model Accuracy: 0.43888107130155096


### Tuned Parameters

In [None]:
# Body-part model configured with the same hyperparameters as the tuned
# diagnosis model. NOTE(review): the randomized search was fitted on the
# diagnosis training data, so these values were not tuned for this target.
catboost_model_body_tuned = CatBoostClassifier(
    iterations=100,
    depth=6,
    l2_leaf_reg=7,
    learning_rate=0.1577777777777778,
    verbose=1,
)

In [None]:
# Train the tuned body-part model on the training split.
catboost_model_body_tuned.fit(X=X_train_body_lgb, y=y_train_body_lgb)

0:	learn: 2.7033284	total: 2.28s	remaining: 3m 45s
1:	learn: 2.5549996	total: 4.42s	remaining: 3m 36s
2:	learn: 2.4648256	total: 5.68s	remaining: 3m 3s
3:	learn: 2.3975513	total: 6.92s	remaining: 2m 45s
4:	learn: 2.3428039	total: 8.16s	remaining: 2m 35s
5:	learn: 2.2939388	total: 9.44s	remaining: 2m 27s
6:	learn: 2.2500319	total: 10.7s	remaining: 2m 22s
7:	learn: 2.2280539	total: 12s	remaining: 2m 18s
8:	learn: 2.2020274	total: 13.3s	remaining: 2m 14s
9:	learn: 2.1699507	total: 14.6s	remaining: 2m 11s
10:	learn: 2.1432910	total: 16.9s	remaining: 2m 16s
11:	learn: 2.1255085	total: 19.1s	remaining: 2m 20s
12:	learn: 2.1091406	total: 21.3s	remaining: 2m 22s
13:	learn: 2.0910881	total: 22.6s	remaining: 2m 18s
14:	learn: 2.0827670	total: 23.8s	remaining: 2m 14s
15:	learn: 2.0646630	total: 25.1s	remaining: 2m 11s
16:	learn: 2.0558336	total: 26.3s	remaining: 2m 8s
17:	learn: 2.0479675	total: 27.6s	remaining: 2m 5s
18:	learn: 2.0349226	total: 28.9s	remaining: 2m 3s
19:	learn: 2.0230209	total: 

<catboost.core.CatBoostClassifier at 0x7a3a9af50fa0>

In [None]:
# Score the tuned body-part model on the held-out split.
y_pred_body_cattuned = catboost_model_body_tuned.predict(X_test_body_lgb)
accuracy_body_cattuned = accuracy_score(
    y_true=y_test_body_lgb,
    y_pred=y_pred_body_cattuned,
)
print(f'CatBoost Model Accuracy: {accuracy_body_cattuned}')

CatBoost Model Accuracy: 0.45221048725046004
