In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time
from google.colab import drive

In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive are readable.
drive.mount('/content/drive')
# Location of the NEISS CSV on the mounted drive (file name suggests 2022 data — confirm).
path = "/content/drive/MyDrive/Capstone/neiss_2022.csv"
# Load the whole CSV into a DataFrame; every later cell in this section reads from `neiss`.
neiss = pd.read_csv(path)

Mounted at /content/drive


## LightGBM

### Diagnosis

In [None]:
# Target for this experiment: the `Diagnosis` column.
y_diag_lgb = neiss['Diagnosis']

# Feature matrix: every column of `neiss` except the target and the
# columns excluded below.
X_diag_lgb = neiss.drop(
    columns=[
        'CPSC_Case_Number',
        'Other_Race',
        'Diagnosis',
        'Other_Diagnosis',
        'Body_Part_2',
        'Diagnosis_2',
        'Other_Diagnosis_2',
        'PSU',
        'Stratum',
        'Narrative_1',
        'Treatment_Date',
    ]
)

In [None]:
# Hold out 20% of the rows as a test set for the `Diagnosis` target.
# The fixed random_state keeps the split identical across re-runs.
(
    X_train_diag_lgb,
    X_test_diag_lgb,
    y_train_diag_lgb,
    y_test_diag_lgb,
) = train_test_split(
    X_diag_lgb,
    y_diag_lgb,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline LightGBM classifier for `Diagnosis`: 100 boosting rounds with a
# 0.05 learning rate; every other setting is left at the library default.
lgbm_model_diag = lgb.LGBMClassifier(
    learning_rate=0.05,
    n_estimators=100,
)

In [None]:
# Fit the baseline diagnosis classifier on the training split.
lgbm_model_diag.fit(X=X_train_diag_lgb, y=y_train_diag_lgb)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 861
[LightGBM] [Info] Number of data points in the train set: 258674, number of used features: 14
[LightGBM] [Info] Start training from score -4.427397
[LightGBM] [Info] Start training from score -6.894979
[LightGBM] [Info] Start training from score -8.336189
[LightGBM] [Info] Start training from score -8.186658
[LightGBM] [Info] Start training from score -4.938763
[LightGBM] [Info] Start training from score -6.566170
[LightGBM] [Info] Start training from score -6.246718
[LightGBM] [Info] Start training from score -4.855446
[LightGBM] [Info] Start training from score -3.920268
[LightGBM] [Info] Start training from score -2.232225
[LightGBM] [Info] Start training from score -6.149776
[LightGBM] [Info] Start training from score -4.147268

In [None]:
# Score the baseline diagnosis classifier on the held-out test split
# and report plain accuracy.
y_pred_diag_lgb = lgbm_model_diag.predict(X_test_diag_lgb)
accuracy_diag_lgb = accuracy_score(
    y_true=y_test_diag_lgb,
    y_pred=y_pred_diag_lgb,
)
print(f'LightGBM Model Accuracy: {accuracy_diag_lgb}')

LightGBM Model Accuracy: 0.5776492600782446


### Hyperparameter Tuning

In [None]:
# Base estimator handed to the hyperparameter search; the searched
# parameters are set by the search itself, so none are fixed here.
# verbose=-1 silences LightGBM's per-fit info logging: with default logging
# the search emitted so many lines that Colab truncated the cell output to
# the last 5000 lines, burying the per-candidate [CV] summaries.
lgbm_model_diag_tuning = lgb.LGBMClassifier(verbose=-1)

In [None]:
# Small search space for the diagnosis model: two candidate values per
# hyperparameter, i.e. 2 x 2 x 2 = 8 possible combinations.
param_grid_lgbm_diag = dict(
    n_estimators=[100, 200],    # number of boosting rounds (trees)
    learning_rate=[0.01, 0.1],  # shrinkage applied to each round
    max_depth=[3, 5],           # maximum depth of each tree
)

In [None]:
# Randomised search over the diagnosis grid: 2 sampled parameter combinations,
# each scored with 2-fold cross-validation (verbose=2 prints one line per fit).
# random_state pins which combinations get sampled; without it every
# Restart & Run All could evaluate a different pair and report different
# "best" parameters.
random_search_diag_lgbm = RandomizedSearchCV(
    estimator=lgbm_model_diag_tuning,
    param_distributions=param_grid_lgbm_diag,
    n_iter=2,
    cv=2,
    verbose=2,
    random_state=42,
)


In [None]:
# Run the randomised search on the diagnosis training split and report how
# long it took. perf_counter() is a monotonic clock intended for measuring
# intervals; time.time() follows the wall clock and can jump if the system
# time is adjusted while the fit is running.
start_time = time.perf_counter()
random_search_diag_lgbm.fit(X_train_diag_lgb, y_train_diag_lgb)
print('Fit time : ', time.perf_counter() - start_time)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Start training from score -4.873482
[LightGBM] [Info] Start training from score -2.421292
[LightGBM] [Info] Start training from score -5.837931
[LightGBM] [Info] Start training from score -6.096853
[LightGBM] [Info] Start training from score -7.339360
[LightGBM] [Info] Start training from score -4.265785
[LightGBM] [Info] Start training from score -6.739739
[LightGBM] [Info] Start training from score -1.661750
[LightGBM] [Info] Start training from score -4.864423
[LightGBM] [Info] Start training from score -7.781193
[LightGBM] [Info] Start training from score -5.002834
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  41.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017806 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 854
[

In [None]:
# Report the sampled configuration that scored best during the search.
# `best_score_` is the mean cross-validated score of that configuration over
# the training folds, not accuracy on the held-out test split, so the label
# says so to avoid a like-for-like comparison with the baseline test accuracy.
print("Best parameters found: ", random_search_diag_lgbm.best_params_)
print("Mean cross-validated accuracy of the best model: ", random_search_diag_lgbm.best_score_)

Best parameters found:  {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.01}
Accuracy of the best model:  0.5158230050178989


### Body Part

In [None]:
# Target for this experiment: the `Body_Part` column.
y_body_lgb = neiss['Body_Part']

# Feature matrix: every column of `neiss` except the target and the
# columns excluded below (`Diagnosis` stays in as a feature here).
X_body_lgb = neiss.drop(
    columns=[
        'CPSC_Case_Number',
        'Other_Race',
        'Body_Part',
        'Other_Diagnosis',
        'Body_Part_2',
        'Diagnosis_2',
        'Other_Diagnosis_2',
        'PSU',
        'Stratum',
        'Narrative_1',
        'Treatment_Date',
    ]
)

In [None]:
# Hold out 20% of the rows as a test set for the `Body_Part` target.
# The fixed random_state keeps the split identical across re-runs.
(
    X_train_body_lgb,
    X_test_body_lgb,
    y_train_body_lgb,
    y_test_body_lgb,
) = train_test_split(
    X_body_lgb,
    y_body_lgb,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline LightGBM classifier for `Body_Part`: 100 boosting rounds with a
# 0.05 learning rate; every other setting is left at the library default.
lgbm_model_body = lgb.LGBMClassifier(
    learning_rate=0.05,
    n_estimators=100,
)

In [None]:
# Fit the baseline body-part classifier on the training split.
lgbm_model_body.fit(X=X_train_body_lgb, y=y_train_body_lgb)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.040805 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 866
[LightGBM] [Info] Number of data points in the train set: 258674, number of used features: 14
[LightGBM] [Info] Start training from score -4.346011
[LightGBM] [Info] Start training from score -3.265259
[LightGBM] [Info] Start training from score -2.859861
[LightGBM] [Info] Start training from score -3.753694
[LightGBM] [Info] Start training from score -3.426028
[LightGBM] [Info] Start training from score -3.403458
[LightGBM] [Info] Start training from score -3.015937
[LightGBM] [Info] Start training from score -3.342033
[LightGBM] [Info] Start training from score -2.987470
[LightGBM] [Info] Start training from score -5.103220
[LightGBM] [Info] Start training from score -1.665791
[LightGBM] [Info] Start training from score -2.433027
[LightGBM] [Info] Start training from score -4.318064
[LightGBM

In [None]:
# Score the baseline body-part classifier on the held-out test split
# and report plain accuracy.
y_pred_body_lgb = lgbm_model_body.predict(X_test_body_lgb)
accuracy_body_lgb = accuracy_score(
    y_true=y_test_body_lgb,
    y_pred=y_pred_body_lgb,
)
print(f'LightGBM Model Accuracy: {accuracy_body_lgb}')

LightGBM Model Accuracy: 0.4697614003618426


### Best Parameters

In [None]:
# LightGBM classifier for `Body_Part` with hand-set hyperparameters.
# NOTE(review): max_depth=6 is not among the values in the diagnosis grid
# above ([3, 5]) and no body-part search is visible in this section —
# confirm where these "best" parameters come from.
lgbm_model_body_params = lgb.LGBMClassifier(
    max_depth=6,
    learning_rate=0.01,
    n_estimators=100,
)

In [None]:
# Fit the hand-parameterised body-part classifier on the training split.
lgbm_model_body_params.fit(X=X_train_body_lgb, y=y_train_body_lgb)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.036030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 866
[LightGBM] [Info] Number of data points in the train set: 258674, number of used features: 14
[LightGBM] [Info] Start training from score -4.346011
[LightGBM] [Info] Start training from score -3.265259
[LightGBM] [Info] Start training from score -2.859861
[LightGBM] [Info] Start training from score -3.753694
[LightGBM] [Info] Start training from score -3.426028
[LightGBM] [Info] Start training from score -3.403458
[LightGBM] [Info] Start training from score -3.015937
[LightGBM] [Info] Start training from score -3.342033
[LightGBM] [Info] Start training from score -2.987470
[LightGBM] [Info] Start training from score -5.103220
[LightGBM] [Info] Start training from score -1.665791
[LightGBM] [Info] Start training from score -2.433027

In [None]:
# Score the hand-parameterised body-part classifier on the held-out test
# split and report plain accuracy.
y_pred_body_lgb_params = lgbm_model_body_params.predict(X_test_body_lgb)
accuracy_body_lgb_params = accuracy_score(
    y_true=y_test_body_lgb,
    y_pred=y_pred_body_lgb_params,
)
print(f'LightGBM Model Accuracy: {accuracy_body_lgb_params}')

LightGBM Model Accuracy: 0.45050951769781505
