In [None]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Mount Google Drive into the Colab filesystem, then read the NEISS 2022 CSV
# stored under MyDrive/Capstone into a DataFrame.
drive.mount('/content/drive')
path = "/content/drive/MyDrive/Capstone/neiss_2022.csv"
neiss = pd.read_csv(filepath_or_buffer=path)

Mounted at /content/drive


## Diagnosis - KNN

### Initial Model

In [None]:
# Feature matrix for the Diagnosis model: every column of `neiss` except the
# target itself and the columns listed below. 'Body_Part' is not in the list,
# so it stays in as a predictor.
X_diag = neiss.drop(
    columns=[
        'CPSC_Case_Number',
        'Other_Race',
        'Diagnosis',
        'Other_Diagnosis',
        'Body_Part_2',
        'Diagnosis_2',
        'Other_Diagnosis_2',
        'PSU',
        'Stratum',
        'Narrative_1',
        'Treatment_Date',
    ]
)
# Target vector: the Diagnosis column.
y_diag = neiss['Diagnosis']

In [None]:
# Hold out 20% of the rows as a test set for the Diagnosis model; the fixed
# random_state makes the partition repeatable across runs.
(
    X_train_diag,
    X_test_diag,
    y_train_diag,
    y_test_diag,
) = train_test_split(
    X_diag,
    y_diag,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline Diagnosis classifier: KNN with k = 5, every other setting left at
# the library default (no feature scaling is applied at this stage).
model_knn_diag = KNeighborsClassifier(
    n_neighbors=5,
)

In [None]:
# Train the baseline Diagnosis model on the training partition.
model_knn_diag.fit(X=X_train_diag, y=y_train_diag)

In [None]:
# Predict a Diagnosis label for each held-out row.
y_pred_diag = model_knn_diag.predict(X=X_test_diag)

In [None]:
# Score the baseline: share of held-out rows whose predicted Diagnosis matches
# the true label.
accuracy_knn_diag = accuracy_score(
    y_true=y_test_diag,
    y_pred=y_pred_diag,
)
print(f'Accuracy of KNN Classifier: {accuracy_knn_diag}')

Accuracy of KNN Classifier: 0.4263712134098254


### Feature Importance

In [None]:
# Reduced feature matrix for Diagnosis: the same exclusions as the initial
# model, plus 'Drug', 'Product_3', 'Alcohol' and 'Fire_Involvement'.
X_diag_fi = neiss.drop(
    columns=[
        'CPSC_Case_Number',
        'Other_Race',
        'Diagnosis',
        'Other_Diagnosis',
        'Body_Part_2',
        'Diagnosis_2',
        'Other_Diagnosis_2',
        'PSU',
        'Stratum',
        'Narrative_1',
        'Treatment_Date',
        'Drug',
        'Product_3',
        'Alcohol',
        'Fire_Involvement',
    ]
)
# Target vector is unchanged: the Diagnosis column.
y_diag_fi = neiss['Diagnosis']

In [None]:
# 80/20 train/test partition of the reduced Diagnosis feature set, using the
# same seed (42) as the initial-model split.
(
    X_train_diag_fi,
    X_test_diag_fi,
    y_train_diag_fi,
    y_test_diag_fi,
) = train_test_split(
    X_diag_fi,
    y_diag_fi,
    test_size=0.2,
    random_state=42,
)

### Best Params

In [None]:
# Hyperparameter search space. The 'knn__' prefix routes each entry to the
# pipeline step named 'knn': k from 1 to 30, two vote weightings and two
# distance metrics (30 * 2 * 2 = 120 candidates).
param_grid = dict(
    knn__n_neighbors=range(1, 31),
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan'],
)

In [None]:
# Two-step estimator: standardize the features, then classify with KNN.
# Scaling lives inside the pipeline so it is re-fit on whatever data the
# pipeline is fitted to (KNN is distance-based, so feature scale matters).
scaling_step = ('scaler', StandardScaler())
knn_step = ('knn', KNeighborsClassifier())
pipe = Pipeline(steps=[scaling_step, knn_step])

In [None]:
# Exhaustive search over `param_grid` with 5-fold cross-validation, using all
# available cores (n_jobs=-1) and printing a progress summary (verbose=1).
grid_search_diag = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [None]:
# Run the search on the reduced Diagnosis training set. This is the expensive
# cell: 120 candidates x 5 folds = 600 pipeline fits.
grid_search_diag.fit(X=X_train_diag_fi, y=y_train_diag_fi)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [None]:
# Winning hyperparameter combination from the Diagnosis grid search
# (keys carry the 'knn__' pipeline-step prefix).
best_params_diag: dict = grid_search_diag.best_params_

In [None]:
# Cross-validation score obtained by the winning combination.
best_score_diag: float = grid_search_diag.best_score_

In [None]:
# Report the search result: one line for the parameters, one for the score.
print(
    f"Best parameters: {best_params_diag}",
    f"Best cross-validation score: {best_score_diag}",
    sep="\n",
)

Best parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 29, 'knn__weights': 'distance'}
Best cross-validation score: 0.4384514916949076


## Body Part - KNN

### Initial Model

In [None]:
# Feature matrix for the Body_Part model: every column of `neiss` except the
# target itself and the columns listed below. 'Diagnosis' is not in the list,
# so it stays in as a predictor.
X_body = neiss.drop(
    columns=[
        'CPSC_Case_Number',
        'Other_Race',
        'Body_Part',
        'Other_Diagnosis',
        'Body_Part_2',
        'Diagnosis_2',
        'Other_Diagnosis_2',
        'PSU',
        'Stratum',
        'Narrative_1',
        'Treatment_Date',
    ]
)
# Target vector: the Body_Part column.
y_body = neiss['Body_Part']

In [None]:
# Hold out 20% of the rows as a test set for the Body_Part model; the fixed
# random_state makes the partition repeatable across runs.
(
    X_train_body,
    X_test_body,
    y_train_body,
    y_test_body,
) = train_test_split(
    X_body,
    y_body,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline Body_Part classifier: KNN with k = 5, every other setting left at
# the library default (no feature scaling is applied at this stage).
model_knn_body = KNeighborsClassifier(
    n_neighbors=5,
)

In [None]:
# Train the baseline Body_Part model on the training partition.
model_knn_body.fit(X=X_train_body, y=y_train_body)

In [None]:
# Predict a Body_Part label for each held-out row.
y_pred_body = model_knn_body.predict(X=X_test_body)

In [None]:
# Score the baseline: share of held-out rows whose predicted Body_Part matches
# the true label.
accuracy_knn_body = accuracy_score(
    y_true=y_test_body,
    y_pred=y_pred_body,
)
print(f'Accuracy of KNN Classifier: {accuracy_knn_body}')

Accuracy of KNN Classifier: 0.3326477910590855


### Feature Importance

In [None]:
# Reduced feature matrix for Body_Part: the same exclusions as the initial
# model, plus 'Product_3', 'Drug', 'Alcohol' and 'Fire_Involvement'.
X_body_fi = neiss.drop(
    columns=[
        'CPSC_Case_Number',
        'Other_Race',
        'Body_Part',
        'Other_Diagnosis',
        'Body_Part_2',
        'Diagnosis_2',
        'Other_Diagnosis_2',
        'PSU',
        'Stratum',
        'Narrative_1',
        'Treatment_Date',
        'Product_3',
        'Drug',
        'Alcohol',
        'Fire_Involvement',
    ]
)
# Target vector is unchanged: the Body_Part column.
y_body_fi = neiss['Body_Part']

In [None]:
# 80/20 train/test partition of the reduced Body_Part feature set, using the
# same seed (42) as the initial-model split.
(
    X_train_body_fi,
    X_test_body_fi,
    y_train_body_fi,
    y_test_body_fi,
) = train_test_split(
    X_body_fi,
    y_body_fi,
    test_size=0.2,
    random_state=42,
)

### Best Params

In [None]:
# Hard-coded KNN hyperparameters (no 'knn__' prefix, so they can be passed
# straight to KNeighborsClassifier).
# NOTE(review): these values equal the result printed by the Diagnosis grid
# search earlier in the notebook; no Body_Part-specific search is visible here,
# so confirm they are meant to be reused for this target.
best_params = dict(
    metric='manhattan',
    n_neighbors=29,
    weights='distance',
)

In [None]:
# Build the tuned Body_Part model inside the same preprocessing pipeline that
# the grid search used. The search earlier in this notebook evaluated every
# candidate as StandardScaler -> KNeighborsClassifier, so the selected metric,
# k and weighting describe neighbourhoods in *standardized* feature space.
# Fitting a bare KNeighborsClassifier on the raw columns would compute
# distances on a different scale from the one the parameters were chosen for.
# The scaler sits inside the pipeline, so it is fitted on the training rows
# only when `knn_best.fit(...)` is called.
# `knn_best` still exposes .fit() / .predict(), so later cells work unchanged.
knn_best = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=best_params['n_neighbors'],
                                 weights=best_params['weights'],
                                 metric=best_params['metric'])),
])

In [None]:
# Train the tuned Body_Part model on the reduced-feature training partition.
knn_best.fit(X=X_train_body_fi, y=y_train_body_fi)

In [None]:
# Predict a Body_Part label for each held-out row with the tuned model.
y_pred_body_fi = knn_best.predict(X=X_test_body_fi)

In [None]:
# Score the tuned model: share of held-out rows whose predicted Body_Part
# matches the true label.
accuracy_knn_body_fi = accuracy_score(
    y_true=y_test_body_fi,
    y_pred=y_pred_body_fi,
)
print(f'Accuracy of KNN Classifier: {accuracy_knn_body_fi}')

Accuracy of KNN Classifier: 0.37085775255532016
