In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
# from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Read data / csv files
# Three variants of the dataset live under ./Correlation; judging by the file
# names they are presumably: a = regular, b = outliers removed, d = normalised
# (TODO confirm against the notebook that produced these CSVs).
a_file_path, b_file_path, d_file_path = (
    './Correlation/regular_corr.csv',
    './Correlation/rmv_out_corr.csv',
    './Correlation/normali_corr.csv',
)

# Load each CSV into its own DataFrame, in the same a / b / d order.
a_df, b_df, d_df = map(pd.read_csv, (a_file_path, b_file_path, d_file_path))

In [3]:
# Train Test Split
def _split_xy(frame, target='diabetes'):
    """Return (X, y): every column except `target`, and the `target` column itself."""
    return frame.drop(columns=target), frame[target]


a_X, a_y = _split_xy(a_df)
b_X, b_y = _split_xy(b_df)
d_X, d_y = _split_xy(d_df)

# Same 80/20 hold-out and the same seed for every dataset variant, so the three
# splits are produced under identical settings. No separate validation split is
# carved out of the training portion here.
_split_options = {'test_size': 0.2, 'random_state': 1}

a_X_train, a_X_test, a_y_train, a_y_test = train_test_split(a_X, a_y, **_split_options)
b_X_train, b_X_test, b_y_train, b_y_test = train_test_split(b_X, b_y, **_split_options)
d_X_train, d_X_test, d_y_train, d_y_test = train_test_split(d_X, d_y, **_split_options)

In [4]:
# Regular
# Baseline KNN with default hyperparameters, fitted on the regular training split.
a_model = knn()
a_model.fit(a_X_train, a_y_train)

print('>> done fitting')

# List the hyperparameters of the baseline model together with their current values.
parameter = a_model.get_params()
for key, value in parameter.items():
    print(f"{key}: {value}")

# Grid to search: neighbourhood size, vote weighting, and the Minkowski power
# parameter (p=1 is Manhattan distance, p=2 is Euclidean distance).
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]}
a_grid_search = GridSearchCV(knn(), param_grid, cv=5)  # 5-fold CV on the training split only
a_grid_search.fit(a_X_train, a_y_train)
a_best_params = a_grid_search.best_params_

print(f'Regular best param : {a_best_params}')

# GridSearchCV refits the winning configuration on the whole training split by
# default (refit=True), so reuse that fitted estimator instead of building and
# fitting an identical model a second time.
a_best_model = a_grid_search.best_estimator_
a_y_pred = a_best_model.predict(a_X_test)

# Evaluate on the held-out test split, which the grid search never saw.
print("Regular Classification Report:\n", classification_report(a_y_test, a_y_pred))
print("Regular Confusion Matrix:\n", confusion_matrix(a_y_test, a_y_pred))

>> done fitting
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: None
n_neighbors: 5
p: 2
weights: uniform
Regular best param : {'n_neighbors': 5, 'p': 1, 'weights': 'distance'}
Regular Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.93      0.88      2549
           1       0.67      0.41      0.51       844

    accuracy                           0.80      3393
   macro avg       0.75      0.67      0.69      3393
weighted avg       0.79      0.80      0.79      3393

Regular Confusion Matrix:
 [[2381  168]
 [ 500  344]]


In [5]:
# Outliers removed (b_* data, loaded from rmv_out_corr.csv)
# The labels in this cell used to read "Regular" (copied from the previous cell),
# which made its report indistinguishable from the regular dataset's.
# NOTE(review): "Outliers-removed" is inferred from the CSV file name - confirm wording.
# Baseline KNN with default hyperparameters, fitted on this variant's training split.
b_model = knn()
b_model.fit(b_X_train, b_y_train)

print('>> done fitting')

# List the hyperparameters of the baseline model together with their current values.
parameter = b_model.get_params()
for key, value in parameter.items():
    print(f"{key}: {value}")

# Grid to search: neighbourhood size, vote weighting, and the Minkowski power
# parameter (p=1 is Manhattan distance, p=2 is Euclidean distance).
# NOTE(review): in the recorded run class 1 recall was only 0.03. No `scoring` is
# passed, so the search optimises the estimator's default score (accuracy);
# consider a scoring that reflects the minority class.
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]}
b_grid_search = GridSearchCV(knn(), param_grid, cv=5)  # 5-fold CV on the training split only
b_grid_search.fit(b_X_train, b_y_train)
b_best_params = b_grid_search.best_params_

print(f'Outliers-removed best param : {b_best_params}')

# GridSearchCV refits the winning configuration on the whole training split by
# default (refit=True), so reuse that fitted estimator instead of building and
# fitting an identical model a second time.
b_best_model = b_grid_search.best_estimator_
b_y_pred = b_best_model.predict(b_X_test)

# Evaluate on the held-out test split, which the grid search never saw.
print("Outliers-removed Classification Report:\n", classification_report(b_y_test, b_y_pred))
print("Outliers-removed Confusion Matrix:\n", confusion_matrix(b_y_test, b_y_pred))

>> done fitting
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: None
n_neighbors: 5
p: 2
weights: uniform
Regular best param : {'n_neighbors': 7, 'p': 2, 'weights': 'uniform'}
Regular Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.99      0.92      2454
           1       0.26      0.03      0.06       374

    accuracy                           0.86      2828
   macro avg       0.56      0.51      0.49      2828
weighted avg       0.79      0.86      0.81      2828

Regular Confusion Matrix:
 [[2419   35]
 [ 362   12]]


In [6]:
# Normalized (d_* data, loaded from normali_corr.csv)
# The labels in this cell used to read "Regular" (copied from an earlier cell),
# which made its report indistinguishable from the regular dataset's.
# NOTE(review): "Normalized" is inferred from the CSV file name - confirm wording.
# Baseline KNN with default hyperparameters, fitted on this variant's training split.
d_model = knn()
d_model.fit(d_X_train, d_y_train)

print('>> done fitting')

# List the hyperparameters of the baseline model together with their current values.
parameter = d_model.get_params()
for key, value in parameter.items():
    print(f"{key}: {value}")

# Grid to search: neighbourhood size, vote weighting, and the Minkowski power
# parameter (p=1 is Manhattan distance, p=2 is Euclidean distance).
# NOTE(review): in the recorded run class 1 recall was only 0.01. No `scoring` is
# passed, so the search optimises the estimator's default score (accuracy);
# consider a scoring that reflects the minority class.
param_grid = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]}
d_grid_search = GridSearchCV(knn(), param_grid, cv=5)  # 5-fold CV on the training split only
d_grid_search.fit(d_X_train, d_y_train)
d_best_params = d_grid_search.best_params_

print(f'Normalized best param : {d_best_params}')

# GridSearchCV refits the winning configuration on the whole training split by
# default (refit=True), so reuse that fitted estimator instead of building and
# fitting an identical model a second time.
d_best_model = d_grid_search.best_estimator_
d_y_pred = d_best_model.predict(d_X_test)

# Evaluate on the held-out test split, which the grid search never saw.
print("Normalized Classification Report:\n", classification_report(d_y_test, d_y_pred))
print("Normalized Confusion Matrix:\n", confusion_matrix(d_y_test, d_y_pred))

>> done fitting
algorithm: auto
leaf_size: 30
metric: minkowski
metric_params: None
n_jobs: None
n_neighbors: 5
p: 2
weights: uniform
Regular best param : {'n_neighbors': 7, 'p': 1, 'weights': 'uniform'}
Regular Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.99      0.93      2454
           1       0.16      0.01      0.02       374

    accuracy                           0.86      2828
   macro avg       0.51      0.50      0.47      2828
weighted avg       0.77      0.86      0.81      2828

Regular Confusion Matrix:
 [[2433   21]
 [ 370    4]]
