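KNN benefits more from standard scaling (`StandardScaler`): it is a distance-based model, so features with larger numeric ranges would otherwise dominate the distance.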

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd


# KNN benefits more from using a standard scaler, so this cell only prepares the
# raw features and the column groups; no scaling happens here.
lifestyle_data = pd.read_csv('../datasets/lifestyle_dataset.csv')

# 'Blood Pressure' is split on '/' into two pieces that become the numeric
# 'Systolic_BP' and 'Diastolic_BP' features. Pieces that cannot be parsed as
# numbers turn into NaN (errors='coerce') instead of raising.
blood_pressure_columns = ['Systolic_BP', 'Diastolic_BP']
blood_pressure_split = lifestyle_data['Blood Pressure'].str.split('/', expand=True)

# Columns removed from the feature set, together with the original
# 'Blood Pressure' string that the two numeric columns replace.
columns_to_remove = ['Hemisphere', 'Patient ID', 'Income', 'Continent', 'Country']

lifestyle_data = (
    lifestyle_data
    .assign(
        Systolic_BP=pd.to_numeric(blood_pressure_split[0], errors='coerce'),
        Diastolic_BP=pd.to_numeric(blood_pressure_split[1], errors='coerce'),
    )
    .drop(columns=['Blood Pressure', *columns_to_remove])
)

# Target vector and feature matrix (everything except the output column).
Ylifestyle = lifestyle_data['Heart Attack Risk']
Xlifestyle = lifestyle_data.drop(columns=['Heart Attack Risk'])

# Column groups consumed by the preprocessing step.
# NOTE(review): only 'Sex' is treated as categorical. Any other non-numeric
# column still present in Xlifestyle belongs to none of the three groups below;
# confirm that is intended.
categorical_columns = ['Sex']

# Every float64/int64 feature except the two blood-pressure columns, which are
# kept as their own group. Index.difference returns the names sorted.
numeric_columns = (
    Xlifestyle
    .select_dtypes(include=['float64', 'int64'])
    .columns
    .difference(blood_pressure_columns)
)



In [4]:
# Hold out 20% of the rows for the final evaluation (using the original Xlifestyle
# DataFrame); the fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(Xlifestyle, Ylifestyle, test_size=0.2, random_state=42)

# Column-wise preprocessing used inside the pipeline.
# NOTE: despite "robust" in the variable name, both numeric groups are scaled
# with StandardScaler.
# Columns of Xlifestyle that are not listed in any transformer are discarded,
# because ColumnTransformer's default is remainder='drop'.
life_robust_scaled_preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),  # Standard scale selected numerical columns
        ('bp', StandardScaler(), blood_pressure_columns),  # Standard scale the two blood-pressure columns
        ('cat', OneHotEncoder(), categorical_columns)  # One-hot encode categorical columns
    ]
)

# Define the KNN model (hyperparameters are filled in by the grid search below)
knn = KNeighborsClassifier()

# Preprocessing lives inside the pipeline so that, during cross-validation, the
# scalers and the encoder are re-fitted on each training fold only.
pipeline = Pipeline(steps=[
    ('preprocessor', life_robust_scaled_preprocessor),
    ('knn', knn)
])

# Define the hyperparameters to tune
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],           # Number of neighbors
    'knn__weights': ['uniform', 'distance'],        # Weighting function
    # 'minkowski' is intentionally not listed: with KNeighborsClassifier's default
    # p=2 it is the Euclidean distance, so it only repeated every 'euclidean' fit
    # (a third of the grid) without adding a distinct candidate.
    'knn__metric': ['euclidean', 'manhattan']       # Distance metric
}

# Set up GridSearchCV with 5-fold cross-validation on all available cores.
# NOTE(review): candidates are ranked by plain accuracy; if the two classes are
# imbalanced, a scorer such as 'f1' or 'balanced_accuracy' may be a better
# selection criterion — confirm against the project's goal.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model (the best candidate is refitted on the whole training set)
grid_search.fit(X_train, Y_train)

# Get the best parameters and the best score from GridSearchCV
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Cross-validation Accuracy: {:.2f}".format(grid_search.best_score_))

# Evaluate the refitted best model on the held-out test set
Y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)

# Print the test accuracy and the per-class classification report
print("Test Accuracy: {:.2f}".format(accuracy))
print("Classification Report:\n", classification_report(Y_test, Y_pred))

Best Hyperparameters:  {'knn__metric': 'euclidean', 'knn__n_neighbors': 11, 'knn__weights': 'uniform'}
Best Cross-validation Accuracy: 0.61
Test Accuracy: 0.59
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.83      0.72      1125
           1       0.32      0.14      0.19       628

    accuracy                           0.59      1753
   macro avg       0.48      0.49      0.46      1753
weighted avg       0.52      0.59      0.53      1753

