# Heart Classification with Hypertrophic Cardiomyopathy

This notebook demonstrates a **K-Nearest Neighbors (KNN) classifier** that predicts cardiomegaly from heart and lung measurements. It covers **data preprocessing**, **hyperparameter tuning**, **cross-validation**, and **model evaluation metrics**.

## 1. Importing libraries

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Classification models
from sklearn.neighbors import KNeighborsClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    roc_curve,
)

## 2. Load and preprocess data

In [ ]:
# Read the dataset; trim stray whitespace from the header names so the
# column lookups below match exactly
data = pd.read_csv("task_data.csv")
data.columns = data.columns.str.strip()

# Columns used as model inputs
numeric_cols = [
    "Heart width",
    "Lung width",
    "CTR - Cardiothoracic Ratio",
    "xx",
    "yy",
    "xy",
    "normalized_diff",
    "Inscribed circle radius",
    "Polygon Area Ratio",
    "Heart perimeter",
    "Heart area",
    "Lung area",
]

# Parse every feature column as float: go through str, swap "," for "."
# (presumably the file uses decimal commas -- confirm against the CSV),
# then cast
data[numeric_cols] = data[numeric_cols].apply(
    lambda column: column.astype(str).str.replace(",", ".", regex=True).astype(float)
)

# Target vector and feature matrix
y = data["Cardiomegaly"]
X = data[numeric_cols]

## 3. Split dataset

In [ ]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target so both partitions keep the same class proportions, matching the
# stratified cross-validation used during tuning.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standalone scaled copies of the features. The scaler is fitted on the
# training rows only and then applied to the test rows, so no test
# statistics leak into the scaling.
# NOTE: the KNN pipeline built later contains its own StandardScaler and is
# fed the unscaled X_train / X_test; these arrays are not used by the other
# cells visible in this notebook.
scaler = StandardScaler()
X_scaled_train = scaler.fit_transform(X_train)
X_scaled_test = scaler.transform(X_test)

## 4. Hyperparameter tuning with GridSearchCV

In [ ]:
# Hyperparameter grid for the "model" step of the pipeline.
# Only the Minkowski metric is searched: p=1 is the Manhattan distance and
# p=2 is the Euclidean distance, so also listing "manhattan" and
# "euclidean" (for which `p` has no effect) would only refit the same
# models several times without widening the search.
param_grid = {
    "model__n_neighbors": [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    "model__weights": ["uniform", "distance"],
    "model__metric": ["minkowski"],
    "model__p": [1, 2],
}

# 5-fold stratified CV repeated 5 times (25 fits per candidate); the fixed
# seed keeps the folds reproducible
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

# Scaling lives inside the pipeline so that it is re-fitted on the training
# part of every CV fold
pipe_knn = Pipeline([
    ("scaler", StandardScaler()),
    ("model", KNeighborsClassifier()),
])

# Exhaustive search over the grid, scored by accuracy, using all CPU cores
grid_search = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    scoring="accuracy",
    cv=rskf,
    verbose=1,
    n_jobs=-1,
)

# Fit on the unscaled training data (the pipeline scales internally)
grid_search.fit(X_train, y_train)

# Display best parameters and CV score
print("Best hyperparameters:", grid_search.best_params_)
print("Best CV accuracy:", round(grid_search.best_score_, 4))

## 5. Evaluate on test set

In [ ]:
# Best pipeline found by the grid search (GridSearchCV refits it on the
# whole training set by default)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# ROC AUC needs a continuous score, not hard labels: feeding it y_pred
# reduces the ROC curve to a single operating point. Use the predicted
# probability of the second class in `classes_` instead.
# NOTE: assumes a binary target whose positive class is the larger label
# (f1_score with its default settings below relies on the same) -- confirm
# against the "Cardiomegaly" column.
y_proba = best_model.predict_proba(X_test)[:, 1]

print("Test Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("F1 Score:", round(f1_score(y_test, y_pred), 4))
print("ROC AUC Score:", round(roc_auc_score(y_test, y_proba), 4))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## 6. Cross-validation on training data

In [ ]:
# Repeated stratified k-fold scores on the training data. No `scoring` is
# given, so the estimator's own `score` method is used.
# NOTE(review): this evaluates `pipe_knn` as constructed (default KNN
# hyperparameters), not the tuned `grid_search.best_estimator_` -- confirm
# that a baseline is what is intended here.
cv_score = cross_val_score(estimator=pipe_knn, X=X_train, y=y_train, cv=rskf)

print("Cross-validation scores (training data):")
print(np.round(cv_score, 2))

# Summary statistics over all folds and repeats
for label, statistic in (("Mean CV score:", np.mean), ("Std CV score:", np.std)):
    print(label, round(statistic(cv_score), 3))

## 7. Optional: ROC Curve visualization

In [ ]:
# Build the ROC curve from predicted probabilities rather than hard labels;
# with hard labels the curve has only one interior point.
# NOTE: assumes a binary target; column 1 of predict_proba is the
# probability of the second class in `classes_`.
y_score = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN Classifier')
plt.legend(loc='lower right')
plt.show()