In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import mutual_info_classif

# Read the heart-disease table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='heart.csv')

In [3]:
# Pull the label column out and keep every remaining column as a predictor.
# NOTE(review): the evaluation output recorded further down shows a perfect
# test score. Check `data.duplicated().sum()` before trusting it -- identical
# rows landing on both sides of the split would inflate the result.
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Standardise the features: the scaling statistics are learned from the
# training split only and then re-applied, unchanged, to the test split.
# NOTE(review): the scaler is fitted here, before the cross-validated grid
# search runs on X_train_scaled, so every CV fold is scaled with statistics
# that include its own validation rows. Wrapping the scaler and classifier in
# a sklearn Pipeline and searching over that would remove this leak.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter search space for GridSearchCV
param_grid = {
    'n_neighbors': range(1, 31),  # every k from 1 to 30 inclusive (odd and even)
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

In [6]:
# Exhaustive search over param_grid, scored by 5-fold cross-validated accuracy.
# n_jobs=-1 lets the candidates be evaluated in parallel.
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)


In [7]:
# Estimator selected by the grid search
best_knn = grid_search.best_estimator_

# Label the held-out rows with it
y_pred = best_knn.predict(X_test_scaled)

# Report the winning settings followed by the per-class metrics
test_report = classification_report(y_test, y_pred)
print("Best Parameters:", grid_search.best_params_)
print("\nClassification Report:", test_report, sep="\n")

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 28, 'weights': 'distance'}

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       102
           1       1.00      1.00      1.00       103

    accuracy                           1.00       205
   macro avg       1.00      1.00      1.00       205
weighted avg       1.00      1.00      1.00       205



In [8]:
# Confusion matrix heatmap -- written to disk and closed, so nothing renders inline
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix - K-Nearest Neighbors')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
fig.savefig('knn_confusion_matrix.png')
plt.close(fig)

In [14]:
# Feature relevance via mutual information between each scaled training
# feature and the target. This score is computed from the data alone; it is
# not derived from the fitted KNN model.
# The estimator is randomised, so it is seeded here: without a seed the
# ranking (and therefore the top-5 used for the recommendations below) can
# change from one run to the next.
feature_importance = mutual_info_classif(X_train_scaled, y_train, random_state=42)

# One row per feature, strongest association first. The original row labels
# are kept, so the index shows each feature's column position in X.
feature_importance_df = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': feature_importance
    })
    .sort_values('importance', ascending=False)
)

# Bar chart of the ranked scores, saved to disk and closed without inline display
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(feature_importance_df['feature'], feature_importance_df['importance'])
ax.set_title('Feature Importance (Mutual Information)')
ax.set_xlabel('Features')
ax.set_ylabel('Mutual Information')
# Slant the feature names so neighbouring labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig('knn_feature_importance.png')
plt.close(fig)

# Text dump of the ranked table, preceded by a blank line and a heading
print("\nFeature Importance: ", feature_importance_df, sep="\n")


Feature Importance: 
     feature  importance
4       chol    0.254171
9    oldpeak    0.166448
11        ca    0.156658
12      thal    0.156023
7    thalach    0.155734
2         cp    0.130874
8      exang    0.105336
0        age    0.102707
10     slope    0.077317
3   trestbps    0.065602
1        sex    0.034031
6    restecg    0.010263
5        fbs    0.008589


In [10]:
# Sweep k from 1 to 30 and record held-out accuracy for each fit.
# Only n_neighbors varies; weights and metric stay at the classifier defaults.
neighbors_range = range(1, 31)
accuracy_scores = []

for n in neighbors_range:
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train_scaled, y_train)
    test_accuracy = knn.score(X_test_scaled, y_test)
    accuracy_scores.append(test_accuracy)

# Line plot of test accuracy against k, saved to disk and closed without inline display
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(neighbors_range, accuracy_scores, marker='o')
ax.set_title('Accuracy vs Number of Neighbors')
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Accuracy')
fig.tight_layout()
fig.savefig('knn_learning_curve.png')
plt.close(fig)


In [11]:
# Preventive Care Strategy Recommendations
def generate_preventive_care_recommendations(importance_df, top_n=5):
    """Map the highest-ranked features to canned preventive-care messages.

    Parameters
    ----------
    importance_df : pandas.DataFrame
        Must contain a 'feature' column. Rows are taken in their existing
        order, so the frame should already be sorted most-important first.
    top_n : int, default 5
        How many leading rows of ``importance_df`` to consider.

    Returns
    -------
    list of str
        One message per considered feature that has an entry in the internal
        lookup table, in the same order as the rows. Features without an
        entry are skipped, so the list may hold fewer than ``top_n`` items
        (and is empty for an empty frame).
    """
    # Leading rows only; head() copes with frames shorter than top_n
    top_features = importance_df['feature'].head(top_n).tolist()

    recommendations_map = {
        'thalach': "Heart Rate Management: Maximum heart rate shows high importance. "
                   "Focus on cardiovascular fitness assessments and personalized exercise programs.",

        'oldpeak': "Exercise-Induced Stress Monitoring: ST depression is critical. "
                   "Recommend regular cardiac stress tests and careful monitoring of exercise-induced cardiac stress.",

        'ca': "Cardiovascular Assessment: Number of major vessels is a key indicator. "
              "Comprehensive cardiovascular diagnostic procedures are crucial.",

        'chol': "Cholesterol Management: Implement targeted interventions including "
                "dietary modifications, regular exercise, and potential medication.",

        # The sentence previously stopped mid-word ("for hyper.")
        'trestbps': "Blood Pressure Control: Prioritize regular monitoring, stress reduction "
                    "techniques, and potential medical interventions for hypertension."
    }

    # Keep ranking order; silently drop features that have no prepared message
    return [
        recommendations_map[feature]
        for feature in top_features
        if feature in recommendations_map
    ]

# Build the messages from the ranked importance table and list them as bullets
preventive_care_recommendations = generate_preventive_care_recommendations(feature_importance_df)

print("\nPreventive Care Recommendations:")
bullet_lines = [f"- {rec}" for rec in preventive_care_recommendations]
for bullet in bullet_lines:
    print(bullet)


Preventive Care Recommendations:
- Cholesterol Management: Implement targeted interventions including dietary modifications, regular exercise, and potential medication.
- Heart Rate Management: Maximum heart rate shows high importance. Focus on cardiovascular fitness assessments and personalized exercise programs.
- Cardiovascular Assessment: Number of major vessels is a key indicator. Comprehensive cardiovascular diagnostic procedures are crucial.
- Exercise-Induced Stress Monitoring: ST depression is critical. Recommend regular cardiac stress tests and careful monitoring of exercise-induced cardiac stress.


In [12]:
# Summarise the winning hyperparameters and their mean cross-validated accuracy
best_params = grid_search.best_params_
best_cv_accuracy = grid_search.best_score_

print("\nModel Performance Insights:")
print(f"Best Number of Neighbors: {best_params['n_neighbors']}")
print(f"Best Weights: {best_params['weights']}")
print(f"Best Distance Metric: {best_params['metric']}")
print(f"Best Cross-Validation Accuracy: {best_cv_accuracy:.4f}")


Model Performance Insights:
Best Number of Neighbors: 28
Best Weights: distance
Best Distance Metric: manhattan
Best Cross-Validation Accuracy: 0.9854
