In [None]:
# Install required packages 
!pip install kaggle
!pip install pandas numpy scikit-learn matplotlib seaborn

# Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("All libraries imported successfully!")

In [None]:
# Read the processed Cleveland file straight from the UCI repository
import pandas as pd

# The file carries no header row, so the column labels are supplied by hand.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target',
]

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"

# '?' entries are parsed as NaN so they show up in the missing-value checks.
df = pd.read_csv(url, na_values="?", names=column_names)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")

In [None]:
# df = pd.read_csv('heart.csv') or you can use the variable from alternative method

# 1. View first few rows
print("=== First 5 rows of the dataset ===")
print(df.head())
print("\n")

# 2. Check dataset info (data types, non-null counts)
print("=== Dataset Information ===")
df.info()  # info() prints its report itself and returns None, so it is not wrapped in print()
print("\n")

# 3. Summary statistics
print("=== Summary Statistics ===")
print(df.describe())
print("\n")

# 4. Check for missing values
print("=== Missing Values Check ===")
print(df.isnull().sum())
print("\n")

# 5. Check target variable distribution
print("=== Target Variable Distribution ===")
print(df['target'].value_counts())
# Count patients whose target is non-zero. Summing the column would over-count
# whenever the target holds severity grades above 1 instead of a 0/1 flag.
has_disease = df['target'] > 0
print(f"Percentage with heart disease: {(has_disease.mean() * 100):.1f}%")

In [None]:
# Check for missing values again after loading with '?' as NA
print("=== Missing values after loading with na_values='?' ===")
print(df.isnull().sum())
print("\n")

# Handle missing values: fill the gaps in 'ca' and 'thal' with each column's mode
for col in ['ca', 'thal']:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        # Assign the filled column back. `df[col].fillna(..., inplace=True)` acts on a
        # temporary Series and does not update `df` under pandas copy-on-write.
        df[col] = df[col].fillna(mode_value)
        print(f"Filled missing values in '{col}' with the mode: {mode_value}")

print("\n=== Missing values after handling ===")
print(df.isnull().sum())

In [None]:
# Convert columns to appropriate data types.
# 'ca' and 'thal' were parsed as float because of the missing values; they are integer codes.
for col in ['ca', 'thal']:
    df[col] = df[col].astype(int)

# Collapse the target to a binary label (0 = no disease, 1 = disease).
# The later cells rely on this: precision/recall/F1 use their binary defaults and the
# classification report names exactly two classes. Re-running this line is harmless.
df['target'] = (df['target'] > 0).astype(int)

print("=== Data types after conversion ===")
df.info()  # info() prints directly and returns None

In [None]:
# Bar chart of value counts for every discrete column (target included)
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

bar_palette = sns.color_palette('viridis')
fig, axes = plt.subplots(3, 3, figsize=(15, 10))

for ax, col in zip(axes.flat, categorical_features):
    counts = df[col].value_counts()
    counts.plot(kind='bar', color=bar_palette, rot=0, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')

fig.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for each continuous column
numerical_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

hist_palette = sns.color_palette('viridis')
fig = plt.figure(figsize=(15, 10))

for position, col in enumerate(numerical_features, start=1):
    # Axes are added one at a time so the unused sixth slot of the 2x3 grid stays blank.
    ax = fig.add_subplot(2, 3, position)
    sns.histplot(df[col], kde=True, color=hist_palette[position - 1], ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Feature dictionary for reference.
# The value ranges follow the UCI "processed.cleveland.data" encoding loaded above
# (cp 1-4, slope 1-3, thal 3/6/7), not the re-encoded Kaggle copy of the dataset.
feature_dict = {
    'age': 'Age in years',
    'sex': 'Sex (1 = male, 0 = female)',
    'cp': 'Chest pain type (1-4)',
    'trestbps': 'Resting blood pressure (mm Hg)',
    'chol': 'Serum cholesterol (mg/dl)',
    'fbs': 'Fasting blood sugar > 120 mg/dl (1 = true, 0 = false)',
    'restecg': 'Resting ECG results (0-2)',
    'thalach': 'Maximum heart rate achieved',
    'exang': 'Exercise induced angina (1 = yes, 0 = no)',
    'oldpeak': 'ST depression induced by exercise',
    'slope': 'Slope of peak exercise ST segment (1-3)',
    'ca': 'Number of major vessels colored by fluoroscopy (0-3)',
    'thal': 'Thalassemia (3 = normal, 6 = fixed defect, 7 = reversable defect)',
    'target': 'Heart disease diagnosis (0 = no disease; 1-4 in the raw UCI file = disease)'
}

print("=== Feature Descriptions ===")
for feature, description in feature_dict.items():
    print(f"{feature}: {description}")

Alternative data source: install the `kaggle` package first, then download the dataset from Kaggle.

In [None]:
# Optional alternative data source: download the Kaggle copy of the dataset.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
# None of the cells visible here read the downloaded archive; `df` is loaded from the UCI URL.
!pip install --upgrade kaggle
!kaggle datasets download -v -d ronitf/heart-disease-uci --force

In [None]:
# Manual file upload through the Google Colab widget.
# The `google.colab` import only resolves inside a Colab runtime.
from google.colab import files
# Assumes `uploaded` maps each file name to its contents, as the loop below
# uses it that way — TODO confirm against the Colab API.
uploaded = files.upload()
for fn in uploaded.keys():
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')

In [None]:
# Two-panel overview: class balance and age distribution per class
fig, (ax_counts, ax_age) = plt.subplots(1, 2, figsize=(12, 5))

# Any non-zero target counts as disease, so the plot is right for a 0/1 or a graded target.
has_disease = df['target'] > 0

# Panel 1: target distribution.
# The bars are put in label order (0 then 1) so they match the axis caption, and the
# colours match panel 2 (blue = no disease, coral = disease). value_counts() alone
# orders by frequency, which can swap bars and colours.
status_counts = has_disease.value_counts().reindex([False, True], fill_value=0)
status_counts.index = [0, 1]
status_counts.plot(kind='bar', color=['lightblue', 'lightcoral'], rot=0, ax=ax_counts)
ax_counts.set_title('Heart Disease Distribution')
ax_counts.set_xlabel('Target (0=No Disease, 1=Disease)')
ax_counts.set_ylabel('Count')

# Panel 2: age distribution by target
df.loc[~has_disease, 'age'].hist(alpha=0.7, label='No Disease', color='lightblue', ax=ax_age)
df.loc[has_disease, 'age'].hist(alpha=0.7, label='Disease', color='lightcoral', ax=ax_age)
ax_age.set_title('Age Distribution by Heart Disease Status')
ax_age.set_xlabel('Age')
ax_age.set_ylabel('Count')
ax_age.legend()

fig.tight_layout()
plt.show()

Task 2: Data preprocessing — cleaning and preparing the dataset

In [None]:
# Data quality assessment: size, duplicates, missing values, dtypes
print("=== DATA QUALITY ASSESSMENT ===\n")

# 1. Dataset dimensions
print(f"Dataset shape: {df.shape}")
print(f"Number of patients: {df.shape[0]}")
print(f"Number of features: {df.shape[1] - 1} (excluding target)")

# 2. Exact duplicate rows are counted and, if any exist, dropped
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")
if duplicates > 0:
    print("Removing duplicates...")
    df = df.drop_duplicates()
    print(f"New shape after removing duplicates: {df.shape}")

# 3. Per-column missing counts and percentages (only columns with gaps are shown)
print("\n=== Missing Values Analysis ===")
missing_data = df.isna().sum()
missing_percent = (missing_data / len(df)) * 100
missing_table = pd.DataFrame({'Missing Count': missing_data, 'Percentage': missing_percent})
has_gaps = missing_table['Missing Count'] > 0
print(missing_table[has_gaps])

# 4. Column dtypes
print("\n=== Data Types Check ===")
print(df.dtypes)

In [None]:
print("=== FEATURE ANALYSIS ===\n")

# Per-column profile: dtype, cardinality, range and the ten most frequent values.
for column in df.columns:
    if column == 'target':
        continue  # the label is not profiled here
    values = df[column]
    print(f"\n--- {column.upper()} ---")
    print(f"Data type: {values.dtype}")
    print(f"Unique values: {values.nunique()}")
    print(f"Min: {values.min()}, Max: {values.max()}")
    top_counts = values.value_counts().head(10)
    print(f"Value counts:\n{top_counts}")

# The processed Cleveland file does contain a few missing values (in 'ca' and 'thal',
# imputed earlier); the generic handler below covers any that remain.

In [None]:
def handle_missing_values(df):
    """Impute missing values column by column and report what was done.

    Numeric columns are filled with their median, all other columns with
    their mode (most frequent value). A column with no observed value at all
    is left untouched, because there is nothing to impute from.

    Parameters:
    df: pandas DataFrame; columns with gaps are overwritten on this object

    Returns:
    The same DataFrame object, with the imputed columns replaced
    """
    print("=== HANDLING MISSING VALUES ===")

    # Count the gaps once, up front: the report must show how many values were
    # filled, and that number is no longer available after filling.
    missing = df.isnull().sum()

    if missing.sum() == 0:
        print("No missing values found!")
        return df

    for column in missing[missing > 0].index:
        n_missing = int(missing[column])
        print(f"\nHandling missing values in {column}:")

        observed = df[column].dropna()
        if observed.empty:
            print(f"  - Skipped: all {n_missing} values are missing, nothing to impute from")
            continue

        is_numeric = (pd.api.types.is_numeric_dtype(df[column])
                      and not pd.api.types.is_bool_dtype(df[column]))
        if is_numeric:
            # Median is more robust to outliers than the mean
            fill_value, strategy = observed.median(), "median"
        else:
            # Mode = most frequent category
            fill_value, strategy = observed.mode()[0], "mode"

        # Assign back instead of `fillna(inplace=True)` on a column selection,
        # which does not reliably update the frame (and never does under copy-on-write).
        df[column] = df[column].fillna(fill_value)
        print(f"  - Filled {n_missing} missing values with {strategy}: {fill_value}")

    return df

# Apply the function: impute any remaining gaps and rebind `df` to the returned frame
df = handle_missing_values(df)

# Outlier detection and handling 
# Detect outliers using statistical methods

In [None]:

def detect_outliers(df, features, plot=True):
    """Detect outliers with the 1.5 * IQR rule and report them per feature.

    A value is an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR of its column. The 'target' column and non-numeric
    columns are skipped.

    Parameters:
    df: pandas DataFrame to inspect (not modified)
    features: iterable of column names to check
    plot: when True (default), draw a histogram with the bounds and a
          boxplot for every feature that has outliers

    Returns:
    List of row index labels, in row order, flagged as an outlier in at
    least one of the checked features
    """
    print("=== OUTLIER DETECTION ===")

    # One flag per row; a row flagged by several features is reported once.
    flagged = np.zeros(len(df), dtype=bool)

    for feature in features:
        if feature == 'target':
            continue
        column = df[feature]
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            continue

        # Calculate IQR and the outlier bounds
        Q1 = column.quantile(0.25)
        Q3 = column.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Missing values compare as False, so they are never flagged
        is_outlier = ((column < lower_bound) | (column > upper_bound)).fillna(False).to_numpy(dtype=bool)
        n_outliers = int(is_outlier.sum())
        if n_outliers == 0:
            continue

        flagged |= is_outlier

        print(f"\n{feature}:")
        print(f"  - Lower bound: {lower_bound:.2f}")
        print(f"  - Upper bound: {upper_bound:.2f}")
        print(f"  - Outliers found: {n_outliers} ({n_outliers/len(df)*100:.1f}%)")

        if plot:
            # Visualize outliers
            plt.figure(figsize=(10, 4))

            plt.subplot(1, 2, 1)
            column.hist(bins=30, alpha=0.7)
            plt.axvline(lower_bound, color='red', linestyle='--', label=f'Lower bound: {lower_bound:.1f}')
            plt.axvline(upper_bound, color='red', linestyle='--', label=f'Upper bound: {upper_bound:.1f}')
            plt.title(f'{feature} Distribution with Outlier Bounds')
            plt.legend()

            plt.subplot(1, 2, 2)
            df.boxplot(column=feature)
            plt.title(f'{feature} Boxplot')

            plt.tight_layout()
            plt.show()

    outlier_indices = df.index[flagged].tolist()
    return outlier_indices

# Apply outlier detection to every numeric column of df.
# Note: this rebinds `numerical_features`, replacing the shorter hand-written list
# defined in the histogram cell earlier.
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
detect_outliers(df, numerical_features)

# Feature engineering and selection
# Create new features that might be useful

In [None]:

print("=== FEATURE ENGINEERING ===")

# Binned features: new column -> (source column, bin edges, bin labels).
# 1. age_group, 2. chol_category, 3. bp_category — created in this order.
binned_features = {
    'age_group': ('age', [0, 40, 50, 60, 70, 100],
                  ['<40', '40-50', '50-60', '60-70', '70+']),
    'chol_category': ('chol', [0, 200, 240, 1000],
                      ['Desirable', 'Borderline', 'High']),
    'bp_category': ('trestbps', [0, 120, 130, 140, 180, 300],
                    ['Normal', 'Elevated', 'Stage1', 'Stage2', 'Crisis']),
}
for new_column, (source_column, edges, names) in binned_features.items():
    df[new_column] = pd.cut(df[source_column], bins=edges, labels=names)

# 4. Heart rate efficiency: maximum heart rate relative to age
df['heart_rate_efficiency'] = df['thalach'] / df['age']

summary_lines = (
    "New features created:",
    "- age_group: Categorized age ranges",
    "- chol_category: Cholesterol levels based on medical standards",
    "- bp_category: Blood pressure categories",
    "- heart_rate_efficiency: Ratio of max heart rate to age",
)
for line in summary_lines:
    print(line)

# Data normalization and scaling
# Separate features and target


In [None]:
# Separate the feature matrix from the label column
X = df.drop('target', axis=1)
y = df['target']

print("=== FEATURE SCALING ===")

# Identify numerical and categorical features.
# Both lists are taken BEFORE one-hot encoding, so `numerical_features` never
# contains the dummy columns created below and those are left unscaled.
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numerical features: {len(numerical_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Handle categorical features (if any): one-hot encode, dropping the first level
# of each. From here on `X` holds the encoded frame.
if categorical_features:
    X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
    print(f"After one-hot encoding: {X.shape[1]} features")

# Scale numerical features to zero mean / unit variance.
# NOTE(review): the scaler is fit on all rows of X, before the train/test split in
# the next cell, so test-set statistics influence the scaling (data leakage).
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numerical_features] = scaler.fit_transform(X[numerical_features])

print("\nScaling completed!")
print(f"Original feature range example (age): {X['age'].min():.1f} to {X['age'].max():.1f}")
print(f"Scaled feature range example (age): {X_scaled['age'].min():.2f} to {X_scaled['age'].max():.2f}")

In [None]:
# Train-test split
from sklearn.model_selection import train_test_split

print("=== TRAIN-TEST SPLIT ===")

# Split the UNSCALED encoded features (80% train, 20% test), stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and apply it to both parts, so no
# test-set statistics leak into the training features. `scaler` is rebound here
# and is the object to persist for scoring new data.
X_train, X_test = X_train.copy(), X_test.copy()
scaler = StandardScaler()
X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
X_test[numerical_features] = scaler.transform(X_test[numerical_features])

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training set distribution: {y_train.value_counts().to_dict()}")
print(f"Test set distribution: {y_test.value_counts().to_dict()}")

# Verify the split maintains the class distribution.
# The ratio counts rows with a non-zero label; summing the labels would be wrong
# for a target that is not strictly 0/1.
train_ratio = (y_train > 0).mean()
test_ratio = (y_test > 0).mean()
print(f"\nClass balance check:")
print(f"Training set positive ratio: {train_ratio:.3f}")
print(f"Test set positive ratio: {test_ratio:.3f}")

# Final Data verification 
# Create a comprehensive verification report


In [None]:
print("=== FINAL VERIFICATION ===\n")

def verify_preprocessing(X_train, X_test, y_train, y_test):
    """Print a sanity report for the train/test matrices and labels.

    Reports the shapes, the total number of missing cells in each feature
    matrix, the mean and standard deviation of the first three numeric
    training columns, and the class counts of both label vectors.

    Parameters:
    X_train, X_test: pandas DataFrames of features
    y_train, y_test: pandas Series of labels

    Returns:
    None; the report is printed
    """

    # 1. Check shapes
    print("1. Shape Verification:")
    print(f"   X_train: {X_train.shape}")
    print(f"   X_test: {X_test.shape}")
    print(f"   y_train: {y_train.shape}")
    print(f"   y_test: {y_test.shape}")

    # 2. Check for missing values
    print("\n2. Missing Values Check:")
    print(f"   X_train missing: {X_train.isnull().sum().sum()}")
    print(f"   X_test missing: {X_test.isnull().sum().sum()}")

    # 3. Check scaling
    print("\n3. Scaling Verification (first 3 numerical features):")
    numerical_cols = X_train.select_dtypes(include=[np.number]).columns[:3]
    for col in numerical_cols:
        print(f"   {col}: mean={X_train[col].mean():.3f}, std={X_train[col].std():.3f}")

    # 4. Check target distribution
    print("\n4. Target Distribution:")
    print(f"   Training: {y_train.value_counts().to_dict()}")
    print(f"   Testing: {y_test.value_counts().to_dict()}")

    # The check mark was stored as mis-decoded bytes; restored to the intended character.
    print("\n✅ Preprocessing completed successfully!")

# Run verification on the split produced above; the report is printed, nothing is returned
verify_preprocessing(X_train, X_test, y_train, y_test)


In [None]:
import pickle

# Bundle the split together with the feature names so it can be reloaded as one object
preprocessed_data = dict(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    feature_names=list(X_train.columns),
)

# Persist the fitted scaler first, then the preprocessed split
artifacts = (
    ('scaler.pkl', scaler),
    ('preprocessed_data.pkl', preprocessed_data),
)
for artifact_path, payload in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(payload, f)

print("Preprocessed data saved!")
print("Files created: scaler.pkl, preprocessed_data.pkl")

# Summary Visualization


In [None]:
# NOTE(review): this cell writes the same two files (scaler.pkl, preprocessed_data.pkl)
# from the same variables as the preceding cell, so running it only rewrites them.
# The preceding cell ends with a "Summary Visualization" heading, so a plotting cell
# may have been intended here — confirm and replace or remove.
import pickle

# Save the scaler for later use
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# Save the preprocessed data
preprocessed_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'feature_names': X_train.columns.tolist()
}

with open('preprocessed_data.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)

print("Preprocessed data saved!")
print("Files created: scaler.pkl, preprocessed_data.pkl")

# Check if one-hot encoding created too many features


In [None]:
# `X` was already one-hot encoded when the split was made, so comparing X with X_train
# always prints the same number twice. The count before encoding comes from `df`:
# every column except the target.
print(f"Original features: {df.shape[1] - 1}")
print(f"After preprocessing: {X_train.shape[1]}")

# Task 3: Model Building - Training Multiple Classification Algorithms

In [None]:
# Estimators, model-selection helpers and the timer used by the modelling cells
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
import time

# Short recap of the data going into the models
setup_report = (
    "=== MODEL BUILDING SETUP ===",
    f"Training samples: {X_train.shape[0]}",
    f"Test samples: {X_test.shape[0]}",
    f"Number of features: {X_train.shape[1]}",
    f"Features: {list(X_train.columns)}",
)
print("\n".join(setup_report))

# create a model training framework 
# Create a function to train and evaluate models


In [None]:
def train_model(model, X_train, y_train, X_test, y_test, model_name):
    """
    Fit a model, score it on the test set and cross-validate it on the training set.

    Parameters:
    model: sklearn estimator instance (fitted in place)
    X_train: training features
    y_train: training target
    X_test: test features
    y_test: test target
    model_name: name of the model for display

    Returns:
    Dictionary with the fitted model, its test predictions, positive-class
    scores ('probabilities': predict_proba column 1, else decision_function
    output, else None), accuracy / precision / recall / F1 on the test set,
    mean and std of 5-fold CV accuracy on the training set, and the fit time
    in seconds.
    """
    print(f"\n=== Training {model_name} ===")

    # perf_counter is a monotonic, high-resolution clock, so it suits short
    # durations better than wall-clock time.time()
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Make predictions
    y_pred = model.predict(X_test)

    # Not every estimator exposes predict_proba (e.g. SVC without probability=True);
    # fall back to decision scores, or None, instead of crashing.
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = None

    # Calculate metrics; zero_division=0 yields a plain 0.0 when a model never
    # predicts the positive class, instead of emitting an undefined-metric warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Cross-validation score
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    results = {
        'model_name': model_name,
        'model': model,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'training_time': training_time
    }

    print(f"Training completed in {training_time:.3f} seconds")
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Cross-validation: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

    return results

In [None]:
# Model 1 - Logistic Regression
print("=== MODEL 1: LOGISTIC REGRESSION ===")
for line in (
    "Why Logistic Regression?",
    "- Simple and interpretable",
    "- Good baseline for binary classification",
    "- Provides probability estimates",
    "- Fast training",
):
    print(line)

# liblinear suits small datasets; max_iter is raised to help convergence
lr_model = LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)

# Fit and score through the shared helper
lr_results = train_model(lr_model, X_train, y_train, X_test, y_test, "Logistic Regression")

# Rank the features by absolute coefficient size
lr_coefficients = lr_model.coef_[0]
feature_importance_lr = (
    pd.DataFrame({'feature': X_train.columns, 'coefficient': lr_coefficients})
    .assign(abs_coefficient=lambda table: table['coefficient'].abs())
    .sort_values('abs_coefficient', ascending=False)
)

print("\nTop 10 Most Important Features (Logistic Regression):")
print(feature_importance_lr.head(10))

In [None]:
# Model 2 - Random Forest
print("\n=== MODEL 2: RANDOM FOREST ===")
for line in (
    "Why Random Forest?",
    "- Handles non-linear relationships",
    "- Robust to outliers",
    "- Provides feature importance",
    "- Good for mixed data types",
):
    print(line)

# Forest configuration: 100 depth-limited trees with minimum split / leaf sizes to
# curb overfitting, and balanced class weights
rf_settings = dict(
    n_estimators=100,
    random_state=42,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
)
rf_model = RandomForestClassifier(**rf_settings)

# Fit and score through the shared helper
rf_results = train_model(rf_model, X_train, y_train, X_test, y_test, "Random Forest")

# Rank the features by the forest's importances
rf_importances = rf_model.feature_importances_
feature_importance_rf = pd.DataFrame(
    {'feature': X_train.columns, 'importance': rf_importances}
).sort_values(by='importance', ascending=False)

print("\nTop 10 Most Important Features (Random Forest):")
print(feature_importance_rf.head(10))

In [None]:
# Model 3 - K-Nearest Neighbors
print("\n=== MODEL 3: K-NEAREST NEIGHBORS ===")
print("Why KNN?")
print("- Simple and intuitive")
print("- Non-parametric (makes no assumptions)")
print("- Good for small datasets")
print("- Instance-based learning")

# We need to find optimal k value first
print("\nFinding optimal k value...")

# The k search and the final model share these settings. Searching with the default
# uniform weights and then training with distance weights would pick k for a
# different model than the one actually used.
knn_settings = {
    'weights': 'distance',  # Weight by distance (closer neighbors matter more)
    'metric': 'euclidean',  # Distance metric
}

# Test different k values with 5-fold cross-validation on the training set
k_values = range(1, 21)
cv_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, **knn_settings)
    scores = cross_val_score(knn, X_train, y_train, cv=5, scoring='accuracy')
    cv_scores.append(scores.mean())

# Find optimal k (first k with the highest mean CV accuracy)
optimal_k = k_values[np.argmax(cv_scores)]
print(f"Optimal k value: {optimal_k}")

# Plot k vs accuracy
plt.figure(figsize=(10, 6))
plt.plot(k_values, cv_scores, marker='o')
plt.xlabel('K Value')
plt.ylabel('Cross-Validation Accuracy')
plt.title('K Value Optimization for KNN')
plt.grid(True)
plt.show()

# Train with optimal k
knn_model = KNeighborsClassifier(n_neighbors=optimal_k, **knn_settings)

knn_results = train_model(knn_model, X_train, y_train, X_test, y_test, f"KNN (k={optimal_k})")

In [None]:
# Model 4 - Neural Network
print("\n=== MODEL 4: NEURAL NETWORK ===")
for line in (
    "Why Neural Network?",
    "- Can learn complex patterns",
    "- Good for non-linear relationships",
    "- Scalable to large datasets",
):
    print(line)

# Multi-layer perceptron from scikit-learn
from sklearn.neural_network import MLPClassifier

# Two hidden layers, L2 penalty (alpha), and early stopping on a 10% validation
# fraction to limit overfitting
nn_settings = dict(
    hidden_layer_sizes=(100, 50),
    max_iter=1000,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1,
    alpha=0.01,
    learning_rate_init=0.001,
)
nn_model = MLPClassifier(**nn_settings)

print("Training Neural Network (this may take a moment)...")
nn_results = train_model(nn_model, X_train, y_train, X_test, y_test, "Neural Network")

# Training loss per iteration, when the fitted model recorded it
if hasattr(nn_model, 'loss_curve_'):
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(nn_model.loss_curve_)
    ax.set_title('Neural Network Training Loss')
    ax.set_xlabel('Iterations')
    ax.set_ylabel('Loss')
    ax.grid(True)
    plt.show()

In [None]:
# Collect the result dictionaries of the four trained models
all_results = [lr_results, rf_results, knn_results, nn_results]

# Comparison table: display label -> key inside each result dictionary
comparison_columns = {
    'Model': 'model_name',
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
    'CV Mean': 'cv_mean',
    'CV Std': 'cv_std',
    'Training Time (s)': 'training_time',
}
comparison_df = pd.DataFrame({
    label: [result[key] for result in all_results]
    for label, key in comparison_columns.items()
})

print("=== MODEL COMPARISON ===")
print(comparison_df.round(3))

# The best model is the one with the highest test F1-score (first one on ties)
f1_by_model = [result['f1_score'] for result in all_results]
best_model_idx = np.argmax(f1_by_model)
best_model = all_results[best_model_idx]
print(f"\nBest Model: {best_model['model_name']}")
print(f"F1-Score: {best_model['f1_score']:.3f}")

# Improving the best model

In [None]:
# Improve the best model with hyperparameter tuning
from sklearn.base import clone

print(f"\n=== HYPERPARAMETER TUNING FOR {best_model['model_name']} ===")

# Parameter grids keyed by a substring of the model name. Every model that can be
# selected as best has an entry, including the neural network (previously missing,
# which left `param_grid` undefined and raised a NameError).
param_grids = {
    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 15, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'KNN': {
        'n_neighbors': list(range(3, 20, 2)),
        'weights': ['uniform', 'distance'],
        # 'minkowski' with its default p=2 is identical to 'euclidean', so it is not repeated
        'metric': ['euclidean', 'manhattan']
    },
    'Neural Network': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate_init': [0.001, 0.01]
    },
}
param_grid = next(
    (grid for key, grid in param_grids.items() if key in best_model['model_name']),
    None,
)
if param_grid is None:
    raise ValueError(f"No parameter grid defined for model: {best_model['model_name']}")

# Perform Grid Search.
# clone() gives an unfitted copy that keeps the non-tuned settings of the winning
# model (random_state, class_weight, max_iter, ...). A bare `__class__()` would
# reset them all to library defaults.
print("Performing grid search...")
grid_search = GridSearchCV(
    clone(best_model['model']),
    param_grid,
    cv=3,  # 3-fold CV for speed
    scoring='f1',
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

# Train final model with best parameters
final_model = grid_search.best_estimator_
final_results = train_model(final_model, X_train, y_train, X_test, y_test,
                           f"Tuned {best_model['model_name']}")

# Save the best performing model

In [None]:
# Save the best performing model
import re
import joblib

# Keep the tuned model only if it beats the untuned winner on test F1
use_tuned = final_results['f1_score'] > best_model['f1_score']
chosen_results = final_results if use_tuned else best_model
model_to_save = chosen_results['model']

# File-system friendly name: "KNN (k=5)" -> "knn_k_5", "Random Forest" -> "random_forest"
model_slug = re.sub(r'[^a-z0-9]+', '_', best_model['model_name'].lower()).strip('_')
model_name = f"tuned_{model_slug}" if use_tuned else model_slug

# Save model
model_filename = f'best_heart_disease_model_{model_name}.pkl'
joblib.dump(model_to_save, model_filename)

# Save scaler (we'll need it for new predictions)
joblib.dump(scaler, 'scaler.pkl')

# The icon was stored as mis-decoded bytes; restored to the intended character.
print(f"\n💾 Model saved as: {model_filename}")
print("💾 Scaler saved as: scaler.pkl")

# Create a model summary.
# Every metric comes from the model that was actually saved. Taking max() per metric
# over the tuned and untuned results could report numbers no single model achieved.
model_summary = {
    'best_model': model_name,
    'accuracy': chosen_results['accuracy'],
    'precision': chosen_results['precision'],
    'recall': chosen_results['recall'],
    'f1_score': chosen_results['f1_score'],
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'features_used': list(X_train.columns)
}

print("\n=== FINAL MODEL SUMMARY ===")
for key, value in model_summary.items():
    print(f"{key}: {value}")

# Create comprehensive feature importance visualization

In [None]:
# 2x2 summary figure: feature importance of the saved model plus three model comparisons
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_importance = axes[0, 0]

# Panel 1: feature importance from the best model
if hasattr(model_to_save, 'feature_importances_'):  # Tree-based models
    importance_df = pd.DataFrame({
        'feature': X_train.columns,
        'importance': model_to_save.feature_importances_
    }).sort_values('importance', ascending=False)

    # Plot top 15 features
    top_features = importance_df.head(15)
    ax_importance.barh(top_features['feature'], top_features['importance'])
    ax_importance.set_xlabel('Importance')
    ax_importance.set_title(f'Top 15 Features - {model_name}')
    ax_importance.invert_yaxis()

elif hasattr(model_to_save, 'coef_'):  # Linear models
    importance_df = pd.DataFrame({
        'feature': X_train.columns,
        'coefficient': model_to_save.coef_[0],
        'abs_coefficient': np.abs(model_to_save.coef_[0])
    }).sort_values('abs_coefficient', ascending=False)

    # Plot top 15 features; red = negative coefficient, blue = positive
    top_features = importance_df.head(15)
    colors = ['red' if x < 0 else 'blue' for x in top_features['coefficient']]
    ax_importance.barh(top_features['feature'], top_features['coefficient'], color=colors)
    ax_importance.set_xlabel('Coefficient Value')
    ax_importance.set_title(f'Top 15 Features - {model_name}')
    ax_importance.invert_yaxis()

else:
    # Models such as KNN or the MLP expose neither attribute: say so in the panel
    # instead of leaving an unexplained gap in the figure.
    ax_importance.text(0.5, 0.5,
                       f'{model_name}\ndoes not expose feature importances',
                       ha='center', va='center', transform=ax_importance.transAxes)
    ax_importance.set_title(f'Top 15 Features - {model_name}')
    ax_importance.set_axis_off()

# Panels 2-4: one bar chart per metric, same model order and colours in each
models = [result['model_name'] for result in all_results]
accuracies = [result['accuracy'] for result in all_results]
f1_scores = [result['f1_score'] for result in all_results]
training_times = [result['training_time'] for result in all_results]
bar_colors = ['skyblue', 'lightgreen', 'orange', 'pink']

comparison_panels = [
    (axes[0, 1], accuracies, 'Accuracy', 'Model Accuracy Comparison'),
    (axes[1, 0], f1_scores, 'F1-Score', 'Model F1-Score Comparison'),
    (axes[1, 1], training_times, 'Training Time (seconds)', 'Training Time Comparison'),
]
for ax, values, y_label, title in comparison_panels:
    ax.bar(models, values, color=bar_colors)
    ax.set_xlabel('Models')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

# visualization for logistic regression and neural network

In [None]:
# Example settings only: both calls build an unfitted estimator that is not
# assigned to any variable, so this cell does not change any model trained above.

# For Logistic Regression
LogisticRegression(max_iter=1000, solver='liblinear')

# For Neural Network
MLPClassifier(max_iter=2000, early_stopping=True)

# visualization for random forest - reduce complexity 

In [None]:
# Example settings only: the estimators built here are unfitted and not assigned,
# so nothing trained above is affected.

# Random Forest - reduce complexity
RandomForestClassifier(max_depth=10, min_samples_split=5, min_samples_leaf=2)

# Add regularization
LogisticRegression(C=0.1, penalty='l2')

# Use class_weight parameter

In [None]:
# Use class_weight parameter (example only: the estimator is built and discarded)
RandomForestClassifier(class_weight='balanced')

# Or use SMOTE for oversampling
# `imblearn` is not among the packages installed in the first cell.
# NOTE(review): X_train_balanced / y_train_balanced are not used by any cell visible
# here — confirm whether a model is meant to be retrained on them.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Model evaluation 

In [None]:
# Metrics and plotting libraries for the evaluation section
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, confusion_matrix, classification_report,
                           roc_curve, auc, precision_recall_curve)
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score
import seaborn as sns
import matplotlib.pyplot as plt

print("=== MODEL EVALUATION SETUP ===")
print("Models to evaluate:")
# Numbered list of the models collected in all_results
for position, result in enumerate(all_results, start=1):
    print(f"{position}. {result['model_name']}")

# metrics calculation for all models

In [None]:
def calculate_comprehensive_metrics(y_true, y_pred, y_pred_proba, model_name):
    """
    Score one model's predictions and print its classification report.

    Parameters:
        y_true: true labels
        y_pred: predicted labels
        y_pred_proba: predicted scores; stored in the result unchanged
        model_name: name used in the printed header and in the result

    Returns:
        Dictionary with the model name, accuracy, precision, recall, F1,
        Matthews correlation coefficient, Cohen's kappa and the three
        input arrays
    """
    print(f"\n=== {model_name} - Detailed Metrics ===")

    # Result key -> scoring function, in the order the keys appear in the result
    scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
        'mcc': matthews_corrcoef,
        'kappa': cohen_kappa_score,
    }
    scores = {key: scorer(y_true, y_pred) for key, scorer in scorers.items()}

    # Per-class precision / recall / F1 table
    print("Classification Report:")
    print(classification_report(y_true, y_pred, target_names=['No Disease', 'Disease']))

    return {
        'model_name': model_name,
        **scores,
        'y_true': y_true,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    }

# Calculate metrics for all models.
# The loop was cut off after `for result`, which is a syntax error. It is completed
# from the keys each result dictionary carries ('predictions', 'probabilities',
# 'model_name') and the signature of calculate_comprehensive_metrics.
all_metrics = []
for result in all_results:
    metrics = calculate_comprehensive_metrics(
        y_test,
        result['predictions'],
        result['probabilities'],
        result['model_name'],
    )
    all_metrics.append(metrics)

# confusion matrix interpretation

In [None]:
def plot_confusion_matrices(all_metrics):
    """
    Plot one confusion-matrix heatmap per model and print a per-model
    breakdown of the four cells plus sensitivity and specificity.

    Args:
        all_metrics: List of metric dictionaries; each one is read through
            its 'model_name', 'accuracy', 'y_true' and 'y_pred' keys.

    The subplot grid has two columns and as many rows as the number of
    models requires; grid cells without a model are hidden. Labels are
    treated as 0 = 'No Disease', 1 = 'Disease'.
    """
    class_names = ['No Disease', 'Disease']
    n_models = len(all_metrics)
    n_cols = 2
    n_rows = max(1, (n_models + n_cols - 1) // n_cols)  # ceil division, at least one row

    # labels=[0, 1] pins every matrix to 2x2 even when a class is missing from
    # both y_true and y_pred; otherwise the 2x2 indexing and the four-way
    # unpacking below would fail. Computed once and reused for plot and text.
    matrices = [
        confusion_matrix(m['y_true'], m['y_pred'], labels=[0, 1])
        for m in all_metrics
    ]

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
    axes = axes.ravel()

    for ax, metrics, cm in zip(axes, all_metrics, matrices):
        # Create heatmap
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=class_names,
                    yticklabels=class_names,
                    ax=ax)

        ax.set_title(f'{metrics["model_name"]}\n'
                     f'Accuracy: {metrics["accuracy"]:.3f}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

        # Add percentage annotations slightly below each cell's count
        total = cm.sum()
        for j in range(2):
            for k in range(2):
                percentage = cm[j, k] / total * 100 if total else 0.0
                ax.text(k + 0.5, j + 0.7, f'({percentage:.1f}%)',
                        ha='center', va='center', fontsize=10, color='red')

    # Hide grid cells that have no model to show
    for ax in axes[n_models:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    # Print confusion matrix interpretation
    print("\n=== CONFUSION MATRIX INTERPRETATION ===")
    for metrics, cm in zip(all_metrics, matrices):
        tn, fp, fn, tp = cm.ravel()

        # Guard the ratios the same way PPV/NPV are guarded elsewhere in this
        # notebook: report 0 when the denominator is empty.
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

        print(f"\n{metrics['model_name']}:")
        print(f"  True Negatives (Correctly predicted no disease): {tn}")
        print(f"  False Positives (Incorrectly predicted disease): {fp}")
        print(f"  False Negatives (Missed disease cases): {fn}")
        print(f"  True Positives (Correctly predicted disease): {tp}")
        print(f"  Sensitivity (Recall): {sensitivity:.3f}")
        print(f"  Specificity: {specificity:.3f}")

# Draw the heatmaps and print the per-model breakdown for every entry in all_metrics.
plot_confusion_matrices(all_metrics)

# ROC Curve Analysis

In [None]:
# ROC Curve Analysis
def plot_roc_curves(all_metrics):
    """
    Draw the ROC curve of every model on one figure, then print an AUC
    interpretation guide followed by each model's AUC.

    Each entry of all_metrics is read through its 'model_name', 'y_true'
    and 'y_pred_proba' keys.
    """
    plt.figure(figsize=(10, 8))

    # (name, AUC) pairs collected while plotting and printed after the figure.
    auc_by_model = []
    for entry in all_metrics:
        false_pos_rate, true_pos_rate, _ = roc_curve(entry['y_true'],
                                                     entry['y_pred_proba'])
        area = auc(false_pos_rate, true_pos_rate)
        auc_by_model.append((entry['model_name'], area))

        curve_label = f"{entry['model_name']} (AUC = {area:.3f})"
        plt.plot(false_pos_rate, true_pos_rate, label=curve_label, linewidth=2)

    # Chance-level reference line
    plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('ROC Curves Comparison - Heart Disease Prediction')
    plt.legend(loc="lower right")
    plt.grid(True, alpha=0.3)
    plt.show()

    # AUC interpretation guide
    guide_lines = (
        "\n=== ROC-AUC INTERPRETATION ===",
        "AUC (Area Under Curve) Interpretation:",
        "- 1.0 = Perfect classifier",
        "- 0.9-1.0 = Excellent",
        "- 0.8-0.9 = Good",
        "- 0.7-0.8 = Fair",
        "- 0.6-0.7 = Poor",
        "- 0.5 = Random guessing",
    )
    for line in guide_lines:
        print(line)

    for name, area in auc_by_model:
        print(f"\n{name}: AUC = {area:.3f}")

# Plot the ROC curves and print the AUC of every entry in all_metrics.
plot_roc_curves(all_metrics)


In [None]:
# Precision-Recall Curve Analysis
def plot_precision_recall_curves(all_metrics):
    """
    Plot Precision-Recall curves for all models and print a short summary.

    Args:
        all_metrics: List of metric dictionaries; each one is read through
            its 'model_name', 'y_true', 'y_pred_proba', 'precision' and
            'recall' keys.

    Average Precision is computed with sklearn's average_precision_score
    (the recall-weighted sum of precisions along the curve). The previous
    unweighted np.mean over the curve's precision values is not Average
    Precision and depends on how many thresholds the curve happens to have.
    """
    # Function-scope import: the notebook's import cells do not bring this name in.
    from sklearn.metrics import average_precision_score

    plt.figure(figsize=(10, 8))

    # Average Precision per model, in the order of all_metrics; reused for
    # the printed summary so the curve is not recomputed.
    avg_precisions = []
    for metrics in all_metrics:
        precision, recall, _ = precision_recall_curve(metrics['y_true'],
                                                      metrics['y_pred_proba'])

        # Calculate Average Precision
        avg_precision = average_precision_score(metrics['y_true'],
                                                metrics['y_pred_proba'])
        avg_precisions.append(avg_precision)

        plt.plot(recall, precision,
                 label=f"{metrics['model_name']} (AP = {avg_precision:.3f})",
                 linewidth=2)

    # Add baseline (random classifier performance): the share of positive
    # labels. Read explicitly from the last entry instead of relying on the
    # loop variable leaking out of the loop, which raised NameError when
    # all_metrics was empty.
    # NOTE(review): assumes every model was evaluated on the same y_true.
    if all_metrics:
        reference_labels = np.asarray(all_metrics[-1]['y_true'])
        baseline = np.sum(reference_labels) / len(reference_labels)
        plt.axhline(y=baseline, color='k', linestyle='--',
                    label=f'Baseline (Random): {baseline:.3f}')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall (Sensitivity)')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curves - Heart Disease Prediction')
    plt.legend(loc="lower left")
    plt.grid(True, alpha=0.3)
    plt.show()

    # Print interpretation
    print("\n=== PRECISION-RECALL INTERPRETATION ===")
    print("For Heart Disease Prediction:")
    print("- High Precision = Few false alarms (healthy predicted as diseased)")
    print("- High Recall = Few missed cases (disease not detected)")
    print("- Average Precision = Summarizes the curve as a single number")

    # Display the per-model numbers
    for metrics, avg_precision in zip(all_metrics, avg_precisions):
        print(f"\n{metrics['model_name']}:")
        print(f"  Average Precision: {avg_precision:.3f}")
        print(f"  Precision: {metrics['precision']:.3f}")
        print(f"  Recall: {metrics['recall']:.3f}")

# Plot the Precision-Recall curves and print the summary for every entry in all_metrics.
plot_precision_recall_curves(all_metrics)

# Advanced Metrics Comparison

In [None]:
# Advanced Metrics Comparison
def create_metrics_comparison_table(all_metrics):
    """
    Build, print and return a one-row-per-model comparison table.

    The table holds the stored scalar metrics of every model plus the ROC AUC
    computed from 'y_true' / 'y_pred_proba', all rounded to three decimals.
    After the table, the best model for each of a fixed set of metrics is
    printed.

    Returns:
        The rounded pandas DataFrame.
    """
    # Table column -> key in each metrics dictionary (insertion order = column order)
    column_sources = {
        'Model': 'model_name',
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'F1-Score': 'f1_score',
        'MCC': 'mcc',
        'Kappa': 'kappa',
    }
    table = pd.DataFrame({
        column: [entry[key] for entry in all_metrics]
        for column, key in column_sources.items()
    })

    # ROC AUC is not stored in the dictionaries, so derive it per model
    areas = []
    for entry in all_metrics:
        false_pos_rate, true_pos_rate, _ = roc_curve(entry['y_true'],
                                                     entry['y_pred_proba'])
        areas.append(auc(false_pos_rate, true_pos_rate))
    table['AUC'] = areas

    table = table.round(3)

    print("=== COMPREHENSIVE METRICS COMPARISON ===")
    print(table.to_string(index=False))

    # Best model per metric (first row wins on ties, as idxmax does)
    print("\n=== BEST PERFORMING MODELS ===")
    for metric in ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'MCC'):
        winner_row = table[metric].idxmax()
        winner_name = table.loc[winner_row, 'Model']
        winner_score = table.loc[winner_row, metric]
        print(f"{metric}: {winner_name} ({winner_score:.3f})")

    return table

# Print the comparison and keep the returned table in `metrics_df`.
metrics_df = create_metrics_comparison_table(all_metrics)


In [None]:
def interpret_medical_context(all_metrics):
    """
    Interpret results in medical context.

    For every model this prints sensitivity, specificity, positive and
    negative predictive value with a one-line explanation each, followed by
    a coarse rating of sensitivity and specificity.

    Args:
        all_metrics: List of metric dictionaries; each one is read through
            its 'model_name', 'y_true' and 'y_pred' keys. Labels are treated
            as 0 = no disease, 1 = disease.
    """
    print("\n" + "="*60)
    print("MEDICAL CONTEXT INTERPRETATION")
    print("="*60)

    for metrics in all_metrics:
        # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking
        # works even when one class is absent from y_true and y_pred.
        cm = confusion_matrix(metrics['y_true'], metrics['y_pred'], labels=[0, 1])
        tn, fp, fn, tp = cm.ravel()

        print(f"\n{metrics['model_name']}:")
        print("-" * 40)

        # Calculate medical metrics; every ratio falls back to 0 when its
        # denominator is empty (sensitivity/specificity were unguarded before).
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0  # True Positive Rate
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # True Negative Rate
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0  # Positive Predictive Value
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value

        # The arrows below replace the mis-decoded byte sequence that the
        # original strings contained in place of "→".
        print(f"Sensitivity (Recall): {sensitivity:.3f}")
        print(f"  → Ability to correctly identify patients WITH heart disease")
        print(f"  → {sensitivity*100:.1f}% of diseased patients were correctly identified")

        print(f"\nSpecificity: {specificity:.3f}")
        print(f"  → Ability to correctly identify patients WITHOUT heart disease")
        print(f"  → {specificity*100:.1f}% of healthy patients were correctly identified")

        print(f"\nPositive Predictive Value: {ppv:.3f}")
        print(f"  → When model predicts disease, it's correct {ppv*100:.1f}% of the time")

        print(f"\nNegative Predictive Value: {npv:.3f}")
        print(f"  → When model predicts no disease, it's correct {npv*100:.1f}% of the time")

        # Clinical implications
        print(f"\nClinical Implications:")
        if sensitivity > 0.85:
            print("   EXCELLENT: High sensitivity - few missed cases")
        elif sensitivity > 0.75:
            print("   GOOD: Moderate sensitivity - some missed cases")
        else:
            print("   POOR: Low sensitivity - many missed cases")

        if specificity > 0.85:
            print("   EXCELLENT: High specificity - few false alarms")
        elif specificity > 0.75:
            print("   GOOD: Moderate specificity - some false alarms")
        else:
            print("   POOR: Low specificity - many false alarms")

# Print the medical-context interpretation for every entry in all_metrics.
interpret_medical_context(all_metrics)

In [None]:
# Statistical Significance Testing
from scipy import stats

def perform_statistical_tests(all_metrics):
    """
    Compare the best model (highest 'f1_score') with every other model using
    McNemar's exact test on their per-sample correctness, and print the
    p-value and a verdict at the 0.05 level.

    Args:
        all_metrics: List of metric dictionaries; each one is read through
            its 'model_name', 'f1_score', 'y_true' and 'y_pred' keys.

    Returns:
        None. All results are printed.

    The exact test is computed with scipy's binomial distribution (already
    imported in this cell as `stats`) instead of statsmodels, which the
    notebook never installs. The formula is the exact two-sided one:
    p = min(1, 2 * BinomCDF(min(b, c); b + c, 0.5)), where b and c count the
    samples that only one of the two models classified correctly.
    """
    print("\n=== STATISTICAL SIGNIFICANCE TESTING ===")

    # McNemar's test for comparing two models
    def mcnemar_test(y_true, model1_pred, model2_pred, model1_name, model2_name):
        # Compare positionally so a pandas index on y_true cannot misalign
        # with array-valued predictions.
        truth = np.asarray(y_true)
        model1_correct = np.asarray(model1_pred) == truth
        model2_correct = np.asarray(model2_pred) == truth

        # Discordant pairs: samples where exactly one model is right.
        # Counting them directly also covers the cases where a crosstab
        # would not be 2x2 (e.g. one model is always right), which were
        # silently skipped before.
        only_model1 = int(np.sum(model1_correct & ~model2_correct))
        only_model2 = int(np.sum(~model1_correct & model2_correct))
        n_discordant = only_model1 + only_model2

        if n_discordant == 0:
            # Identical correctness pattern: nothing distinguishes the models
            p_value = 1.0
        else:
            tail = stats.binom.cdf(min(only_model1, only_model2), n_discordant, 0.5)
            p_value = min(1.0, 2 * float(tail))

        print(f"\nMcNemar's Test: {model1_name} vs {model2_name}")
        print(f"  p-value: {p_value:.4f}")
        if p_value < 0.05:
            print("  → Statistically significant difference")
        else:
            print("  → No statistically significant difference")

    if not all_metrics:
        print("No models to compare.")
        return

    # Compare best model with others
    best_metric = max(all_metrics, key=lambda x: x['f1_score'])
    print(f"\nBest model: {best_metric['model_name']}")

    for metrics in all_metrics:
        # Identity check: skips only the best entry itself, even if another
        # entry happens to share its display name.
        if metrics is not best_metric:
            mcnemar_test(
                metrics['y_true'],
                best_metric['y_pred'],
                metrics['y_pred'],
                best_metric['model_name'],
                metrics['model_name']
            )

# Compare the best-F1 model against every other entry in all_metrics.
perform_statistical_tests(all_metrics)

In [None]:
# Error Analysis
def detailed_error_analysis(best_metrics, features=None):
    """
    Analyze prediction errors in detail.

    Prints the number of misclassified cases, splits them into false
    positives and false negatives, prints the mean age / cholesterol /
    resting blood pressure of each group, and draws a 2x2 figure: age and
    cholesterol histograms of the errors, a pie chart of the error types,
    and the age distribution of correct vs incorrect predictions.

    Args:
        best_metrics: Metrics dictionary; read through its 'model_name',
            'y_true' and 'y_pred' keys. Labels are treated as 0 = healthy,
            1 = diseased.
        features: DataFrame with one row per prediction and at least the
            columns 'age', 'chol' and 'trestbps'. Defaults to the
            notebook-level `X_test`, which is what the function always
            used before this parameter existed.

    Returns:
        None.
    """
    if features is None:
        # Backward-compatible default: the notebook-global test features
        features = X_test

    print(f"\n=== DETAILED ERROR ANALYSIS FOR {best_metrics['model_name']} ===")

    # Work on plain arrays so rows are matched by position; assigning a
    # Series whose index differs from the feature frame would silently
    # misalign labels and rows.
    actual = np.asarray(best_metrics['y_true'])
    predicted = np.asarray(best_metrics['y_pred'])
    is_correct = actual == predicted

    # Find misclassified cases
    labelled = features.copy()
    labelled['actual'] = actual
    labelled['predicted'] = predicted
    labelled['correct'] = is_correct
    misclassified = labelled[~labelled['correct']]

    print(f"Total misclassified cases: {len(misclassified)}")

    # Analyze false positives vs false negatives
    false_positives = misclassified[misclassified['actual'] == 0]
    false_negatives = misclassified[misclassified['actual'] == 1]

    print(f"False Positives (healthy predicted as diseased): {len(false_positives)}")
    print(f"False Negatives (diseased predicted as healthy): {len(false_negatives)}")

    error_groups = [
        ('False Positive', false_positives),
        ('False Negative', false_negatives),
    ]

    # Analyze characteristics of misclassified cases
    for kind, subset in error_groups:
        if len(subset) > 0:
            print(f"\n{kind} Characteristics:")
            print("Average age:", subset['age'].mean())
            print("Average cholesterol:", subset['chol'].mean())
            print("Average resting BP:", subset['trestbps'].mean())

    if len(misclassified) == 0:
        # With no errors the pie chart would be fed [0, 0] and the error
        # histograms would be empty, so there is nothing meaningful to draw.
        print("\nNo misclassified cases - skipping error plots.")
        return

    # Create error analysis visualization
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Age and cholesterol distributions of errors (top row)
    error_histograms = [
        ('age', 'Age', axes[0, 0]),
        ('chol', 'Cholesterol', axes[0, 1]),
    ]
    for column, display_name, ax in error_histograms:
        for kind, subset in error_groups:
            ax.hist(subset[column], alpha=0.7, label=f'{kind}s', bins=10)
        ax.set_xlabel(display_name)
        ax.set_ylabel('Count')
        ax.set_title(f'{display_name} Distribution of Misclassified Cases')
        ax.legend()

    # Error types pie chart
    error_counts = [len(false_positives), len(false_negatives)]
    error_labels = ['False Positives', 'False Negatives']
    axes[1, 0].pie(error_counts, labels=error_labels, autopct='%1.1f%%',
                   colors=['lightcoral', 'lightblue'])
    axes[1, 0].set_title('Distribution of Error Types')

    # Feature comparison between correct and incorrect predictions
    correct_predictions = features[is_correct]
    incorrect_predictions = features[~is_correct]

    feature_to_plot = 'age'  # You can change this to any feature
    axes[1, 1].hist(correct_predictions[feature_to_plot], alpha=0.7,
                    label='Correct Predictions', bins=15, density=True)
    axes[1, 1].hist(incorrect_predictions[feature_to_plot], alpha=0.7,
                    label='Incorrect Predictions', bins=15, density=True)
    axes[1, 1].set_xlabel(feature_to_plot.title())
    axes[1, 1].set_ylabel('Density')
    axes[1, 1].set_title(f'{feature_to_plot.title()} Distribution: Correct vs Incorrect')
    axes[1, 1].legend()

    plt.tight_layout()
    plt.show()

# Run error analysis for the best model.
# "Best" means the entry with the highest 'f1_score'; max() returns the first
# such entry on ties and raises ValueError if all_metrics is empty.
best_metrics = max(all_metrics, key=lambda x: x['f1_score'])
detailed_error_analysis(best_metrics)

## Creating the final report and saving it to model_evaluation_report.txt

In [None]:
def create_evaluation_report(all_metrics, model_name):
    """
    Create a comprehensive evaluation report, print it and save it to
    'model_evaluation_report.txt' in the current working directory.

    Args:
        all_metrics: Non-empty list of metric dictionaries; each one is read
            through its 'model_name', 'accuracy', 'precision', 'recall',
            'f1_score', 'y_true', 'y_pred' and 'y_pred_proba' keys.
        model_name: Name of the model the report is about. If no entry has
            this name, the entry with the highest 'f1_score' is reported.

    Raises:
        ValueError: If all_metrics is empty.

    Sections 1, 2, 4 and 5 describe the single reported model; section 3
    compares all models. Previously every figure was the maximum over all
    models, so the numbers under "Model:" could come from different models,
    and the AUC and specificity lines raised TypeError because a whole
    metrics dictionary was formatted with ':.3f'.
    """
    if not all_metrics:
        raise ValueError("all_metrics must contain at least one model")

    best_f1_model = max(all_metrics, key=lambda m: m['f1_score'])
    most_accurate_model = max(all_metrics, key=lambda m: m['accuracy'])
    most_sensitive_model = max(all_metrics, key=lambda m: m['recall'])

    # The model the report is about; fall back to the best-F1 entry when the
    # requested name is not present.
    selected = next(
        (m for m in all_metrics if m['model_name'] == model_name),
        best_f1_model,
    )

    # AUC and specificity are not stored in the metrics dictionaries, so
    # derive them from the stored labels, predictions and scores.
    fpr, tpr, _ = roc_curve(selected['y_true'], selected['y_pred_proba'])
    roc_auc = auc(fpr, tpr)
    # labels=[0, 1] pins the matrix to 2x2 (0 = no disease, 1 = disease)
    tn, fp, _, _ = confusion_matrix(
        selected['y_true'], selected['y_pred'], labels=[0, 1]
    ).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    recall_pct = selected['recall'] * 100
    precision_pct = selected['precision'] * 100
    deployment_verdict = 'suitable' if selected['f1_score'] > 0.8 else 'needs improvement'
    next_step = 'further tuning' if selected['f1_score'] < 0.85 else 'validation on external dataset'

    report = f"""
    HEART DISEASE PREDICTION MODEL - EVALUATION REPORT
    ===================================================
    
    Model: {selected['model_name']}
    Evaluation Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
    
    1. OVERALL PERFORMANCE
    ----------------------
    Accuracy: {selected['accuracy']:.3f}
    Precision: {selected['precision']:.3f}
    Recall: {selected['recall']:.3f}
    F1-Score: {selected['f1_score']:.3f}
    AUC-ROC: {roc_auc:.3f}
    
    2. CLINICAL PERFORMANCE
    -----------------------
    Sensitivity (Disease Detection Rate): {selected['recall']:.3f}
    Specificity (Healthy Identification Rate): {specificity:.3f}
    
    3. MODEL COMPARISON
    -------------------
    Best Overall Model: {best_f1_model['model_name']}
    Most Accurate Model: {most_accurate_model['model_name']}
    Most Sensitive Model: {most_sensitive_model['model_name']}
    
    4. CLINICAL INTERPRETATION
    --------------------------
    - The model can correctly identify {recall_pct:.1f}% of patients with heart disease
    - The model has a precision of {precision_pct:.1f}% when predicting heart disease
    - Out of every 100 positive predictions, {precision_pct:.1f} are actually correct
    
    5. RECOMMENDATIONS
    ------------------
    - Model is {deployment_verdict} for clinical deployment
    - Consider {next_step}
    """

    print(report)

    # Save report to file; explicit encoding keeps the output independent of
    # the platform default.
    with open('model_evaluation_report.txt', 'w', encoding='utf-8') as f:
        f.write(report)

    print("\n   Report saved as 'model_evaluation_report.txt'")

# Create final report.
# The report is labelled with the name of the entry that has the highest
# 'f1_score'; max() raises ValueError if all_metrics is empty.
best_model_name = max(all_metrics, key=lambda x: x['f1_score'])['model_name']
create_evaluation_report(all_metrics, best_model_name)

# Common evaluation issues and solutions
## Issue 1: "My model shows perfect performance"

In [None]:
# Check for data leakage: list the feature columns so the reader can verify by
# eye that the target column is not among them (nothing is checked automatically).
print("Check if you're leaking target information:")
print("X_train columns:", X_train.columns)
print("Make sure target variable is not in features")

# Check train-test split: compare the score on the training data with the
# score on the held-out data.
# NOTE(review): `model` is not assigned anywhere in the visible part of the
# notebook - confirm it refers to a fitted estimator before running this cell.
print("Training accuracy:", model.score(X_train, y_train))
print("Test accuracy:", model.score(X_test, y_test))
# If training >> test, you might be overfitting