# Disease Prediction from Medical Data
## CodeAlpha Machine Learning Internship - Task 4

This notebook demonstrates disease prediction using multiple machine learning algorithms.

## 1. Import Libraries

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report,
                             roc_curve, auc, roc_auc_score)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Silence library warnings so the cell outputs stay readable.
# NOTE(review): this blanket filter also hides convergence and deprecation
# warnings; consider narrowing it while debugging.
warnings.filterwarnings('ignore')

# Plot defaults used by every figure that does not set its own size.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

print("✓ Libraries imported successfully!")

## 2. Load Dataset

In [None]:
# Read the CSV from the working directory into a DataFrame.
data = pd.read_csv('heart_disease.csv')

# Report the (rows, columns) shape, then preview the top of the table;
# head() stays the last expression so the notebook renders it.
shape = data.shape
print(f"Dataset Shape: {shape}")
print("\nFirst few rows:")
data.head()

In [None]:
# Dataset info: print each column's name, non-null count and dtype,
# plus the frame's memory usage.
data.info()

In [None]:
# Statistical summary: descriptive statistics per column (by default the
# numeric ones); as the cell's last expression it is rendered as a table.
data.describe()

## 3. Exploratory Data Analysis

In [None]:
# Count missing cells per column and list only the columns that have any.
print("Missing Values:")
missing = data.isnull().sum()
columns_with_gaps = missing[missing > 0]
if columns_with_gaps.empty:
    print("No missing values found!")
else:
    print(columns_with_gaps)

In [None]:
# Class balance of the 'target' column: printed counts plus a bar chart.
print("Target Variable Distribution:")
target_counts = data['target'].value_counts()
print(target_counts)

plt.figure(figsize=(8, 6))
# Bars follow value_counts() order (most frequent class first).
target_counts.plot(kind='bar', color=['#3498db', '#e74c3c'])
plt.title('Disease Distribution', fontsize=16, fontweight='bold')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Disease Presence', fontsize=12)
plt.xticks(rotation=0)  # keep the class labels horizontal
plt.show()

In [None]:
# Pairwise correlation of every column (target included), drawn as an
# annotated heatmap with the colour scale centred on zero.
fig, ax = plt.subplots(figsize=(14, 10))
correlation = data.corr()
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'linewidths': 0.5,
}
sns.heatmap(correlation, ax=ax, **heatmap_options)
ax.set_title('Feature Correlation Matrix', fontsize=16, fontweight='bold')
plt.show()

## 4. Data Preprocessing

In [None]:
# Separate features and target.
# The label is selected by name, matching the EDA cell that reads
# data['target'], so the split no longer depends on 'target' being the
# last column of the CSV.
X = data.drop(columns='target')
y = data['target']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the class proportions, and seeded so it is repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

In [None]:
# Scale features.
# The scaler is fitted on the training split only; the test split is
# transformed with those same statistics so no test information reaches
# the fitted scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Data preprocessing completed!")

## 5. Model Training

In [None]:
# Define models. Seeds are fixed for repeatability; SVC is created with
# probability=True because the evaluation cell calls predict_proba on it.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, eval_metric='logloss')
}

# Train models
for name, model in models.items():
    print(f"\nTraining {name}...")
    # Final fit on the full (scaled) training split; this is the estimator
    # the later cells evaluate.
    model.fit(X_train_scaled, y_train)

    # Cross-validation.
    # Scaling happens inside the pipeline, so each fold fits its own scaler
    # on that fold's training part only. Cross-validating X_train_scaled
    # would leak validation-fold statistics into the scaler.
    # cross_val_score fits clones, so `model` keeps the fit made above.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
    print(f"CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

print("\n✓ All models trained!")

## 6. Model Evaluation

In [None]:
# Score every fitted model on the held-out test split and keep both the
# metrics and the raw predictions in `results` for the plotting cells.
results = {}
divider = '=' * 40

for name, model in models.items():
    print(f"\n{divider}")
    print(name)
    print(divider)

    # Hard labels, plus the probability of class index 1 for ROC-AUC.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
    }
    results[name] = {**scores, 'y_pred': y_pred, 'y_pred_proba': y_pred_proba}

    print(f"\nAccuracy:  {scores['accuracy']:.4f}")
    print(f"Precision: {scores['precision']:.4f}")
    print(f"Recall:    {scores['recall']:.4f}")
    print(f"F1-Score:  {scores['f1_score']:.4f}")
    print(f"ROC-AUC:   {scores['roc_auc']:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

## 7. Visualizations

In [None]:
# One row per model, one column per headline metric, in the order the
# models were evaluated.
metric_columns = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1_score',
    'ROC-AUC': 'roc_auc',
}
comparison_rows = [
    {'Model': model_name,
     **{column: scores[key] for column, key in metric_columns.items()}}
    for model_name, scores in results.items()
]
# Passing the columns explicitly keeps the table layout even with no rows.
metrics_df = pd.DataFrame(comparison_rows, columns=['Model', *metric_columns])

metrics_df

In [None]:
# Overlay the test-set ROC curve of every model on one set of axes.
fig, ax = plt.subplots(figsize=(12, 8))

for name, scores in results.items():
    fpr, tpr, _ = roc_curve(y_test, scores['y_pred_proba'])
    roc_auc = scores['roc_auc']
    ax.plot(fpr, tpr, linewidth=2, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line: a classifier that guesses at random.
ax.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate', fontsize=13)
ax.set_ylabel('True Positive Rate', fontsize=13)
ax.set_title('ROC Curves - All Models', fontsize=16, fontweight='bold')
ax.legend(loc="lower right", fontsize=11)
ax.grid(alpha=0.3)
plt.show()

In [None]:
# One confusion-matrix heatmap per model on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle('Confusion Matrices', fontsize=18, fontweight='bold')
panels = axes.ravel()  # row-major: same order as axes[idx // 2, idx % 2]

for idx, (name, result) in enumerate(results.items()):
    ax = panels[idx]
    cm = confusion_matrix(y_test, result['y_pred'])

    sns.heatmap(
        cm,
        ax=ax,
        annot=True,
        fmt='d',
        cmap='Blues',
        annot_kws={'size': 14},
        cbar_kws={'label': 'Count'},
    )
    ax.set_title(name, fontsize=14, fontweight='bold')
    ax.set_xlabel('Predicted', fontsize=12)
    ax.set_ylabel('Actual', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Side-by-side bar charts of the ten highest feature_importances_ values
# for the two tree-based models.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
fig.suptitle('Feature Importance Analysis', fontsize=16, fontweight='bold')

panel_specs = [
    ('Random Forest', '#2ecc71'),
    ('XGBoost', '#e74c3c'),
]
ranked_importance = {}

for ax, (model_name, bar_color) in zip(axes, panel_specs):
    # Pair each column of X with the model's importance, largest first.
    ranking = pd.DataFrame({
        'feature': X.columns,
        'importance': models[model_name].feature_importances_
    }).sort_values('importance', ascending=False)
    ranked_importance[model_name] = ranking

    top_ten = ranking.head(10)
    ax.barh(top_ten['feature'], top_ten['importance'], color=bar_color)
    ax.set_xlabel('Importance', fontsize=12)
    ax.set_title(f'{model_name} - Top 10 Features', fontsize=13, fontweight='bold')
    ax.invert_yaxis()  # most important feature at the top

# Keep the full ranked tables available under their original names.
rf_importance = ranked_importance['Random Forest']
xgb_importance = ranked_importance['XGBoost']

plt.tight_layout()
plt.show()

## 8. Best Model Selection

In [None]:
# Find best model: the one with the highest accuracy stored in `results`;
# on a tie the model defined first wins.
# NOTE(review): `results` holds test-set scores, so picking the winner here
# makes the numbers printed below optimistic. Consider selecting on the
# cross-validation scores instead.
best_model_name = max(results, key=lambda model_name: results[model_name]['accuracy'])
best_model = models[best_model_name]
best_scores = results[best_model_name]

print(f"🏆 Best Model: {best_model_name}")
print(f"   Accuracy:  {best_scores['accuracy']:.4f}")
print(f"   Precision: {best_scores['precision']:.4f}")
print(f"   Recall:    {best_scores['recall']:.4f}")
print(f"   F1-Score:  {best_scores['f1_score']:.4f}")
print(f"   ROC-AUC:   {best_scores['roc_auc']:.4f}")

## 9. Making Predictions

In [None]:
# Example prediction for one hand-written patient record.
# Values are listed in the same order as the columns of X.
sample_patient = [63, 1, 3, 145, 233, 1, 0, 150, 0, 2.3, 0, 0, 1]

# Attach the training column names: the scaler was fitted on a DataFrame,
# and a bare list would drop the names (and fail less clearly if the number
# of values does not match the number of features).
sample_frame = pd.DataFrame([sample_patient], columns=X.columns)
sample_scaled = scaler.transform(sample_frame)
prediction = best_model.predict(sample_scaled)[0]
probability = best_model.predict_proba(sample_scaled)[0]

# predict_proba columns follow best_model.classes_, so look up the position
# of the predicted label instead of using the label itself as an index.
predicted_index = list(best_model.classes_).index(prediction)

print("\nNEW PATIENT PREDICTION")
print("=" * 40)
print(f"Using model: {best_model_name}")
print(f"\nPrediction: {'Disease Detected' if prediction == 1 else 'No Disease'}")
print(f"Confidence: {probability[predicted_index] * 100:.2f}%")
print(f"Probability of Disease: {probability[1] * 100:.2f}%")
print(f"Probability of No Disease: {probability[0] * 100:.2f}%")

## 10. Conclusion

This project successfully demonstrated:
- Disease prediction using multiple ML algorithms
- Comprehensive model evaluation and comparison
- Feature importance analysis
- An example single-patient prediction workflow using the selected model

**Next Steps:**
- Hyperparameter tuning
- Deep learning models
- Deployment as web API