# Anova Insurance – Health Classification Model
**Objective:** Predict whether an individual is *Healthy (0)* or *Unhealthy (1)* to assist in premium pricing decisions.

In [None]:
# ── 1. IMPORTS ──────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, ConfusionMatrixDisplay, accuracy_score
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Optional – XGBoost
# XGBoost is an optional dependency: record whether it imported so that the
# model-comparison cell can add it conditionally instead of failing.
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False

# Single seed reused as `random_state` for the split, CV folds and models
# so that reruns of the notebook are reproducible.
SEED = 42
# Global seaborn styling applied to every figure drawn below.
sns.set_theme(style='whitegrid', palette='muted')
print('Libraries loaded successfully.')

## 2. Load & Inspect Data

In [None]:
# Read the preprocessed healthcare CSV (resolved relative to the working
# directory), report its dimensions and preview the first rows.
csv_path = 'mDugQt7wQOKNNIAFjVku_Healthcare_Data_Preprocessed_FIXED.csv'
df = pd.read_csv(csv_path)
print(f'Shape: {df.shape}')
df.head()

In [None]:
# Print column dtypes and non-null counts for every column.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Count nulls per column and list only the columns that have at least one.
print('Missing values per column:')
missing = df.isna().sum()
has_missing = missing > 0
print(missing[has_missing])

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Target distribution
fig, ax = plt.subplots(figsize=(5, 4))
# value_counts() orders by frequency, not by class label. Sort by label so the
# counts line up with the fixed 'Healthy (0)' / 'Unhealthy (1)' tick labels and
# colours even when class 1 is the majority class.
target_counts = df['Target'].value_counts().sort_index()
ax.bar(['Healthy (0)', 'Unhealthy (1)'], target_counts.values, color=['steelblue', 'tomato'])
ax.set_title('Target Class Distribution')
ax.set_ylabel('Count')
# Annotate each bar with its raw count, slightly above the bar top.
for i, v in enumerate(target_counts.values):
    ax.text(i, v + 30, str(v), ha='center', fontweight='bold')
plt.tight_layout()
plt.show()
# Class balance in percent (index is the class label, 0 then 1).
print(target_counts / len(df) * 100)

In [None]:
# Numerical feature distributions
# One histogram per continuous feature, laid out on a 2 x 5 grid.
num_cols = [
    'Age', 'BMI', 'Blood_Pressure', 'Cholesterol', 'Glucose_Level',
    'Heart_Rate', 'Sleep_Hours', 'Exercise_Hours', 'Water_Intake', 'Stress_Level',
]

fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(20, 8))
axes = axes.ravel()
for panel, feature in zip(axes, num_cols):
    observed = df[feature].dropna()  # hist() cannot bin NaN values
    panel.hist(observed, bins=30, color='steelblue', edgecolor='white')
    panel.set_title(feature)
plt.suptitle('Numerical Feature Distributions', fontsize=14, y=1.01)
plt.tight_layout()
plt.show()

In [None]:
# Box plots by Target
# Compare each numeric feature's spread between the two target classes.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(20, 8))
axes = axes.ravel()
for panel, feature in zip(axes, num_cols):
    df.boxplot(column=feature, by='Target', ax=panel)
    # Replace the auto-generated per-axis title/label with plain ones.
    panel.set_title(feature)
    panel.set_xlabel('Target')
plt.suptitle('Numerical Features by Target Class', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Categorical feature counts
# Grouped bars: for every category level, how many rows fall in each target class.
cat_cols = ['Smoking', 'Alcohol', 'Diet', 'MentalHealth', 'PhysicalActivity']
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for panel, feature in zip(axes, cat_cols):
    level_by_target = df.groupby([feature, 'Target']).size().unstack()
    level_by_target.plot(kind='bar', ax=panel, color=['steelblue', 'tomato'])
    panel.set_title(feature)
    panel.set_xlabel('')
    panel.tick_params(axis='x', rotation=0)
plt.suptitle('Categorical Features vs Target', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap
# The one-hot indicator columns are cast to int on a copy so they take part in
# the correlation; `df` itself is left untouched here.
bool_cols = ['Diet_Type_Vegan', 'Diet_Type_Vegetarian',
             'Blood_Group_AB', 'Blood_Group_B', 'Blood_Group_O']
corr_df = df.astype({flag: int for flag in bool_cols})

fig, ax = plt.subplots(figsize=(14, 10))
corr_matrix = corr_df.corr()
# Hide the upper triangle (diagonal included): it mirrors the lower one.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()

## 4. Data Preprocessing

In [None]:
# ── 4.1 Negative ages are treated as sign errors: report them, then flip ──
print(f'Negative age values: {(df["Age"] < 0).sum()}')
df['Age'] = df['Age'].abs()

# ── 4.2 Cast the indicator columns to int ──
df = df.astype({flag: int for flag in bool_cols})

# ── 4.3 Separate the label from the predictors ──
y = df['Target']
X = df.drop(columns=['Target'])
print(f'Features: {X.shape[1]}, Samples: {X.shape[0]}')

In [None]:
# ── 4.4 Train / Test split (80/20, stratified) ──
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=SEED, stratify=y
)
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

In [None]:
# ── 4.5 Preprocessing: median-impute, then standardise ──
# Both transformers learn their statistics from the training split only and
# are then applied unchanged to the test split.
imputer = SimpleImputer(strategy='median').fit(X_train)
X_train_imp = imputer.transform(X_train)
X_test_imp = imputer.transform(X_test)

scaler = StandardScaler().fit(X_train_imp)
X_train_sc = scaler.transform(X_train_imp)
X_test_sc = scaler.transform(X_test_imp)

print('Preprocessing complete.')

## 5. Model Training & Comparison

In [None]:
# Candidate classifiers, all seeded where the estimator supports it.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=SEED),
    'Decision Tree':       DecisionTreeClassifier(random_state=SEED),
    'Random Forest':       RandomForestClassifier(n_estimators=200, random_state=SEED),
    'Gradient Boosting':   GradientBoostingClassifier(n_estimators=200, random_state=SEED),
    'KNN':                 KNeighborsClassifier(n_neighbors=7),
    # probability=True is required for predict_proba (used for AUC below).
    'SVM':                 SVC(probability=True, random_state=SEED),
}
if HAS_XGB:
    models['XGBoost'] = XGBClassifier(n_estimators=200, random_state=SEED,
                                       use_label_encoder=False, eval_metric='logloss')

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
# name -> metrics, fitted model and test-set predictions.
results = {}

for name, model in models.items():
    # Cross-validate on the RAW training data with imputation and scaling
    # inside the pipeline, so each fold fits its preprocessing on that fold's
    # training part only. Scoring on X_train_sc would leak validation-fold
    # statistics (medians, means, variances) into every fold.
    # cross_val_score clones the pipeline, so `model` itself stays unfitted here.
    cv_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('model', model),
    ])
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv, scoring='roc_auc')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Final fit on the full (already preprocessed) training split; the bare
    # fitted model is stored so later cells can pair it with the global
    # `imputer` / `scaler`.
    model.fit(X_train_sc, y_train)
    y_pred = model.predict(X_test_sc)
    y_prob = model.predict_proba(X_test_sc)[:, 1]  # P(class 1)

    # Compute each test metric once and reuse it for storage and printing.
    test_acc = accuracy_score(y_test, y_pred)
    test_auc = roc_auc_score(y_test, y_prob)

    results[name] = {
        'CV AUC (mean)': cv_mean,
        'CV AUC (std)':  cv_std,
        'Test Accuracy': test_acc,
        'Test AUC':      test_auc,
        'model': model,
        'y_pred': y_pred,
        'y_prob': y_prob,
    }
    print(f'{name:<25} CV AUC: {cv_mean:.4f} ± {cv_std:.4f} | '
          f'Test Acc: {test_acc:.4f} | '
          f'Test AUC: {test_auc:.4f}')

In [None]:
# Summary DataFrame
results_df = pd.DataFrame([
    {'Model': k, 'CV AUC (mean)': v['CV AUC (mean)'], 'CV AUC (std)': v['CV AUC (std)'],
     'Test Accuracy': v['Test Accuracy'], 'Test AUC': v['Test AUC']}
    for k, v in results.items()
]).sort_values('Test AUC', ascending=False).reset_index(drop=True)
results_df

In [None]:
# Visual comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

results_df.plot(x='Model', y='Test Accuracy', kind='bar', ax=axes[0],
                color='steelblue', legend=False)
axes[0].set_title('Test Accuracy by Model')
axes[0].set_ylim(0.5, 1)
axes[0].tick_params(axis='x', rotation=30)

results_df.plot(x='Model', y='Test AUC', kind='bar', ax=axes[1],
                color='tomato', legend=False)
axes[1].set_title('Test ROC-AUC by Model')
axes[1].set_ylim(0.5, 1)
axes[1].tick_params(axis='x', rotation=30)

plt.tight_layout()
plt.show()

## 6. Best Model – Detailed Evaluation

In [None]:
# Choose the winner by cross-validated AUC rather than by test AUC: picking
# the model on the held-out set would make the test metrics reported below
# optimistically biased.
best_name = results_df.loc[results_df['CV AUC (mean)'].idxmax(), 'Model']
best      = results[best_name]
print(f'Best model: {best_name}')

# Classification Report
# Per-class precision / recall / F1 on the test split.
print('\nClassification Report:')
print(classification_report(y_test, best['y_pred'],
                             target_names=['Healthy', 'Unhealthy']))

In [None]:
# Confusion Matrix
fig, ax = plt.subplots(figsize=(5, 4))
ConfusionMatrixDisplay.from_predictions(
    y_test, best['y_pred'],
    display_labels=['Healthy', 'Unhealthy'],
    cmap='Blues', ax=ax
)
ax.set_title(f'Confusion Matrix – {best_name}')
plt.tight_layout()
plt.show()

In [None]:
# ROC Curves for all models
fig, ax = plt.subplots(figsize=(8, 6))
for name, res in results.items():
    fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
    ax.plot(fpr, tpr, label=f"{name} (AUC={res['Test AUC']:.3f})")
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves – All Models')
ax.legend(loc='lower right', fontsize=9)
plt.tight_layout()
plt.show()

## 7. Feature Importance

In [None]:
feature_names = X.columns.tolist()

# Importance source, in order of preference:
#   1. the selected model's impurity-based importances,
#   2. absolute coefficients of a linear model,
#   3. a freshly trained Random Forest when the model exposes neither.
model_obj = best['model']

if hasattr(model_obj, 'feature_importances_'):
    imp_type = 'Feature Importance'
    importances = model_obj.feature_importances_
elif hasattr(model_obj, 'coef_'):
    imp_type = '|Coefficient|'
    importances = np.abs(model_obj.coef_[0])
else:
    # Use Random Forest as fallback
    imp_type = 'RF Feature Importance (fallback)'
    rf_fallback = RandomForestClassifier(n_estimators=100, random_state=SEED)
    rf_fallback.fit(X_train_sc, y_train)
    importances = rf_fallback.feature_importances_

# Keep the 20 largest scores, ascending so the biggest bar is drawn on top.
imp_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
imp_df = imp_df.sort_values('Importance', ascending=True)
imp_df = imp_df.tail(20)

fig, ax = plt.subplots(figsize=(8, 8))
ax.barh(imp_df['Feature'], imp_df['Importance'], color='steelblue')
ax.set_xlabel(imp_type)
ax.set_title(f'{imp_type} – {best_name}')
plt.tight_layout()
plt.show()

## 8. Prediction Function

In [None]:
def predict_health_status(input_dict: dict) -> dict:
    """
    Predict healthy/unhealthy for a new individual.

    Relies on the notebook-level objects ``feature_names``, ``imputer``,
    ``scaler`` and ``model_obj`` having been created by the earlier cells.

    Parameters
    ----------
    input_dict : dict
        Feature values keyed by column name. Keys are aligned to
        ``feature_names``: a feature missing from the dict becomes NaN and is
        filled by the fitted ``imputer``; keys that are not in
        ``feature_names`` are ignored.

    Returns
    -------
    dict
        ``{'prediction': 'Healthy' | 'Unhealthy',
        'unhealthy_probability': float}`` where the probability is that of
        class 1, rounded to 4 decimals, as a builtin ``float``.
    """
    # Building the frame with explicit columns enforces the training column order.
    sample = pd.DataFrame([input_dict], columns=feature_names)
    sample_imp = imputer.transform(sample)
    sample_sc  = scaler.transform(sample_imp)
    pred  = model_obj.predict(sample_sc)[0]
    # Cast to a builtin float: predict_proba yields NumPy scalars (float32 for
    # some estimators), which print as `np.float32(...)` under NumPy 2 and are
    # not JSON-serialisable.
    prob  = float(model_obj.predict_proba(sample_sc)[0][1])
    label = 'Unhealthy' if pred == 1 else 'Healthy'
    return {'prediction': label, 'unhealthy_probability': round(prob, 4)}


# Example usage
example = {
    'Age': 45, 'BMI': 28.5, 'Blood_Pressure': 130, 'Cholesterol': 210,
    'Glucose_Level': 105, 'Heart_Rate': 80, 'Sleep_Hours': 6, 'Exercise_Hours': 0.5,
    'Water_Intake': 1.5, 'Stress_Level': 7, 'Smoking': 2, 'Alcohol': 1, 'Diet': 0,
    'MentalHealth': 1, 'PhysicalActivity': 0, 'MedicalHistory': 1, 'Allergies': 0,
    'Diet_Type_Vegan': 0, 'Diet_Type_Vegetarian': 0,
    'Blood_Group_AB': 0, 'Blood_Group_B': 1, 'Blood_Group_O': 0
}
print(predict_health_status(example))

## 9. Summary

In [None]:
# Final text report: the full comparison table followed by the selected
# model's headline test metrics, framed by 60-character rules.
banner = '=' * 60
print(banner)
print('MODEL PERFORMANCE SUMMARY')
print(banner)
print(results_df.to_string(index=False))
print()
print(f'Best Model  : {best_name}')
print(f'Test AUC    : {best["Test AUC"]:.4f}')
print(f'Test Accuracy: {best["Test Accuracy"]:.4f}')
print(banner)