In [None]:
# 1. Imports
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [None]:
# 2. Make sure the folder that receives every saved figure/CSV/model exists.
# exist_ok=True keeps the cell safe to re-run.
OUTPUT_DIR = "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# 3. Load dataset (UCI Heart Disease — processed Cleveland dataset)
# Option A: If you have the CSV locally, change the path. Option B: Use this common cleaned CSV URL.
# If running offline, download "heart.csv" (the common cleaned UCI version) to working dir.
# Many cleaned copies are called "heart.csv" with columns: age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target

csv_path = "heart.csv"  # replace if necessary

# Fail right here when the file is missing. Probing for an existing `df`
# variable instead would let a stale frame from an earlier execution slip
# through, and the notebook would silently analyse old data.
if not os.path.exists(csv_path):
    raise FileNotFoundError(
        f"File {csv_path} not found in working directory. "
        f"Please download the UCI/cleaned heart dataset and place it as '{csv_path}' "
        "(or point csv_path at your copy), then re-run this cell."
    )

df = pd.read_csv(csv_path)
print("Loaded dataset:", csv_path)

In [None]:
# 4. Quick overview
print("\n--- Data Info ---\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df.info()
print("\n--- Head ---\n")
display(df.head())
print("\nShape:", df.shape)
print("\nMissing values per column:\n", df.isnull().sum())

In [None]:
# 5. Basic stats (numeric)
# describe() only summarises numeric columns of a mixed-dtype frame, so the
# extra moments are restricted to numeric columns too. Without
# numeric_only=True, pandas 2.x raises on any non-numeric column instead of
# silently dropping it.
desc = df.describe().T
desc['skew'] = df.skew(numeric_only=True)
desc['kurtosis'] = df.kurtosis(numeric_only=True)
desc['median'] = df.median(numeric_only=True)
display(desc)

# Save summary to CSV for report
desc.to_csv("outputs/descriptive_stats.csv")

In [None]:
# 6. Target distribution: class counts, saved to disk for the report.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=df, x='target', ax=ax)
ax.set_title('Target Distribution (0 = No disease, 1 = Disease)')
ax.set_xlabel('target')
ax.set_ylabel('count')
fig.tight_layout()
fig.savefig("outputs/target_distribution.png")
plt.close(fig)  # the figure is only needed on disk

In [None]:
# 7. Numeric distributions: one histogram (with KDE overlay) per numeric
# feature; the label column itself is excluded.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
num_cols.remove('target')
for col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 3.5))
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution: {col}')
    fig.tight_layout()
    fig.savefig(f"outputs/dist_{col}.png")
    plt.close(fig)  # close each figure so the loop does not accumulate them

In [None]:
# 8. Boxplots for outlier detection on a hand-picked set of numeric features.
# Columns that are absent from the loaded frame are skipped.
boxplot_candidates = ['chol','trestbps','oldpeak','thalach']
for col in (name for name in boxplot_candidates if name in df.columns):
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot: {col}')
    fig.tight_layout()
    fig.savefig(f"outputs/box_{col}.png")
    plt.close(fig)

In [None]:
# 9. Correlation heatmap
# Correlation is only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas 2.x (where DataFrame.corr() raises on
# non-numeric columns) as well as on older releases.
numeric_df = df.select_dtypes(include=[np.number])
plt.figure(figsize=(10,8))
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.savefig("outputs/correlation_matrix.png")
plt.close()

In [None]:
# 10. Scatter plots of age against thalach and against chol, coloured by target.
# Each entry: (y-axis column, plot title, output path).
scatter_specs = [
    ('thalach', 'Age vs Max Heart Rate (thalach)', "outputs/age_v_thalach.png"),
    ('chol', 'Age vs Serum Cholesterol (chol)', "outputs/age_v_chol.png"),
]
for y_col, plot_title, out_path in scatter_specs:
    # Skip a plot when either of its two columns is missing from the frame.
    if 'age' not in df.columns or y_col not in df.columns:
        continue
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=df, x='age', y=y_col, hue='target', alpha=0.8, ax=ax)
    ax.set_title(plot_title)
    fig.tight_layout()
    fig.savefig(out_path)
    plt.close(fig)

In [None]:
# 11. Preprocessing: define feature sets
# The numeric features are listed by hand; every remaining column except the
# label is treated as categorical.
num_features = ['age','trestbps','chol','thalach','oldpeak']
non_categorical = set(num_features) | {'target'}
cat_features = [column for column in df.columns if column not in non_categorical]

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

In [None]:
# 12. Build preprocessing pipelines
# Numeric columns: fill gaps with the median, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the mode, then one-hot encode to a dense
# array. scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed
# the old name, so try the new keyword first and fall back for older releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [None]:
# 13. Train-test split: hold out 20% of the rows, stratified on the label so
# both splits keep the same class balance; the seed makes the split repeatable.
target_column = 'target'
y = df[target_column]
X = df.drop(target_column, axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

In [None]:
# 14. Model pipelines: each classifier sits behind the same preprocessing step,
# so raw feature frames can be passed straight to fit/predict.
logistic_clf = LogisticRegression(solver='liblinear', max_iter=1000)
tree_clf = DecisionTreeClassifier(random_state=42)

lr_pipeline = Pipeline(steps=[('preproc', preprocessor), ('clf', logistic_clf)])
dt_pipeline = Pipeline(steps=[('preproc', preprocessor), ('clf', tree_clf)])

In [None]:
# 15. Train models
# Both pipelines are fitted on the training split only; the test split stays
# untouched until evaluation. Both were built around the same `preprocessor`
# object and receive the same training data here.
print("\nTraining Logistic Regression...")
lr_pipeline.fit(X_train, y_train)
print("Training Decision Tree...")
# This call is the cell's last expression, so Jupyter renders its return value.
dt_pipeline.fit(X_train, y_train)

In [None]:
# 16. Predict and evaluate helper
def evaluate_model(pipeline, X_test, y_test, model_name="Model"):
    """Score a fitted pipeline on held-out data and save its confusion matrix.

    Prints accuracy, precision, recall, F1 and the full classification report,
    then writes a confusion-matrix heatmap to
    ``outputs/cm_<model_name with spaces replaced by underscores>.png``.

    Parameters
    ----------
    pipeline : fitted estimator exposing ``predict``.
    X_test : features to predict on.
    y_test : true labels aligned with ``X_test``.
    model_name : label used in the printed heading, plot title and file name.

    Returns
    -------
    dict with keys 'accuracy', 'precision', 'recall', 'f1' and 'confusion'
    (the confusion matrix).
    """
    predictions = pipeline.predict(X_test)

    # All scores are computed up front so nothing is printed if one of them fails.
    scores = {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions, zero_division=0),
        'recall': recall_score(y_test, predictions, zero_division=0),
        'f1': f1_score(y_test, predictions, zero_division=0),
    }
    matrix = confusion_matrix(y_test, predictions)

    print(f"\n--- {model_name} Evaluation ---")
    printed_labels = (
        ("Accuracy", 'accuracy'),
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1-score", 'f1'),
    )
    for label, key in printed_labels:
        print(f"{label}:", round(scores[key], 4))
    report = classification_report(y_test, predictions, zero_division=0)
    print("\nClassification Report:\n", report)

    # Save confusion matrix plot
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix: {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    fig.tight_layout()
    fig.savefig(f"outputs/cm_{model_name.replace(' ','_')}.png")
    plt.close(fig)

    return {**scores, 'confusion': matrix}

In [None]:
# 17. Evaluate both baseline models on the held-out test split.
lr_metrics = evaluate_model(
    pipeline=lr_pipeline,
    X_test=X_test,
    y_test=y_test,
    model_name="Logistic Regression",
)
dt_metrics = evaluate_model(
    pipeline=dt_pipeline,
    X_test=X_test,
    y_test=y_test,
    model_name="Decision Tree",
)

In [None]:
# 18. Optional: 5-fold cross-validated accuracy of the logistic-regression
# pipeline, computed over the full X / y.
cv_scores = cross_val_score(estimator=lr_pipeline, X=X, y=y, scoring='accuracy', cv=5)
mean_cv_accuracy = cv_scores.mean()
print("\nLogistic Regression CV accuracy (5-fold):", cv_scores, "Mean:", mean_cv_accuracy)

In [None]:
# 19. Optional: hyperparameter tuning for the decision tree (grid search).
# The `clf__` prefix routes each parameter to the pipeline's 'clf' step.
param_grid = dict(
    clf__max_depth=[2, 3, 4, 5, 6, 7, None],
    clf__min_samples_split=[2, 4, 6, 8],
)
grid = GridSearchCV(estimator=dt_pipeline, param_grid=param_grid, scoring='accuracy', cv=5)
grid.fit(X_train, y_train)

best_dt = grid.best_estimator_
print("\nBest Decision Tree params:", grid.best_params_)
best_dt_metrics = evaluate_model(best_dt, X_test, y_test, "Decision Tree (Tuned)")

In [None]:
# 20. Visualize Decision Tree (simple)
# plot_tree needs the feature names as they look AFTER preprocessing. They are
# read from the preprocessor that lives inside best_dt: GridSearchCV refits a
# clone of the pipeline, so that fitted step (not the module-level
# `preprocessor` object) is the one whose columns the tuned tree was trained on.
fitted_preproc = best_dt.named_steps['preproc']
fitted_onehot = fitted_preproc.named_transformers_['cat'].named_steps['onehot']
if hasattr(fitted_onehot, 'get_feature_names_out'):
    cat_names = fitted_onehot.get_feature_names_out(cat_features)
else:
    # Older scikit-learn releases only provide get_feature_names.
    cat_names = fitted_onehot.get_feature_names(cat_features)
# Same order as the ColumnTransformer output: numeric block first, then one-hot columns.
feature_names = num_features + list(cat_names)

tree_fig = plt.figure(figsize=(16,10))
try:
    plot_tree(best_dt.named_steps['clf'], feature_names=feature_names, filled=True, max_depth=3, fontsize=8)
    plt.title("Decision Tree (truncated depth=3)")
    plt.tight_layout()
    plt.savefig("outputs/decision_tree.png")
except Exception as e:
    # Best effort: a failed plot should not stop the rest of the notebook.
    print("Decision tree plotting failed:", e)
finally:
    # Always release the figure, even when plotting raised.
    plt.close(tree_fig)


In [None]:
# 21. Feature importance (from decision tree): bar chart of the 20 largest
# importances of the tuned tree. Failures are reported, not raised.
try:
    tuned_tree = best_dt.named_steps['clf']
    importances = tuned_tree.feature_importances_
    ranked_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    fi = ranked_importances.head(20)

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(x=fi.values, y=fi.index, ax=ax)
    ax.set_title("Feature Importances (Decision Tree)")
    fig.tight_layout()
    fig.savefig("outputs/feature_importances.png")
    plt.close(fig)
except Exception as e:
    print("Feature importance error:", e)

In [None]:
# 22. Save trained models if you like (joblib)
# Each file holds a complete fitted pipeline (preprocessing + classifier), so
# it can be reloaded with joblib.load and given raw feature frames directly.
joblib.dump(lr_pipeline, "outputs/logistic_pipeline.joblib")
joblib.dump(best_dt, "outputs/decision_tree_tuned_pipeline.joblib")

In [None]:
# 23. Save evaluation summary: one row per model with the four scalar metrics
# rounded to 4 decimals (the confusion matrix is left out of the table).
summary_metric_keys = ['accuracy','precision','recall','f1']
labelled_results = [
    ('Logistic Regression', lr_metrics),
    ('Decision Tree', dt_metrics),
    ('Decision Tree (Tuned)', best_dt_metrics),
]

summary_rows = []
for model_label, metrics in labelled_results:
    row = {'model': model_label}
    row.update((k, round(v, 4)) for k, v in metrics.items() if k in summary_metric_keys)
    summary_rows.append(row)

eval_summary = pd.DataFrame(summary_rows)
eval_summary.to_csv("outputs/evaluation_summary.csv", index=False)
display(eval_summary)

print("\nAll outputs saved in the 'outputs/' folder. Insert the PNGs into your Word file for the report.")