In [1]:
# 1. Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.impute import SimpleImputer

In [2]:
# 2. Create outputs directory
os.makedirs("outputs", exist_ok=True)

In [3]:
# 3. Load dataset (UCI Heart Disease — processed Cleveland dataset)

csv_path = "heart.csv"
if not os.path.exists(csv_path):
    print("File heart.csv not found in working directory.")
    print("Please download the UCI/cleaned heart dataset and place as 'heart.csv'.")
else:
    df = pd.read_csv(csv_path)
    print("Loaded dataset:", csv_path)

try:
    df
except NameError:
    raise SystemExit("Load the dataset and restart.")

Loaded dataset: heart.csv


In [4]:
# 4. Quick overview
print("\n--- Data Info ---\n")
print(df.info())
print("\n--- Head ---\n")
display(df.head())
print("\nShape:", df.shape)
print("\nMissing values per column:\n", df.isnull().sum())


--- Data Info ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None

--- Head ---



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0



Shape: (1025, 14)

Missing values per column:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [5]:
# 5. Basic stats (numeric)
desc = df.describe().T
desc['skew'] = df.skew()
desc['kurtosis'] = df.kurtosis()
desc['median'] = df.median()
display(desc)

# Save summary to CSV for report
desc.to_csv("outputs/descriptive_stats.csv")

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,skew,kurtosis,median
age,1025.0,54.434146,9.07229,29.0,48.0,56.0,61.0,77.0,-0.248866,-0.525618,56.0
sex,1025.0,0.69561,0.460373,0.0,0.0,1.0,1.0,1.0,-0.851449,-1.277531,1.0
cp,1025.0,0.942439,1.029641,0.0,0.0,1.0,2.0,3.0,0.529455,-1.1495,1.0
trestbps,1025.0,131.611707,17.516718,94.0,120.0,130.0,140.0,200.0,0.739768,0.991221,130.0
chol,1025.0,246.0,51.59251,126.0,211.0,240.0,275.0,564.0,1.074073,3.996803,240.0
fbs,1025.0,0.149268,0.356527,0.0,0.0,0.0,0.0,1.0,1.971339,1.889859,0.0
restecg,1025.0,0.529756,0.527878,0.0,0.0,1.0,1.0,2.0,0.18044,-1.309614,1.0
thalach,1025.0,149.114146,23.005724,71.0,132.0,152.0,166.0,202.0,-0.513777,-0.088822,152.0
exang,1025.0,0.336585,0.472772,0.0,0.0,0.0,1.0,1.0,0.692655,-1.523205,0.0
oldpeak,1025.0,1.071512,1.175053,0.0,0.0,0.8,1.8,6.2,1.210899,1.314471,0.8


In [6]:
# 6. Target distribution
plt.figure(figsize=(5,4))
sns.countplot(x='target', data=df)
plt.title('Target Distribution (0 = No disease, 1 = Disease)')
plt.xlabel('target')
plt.ylabel('count')
plt.tight_layout()
plt.savefig("outputs/target_distribution.png")
plt.close()

In [7]:
# 7. Numeric distributions (histograms)
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove('target')
for col in num_cols:
    plt.figure(figsize=(6,3.5))
    sns.histplot(df[col], kde=True)
    plt.title(f'Distribution: {col}')
    plt.tight_layout()
    plt.savefig(f"outputs/dist_{col}.png")
    plt.close()

In [8]:
# 8. Boxplots for outlier detection (select important numeric features)
for col in ['chol','trestbps','oldpeak','thalach']:
    if col in df.columns:
        plt.figure(figsize=(6,3))
        sns.boxplot(x=df[col])
        plt.title(f'Boxplot: {col}')
        plt.tight_layout()
        plt.savefig(f"outputs/box_{col}.png")
        plt.close()

In [9]:
# 9. Correlation heatmap
plt.figure(figsize=(10,8))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.savefig("outputs/correlation_matrix.png")
plt.close()

In [10]:
# 10. Scatter plots: age vs thalach and chol vs age
if 'age' in df.columns and 'thalach' in df.columns:
    plt.figure(figsize=(6,4))
    sns.scatterplot(x='age', y='thalach', hue='target', data=df, alpha=0.8)
    plt.title('Age vs Max Heart Rate (thalach)')
    plt.tight_layout()
    plt.savefig("outputs/age_v_thalach.png")
    plt.close()

if 'age' in df.columns and 'chol' in df.columns:
    plt.figure(figsize=(6,4))
    sns.scatterplot(x='age', y='chol', hue='target', data=df, alpha=0.8)
    plt.title('Age vs Serum Cholesterol (chol)')
    plt.tight_layout()
    plt.savefig("outputs/age_v_chol.png")
    plt.close()

In [11]:
# 11. Preprocessing: define feature sets
# Define numerical and categorical columns based on typical UCI heart dataset
num_features = ['age','trestbps','chol','thalach','oldpeak']
cat_features = [c for c in df.columns if c not in num_features + ['target']]

print("Numeric features:", num_features)
print("Categorical features:", cat_features)

Numeric features: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
Categorical features: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']


In [13]:
# 12. Build preprocessing pipelines
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features)
])

In [14]:
# 13. Train-test split
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (820, 13) Test shape: (205, 13)


In [15]:
# 14. Model pipelines
lr_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear'))
])

dt_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('clf', DecisionTreeClassifier(random_state=42))
])

In [16]:
# 15. Train models
print("\nTraining Logistic Regression...")
lr_pipeline.fit(X_train, y_train)
print("Training Decision Tree...")
dt_pipeline.fit(X_train, y_train)


Training Logistic Regression...
Training Decision Tree...


0,1,2
,steps,"[('preproc', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [19]:
# 16. Predict and evaluate helper
def evaluate_model(pipeline, X_test, y_test, model_name="Model"):
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n--- {model_name} Evaluation ---")
    print("Accuracy:", round(acc,4))
    print("Precision:", round(prec,4))
    print("Recall:", round(rec,4))
    print("F1-score:", round(f1,4))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    # Save confusion matrix plot
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix: {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(f"outputs/cm_{model_name.replace(' ','_')}.png")
    plt.close()
    return {'accuracy':acc,'precision':prec,'recall':rec,'f1':f1, 'confusion':cm}

In [20]:
# 17. Evaluate both models
lr_metrics = evaluate_model(lr_pipeline, X_test, y_test, "Logistic Regression")
dt_metrics = evaluate_model(dt_pipeline, X_test, y_test, "Decision Tree")


--- Logistic Regression Evaluation ---
Accuracy: 0.8732
Precision: 0.8559
Recall: 0.9048
F1-score: 0.8796

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.84      0.87       100
           1       0.86      0.90      0.88       105

    accuracy                           0.87       205
   macro avg       0.87      0.87      0.87       205
weighted avg       0.87      0.87      0.87       205


--- Decision Tree Evaluation ---
Accuracy: 0.9854
Precision: 1.0
Recall: 0.9714
F1-score: 0.9855

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       100
           1       1.00      0.97      0.99       105

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [None]:
# 18. Cross-validation for logistic regression
cv_scores = cross_val_score(lr_pipeline, X, y, cv=5, scoring='accuracy')
print("\nLogistic Regression CV accuracy (5-fold):", cv_scores, "Mean:", cv_scores.mean())


Logistic Regression CV accuracy (5-fold): [0.87804878 0.86341463 0.89268293 0.84390244 0.81463415] Mean: 0.8585365853658538


In [22]:
# 19. Optional: Hyperparameter tuning for Decision Tree (GridSearch)
param_grid = {
    'clf__max_depth': [2,3,4,5,6,7,None],
    'clf__min_samples_split': [2,4,6,8]
}
grid = GridSearchCV(dt_pipeline, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print("\nBest Decision Tree params:", grid.best_params_)
best_dt = grid.best_estimator_
best_dt_metrics = evaluate_model(best_dt, X_test, y_test, "Decision Tree (Tuned)")


Best Decision Tree params: {'clf__max_depth': None, 'clf__min_samples_split': 2}

--- Decision Tree (Tuned) Evaluation ---
Accuracy: 0.9854
Precision: 1.0
Recall: 0.9714
F1-score: 0.9855

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       100
           1       1.00      0.97      0.99       105

    accuracy                           0.99       205
   macro avg       0.99      0.99      0.99       205
weighted avg       0.99      0.99      0.99       205



In [23]:
# 20. Visualize Decision Tree (simple)
onehot_cols = []
if hasattr(preprocessor.named_transformers_['cat'].named_steps['onehot'], 'get_feature_names_out'):
    cat_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(cat_features)
else:
    cat_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names(cat_features)
feature_names = num_features + list(cat_names)
try:
    plt.figure(figsize=(16,10))
    plot_tree(best_dt.named_steps['clf'], feature_names=feature_names, filled=True, max_depth=3, fontsize=8)
    plt.title("Decision Tree (truncated depth=3)")
    plt.tight_layout()
    plt.savefig("outputs/decision_tree.png")
    plt.close()
except Exception as e:
    print("Decision tree plotting failed:", e)


In [24]:
# 21. Feature importance (from decision tree)
try:
    importances = best_dt.named_steps['clf'].feature_importances_
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)[:20]
    plt.figure(figsize=(8,4))
    sns.barplot(x=fi.values, y=fi.index)
    plt.title("Feature Importances (Decision Tree)")
    plt.tight_layout()
    plt.savefig("outputs/feature_importances.png")
    plt.close()
except Exception as e:
    print("Feature importance error:", e)

In [25]:
# 22. Save trained models if you like (joblib)
# !pip install joblib
import joblib
joblib.dump(lr_pipeline, "outputs/logistic_pipeline.joblib")
joblib.dump(best_dt, "outputs/decision_tree_tuned_pipeline.joblib")

['outputs/decision_tree_tuned_pipeline.joblib']

In [26]:
# 23. Save evaluation summary
eval_summary = pd.DataFrame([
    {'model':'Logistic Regression', **{k:round(v,4) for k,v in lr_metrics.items() if k in ['accuracy','precision','recall','f1']}},
    {'model':'Decision Tree', **{k:round(v,4) for k,v in dt_metrics.items() if k in ['accuracy','precision','recall','f1']}},
    {'model':'Decision Tree (Tuned)', **{k:round(v,4) for k,v in best_dt_metrics.items() if k in ['accuracy','precision','recall','f1']}}
])
eval_summary.to_csv("outputs/evaluation_summary.csv", index=False)
display(eval_summary)

print("\nAll outputs saved in the 'outputs/' folder. Insert the PNGs into your Word file for the report.")

Unnamed: 0,model,accuracy,precision,recall,f1
0,Logistic Regression,0.8732,0.8559,0.9048,0.8796
1,Decision Tree,0.9854,1.0,0.9714,0.9855
2,Decision Tree (Tuned),0.9854,1.0,0.9714,0.9855



All outputs saved in the 'outputs/' folder. Insert the PNGs into your Word file for the report.


In [29]:
from IPython.display import display
display(pd.read_csv("outputs/descriptive_stats.csv").head())

Unnamed: 0.1,Unnamed: 0,count,mean,std,min,25%,50%,75%,max,skew,kurtosis,median
0,age,1025.0,54.434146,9.07229,29.0,48.0,56.0,61.0,77.0,-0.248866,-0.525618,56.0
1,sex,1025.0,0.69561,0.460373,0.0,0.0,1.0,1.0,1.0,-0.851449,-1.277531,1.0
2,cp,1025.0,0.942439,1.029641,0.0,0.0,1.0,2.0,3.0,0.529455,-1.1495,1.0
3,trestbps,1025.0,131.611707,17.516718,94.0,120.0,130.0,140.0,200.0,0.739768,0.991221,130.0
4,chol,1025.0,246.0,51.59251,126.0,211.0,240.0,275.0,564.0,1.074073,3.996803,240.0
