# About the dataset

The dataset contains 14 features and 1 target variable. The target variable is a binary variable that indicates whether a patient has heart disease or not. The features are as follows:

1. age: age in years
2. sex: (1 = male; 0 = female)
3. cp: chest pain type
4. trestbps: resting blood pressure (in mm Hg on admission to the hospital)
5. chol: serum cholestoral in mg/dl
6. fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
7. restecg: resting electrocardiographic results
8. MaxHR: maximum heart rate achieved
9. exang: exercise induced angina (1 = yes; 0 = no)
10. oldpeak: ST depression induced by exercise relative to rest
11. slope: the slope of the peak exercise ST segment
12. ca: number of major vessels (0-3) colored by flourosopy
13. thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
14. AHD: presence of heart disease in the patient (1 = yes; 0 = no)

In [None]:
# Heart Disease Classification with Outlier Removal, Encoding, Scaling, and PCA

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 1. Load and prepare the dataset

df = pd.read_csv('heart.csv')
df = df.drop('Unnamed: 0', axis=1)  # Drop the index column

In [4]:
# 2. Remove outliers using Z score method
numeric_cols = ['Age', 'RestBP', 'Chol', 'MaxHR', 'Oldpeak']
z_scores = np.abs(stats.zscore(df[numeric_cols]))
mask = (z_scores < 3).all(axis=1)
df_clean = df[mask].reset_index(drop=True)
print('Original shape:', df.shape)
print('Shape after removing outliers:', df_clean.shape)

Original shape: (303, 14)
Shape after removing outliers: (294, 14)


In [5]:
# 3. Encode categorical variables
# For binary columns
le = LabelEncoder()
df_clean['Sex'] = le.fit_transform(df_clean['Sex'])
df_clean['Fbs'] = le.fit_transform(df_clean['Fbs'])
df_clean['ExAng'] = le.fit_transform(df_clean['ExAng'])

df_clean['AHD'] = le.fit_transform(df_clean['AHD'])

In [6]:
# One-hot encoding for multi-category columns
categorical_cols = ['ChestPain', 'RestECG', 'Slope', 'Thal']
df_encoded = pd.get_dummies(df_clean, columns=categorical_cols)


In [7]:
# 3.1: Remove any rows with missing values
initial_rows = df_encoded.shape[0]
df_encoded = df_encoded.dropna()
print('Number of rows removed due to NaN after encoding:', initial_rows - df_encoded.shape[0])

Number of rows removed due to NaN after encoding: 4


In [8]:
# 4. Setup features and target
X = df_encoded.drop('AHD', axis=1)
y = df_encoded['AHD']

In [9]:
# Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Split data without PCA
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# 5. Train and evaluate models
models = {
    'SVM': SVC(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42)
}

results = {}
print('Model Accuracies without PCA:')
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(name + ': ' + str(round(accuracy, 4)))

# Which is best model
best_model_name = max(results, key=results.get)
print('Best model: ' + best_model_name + ' with accuracy: ' + str(round(results[best_model_name], 4)))

Model Accuracies without PCA:
SVM: 0.7931
Logistic Regression: 0.7931
Random Forest: 0.7586
Best model: SVM with accuracy: 0.7931


In [None]:
# Apply PCA on the scaled data
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
print('Number of components after PCA: ' + str(X_pca.shape[1]) + ' (original features: ' + str(X_scaled.shape[1]) + ')')

# Split PCA-transformed data
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train best model on PCA data
best_model = models[best_model_name]
best_model.fit(X_train_pca, y_train_pca)
y_pred_pca = best_model.predict(X_test_pca)
pca_accuracy = accuracy_score(y_test_pca, y_pred_pca)

Number of components after PCA: 16 (original features: 22)


In [14]:
print('Best model (' + best_model_name + ') accuracy with PCA: ' + str(round(pca_accuracy, 4)))
print('Accuracy difference after PCA: ' + str(round(pca_accuracy - results[best_model_name], 4)))

Best model (SVM) accuracy with PCA: 0.8103
Accuracy difference after PCA: 0.0172
