## Download heart disease dataset heart.csv in Exercise folder and do following, (credits of dataset: https://www.kaggle.com/fedesoriano/heart-failure-prediction)

1. Load heart disease dataset in pandas dataframe
2. Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
3. Convert text columns to numbers using label encoding and one hot encoding
4. Apply scaling
5. Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy
   Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy.
6. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life

In [3]:
import pandas as pd

In [5]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [13]:
df.shape


(918, 12)

### Removing outliers by z-score


In [15]:
import pandas as pd
import numpy as np
from scipy import stats

In [29]:
df_clean = df[(np.abs(stats.zscore(df['RestingBP'])) < 3) & 
              (np.abs(stats.zscore(df['Cholesterol'])) < 3) & 
              (np.abs(stats.zscore(df['MaxHR'])) < 3) & 
                (np.abs(stats.zscore(df['Oldpeak'])) < 3)]

In [36]:
df_clean

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


### converting all str columns to numeric

In [46]:
df_clean['ExerciseAngina'] = df_clean['ExerciseAngina'].apply(lambda x: '0' if x =='N' else '1')
df_clean['Sex'] = df_clean['Sex'].apply(lambda x: '0' if x =='F' else '1')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['ExerciseAngina'] = df_clean['ExerciseAngina'].apply(lambda x: '0' if x =='N' else '1')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['Sex'] = df_clean['Sex'].apply(lambda x: '0' if x =='F' else '1')


In [58]:
df_C = pd.get_dummies(df_clean, columns=['ChestPainType', 'RestingECG', 'ST_Slope'], drop_first=True, dtype=int)

In [60]:
df_C

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,1,0.0,0,1,0,0,1,0,0,1
1,49,0,160,180,0,156,1,1.0,1,0,1,0,1,0,1,0
2,37,1,130,283,0,98,1,0.0,0,1,0,0,0,1,0,1
3,48,0,138,214,0,108,1,1.5,1,0,0,0,1,0,1,0
4,54,1,150,195,0,122,1,0.0,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,1,1.2,1,0,0,1,1,0,1,0
914,68,1,144,193,1,141,1,3.4,1,0,0,0,1,0,1,0
915,57,1,130,131,0,115,1,1.2,1,0,0,0,1,0,1,0
916,57,0,130,236,0,174,1,0.0,1,1,0,0,0,0,1,0


### Feature scaling technique and splitting into test train


In [66]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [79]:
X = df_C.drop(['HeartDisease'], axis=1)
y = df_C.HeartDisease

In [81]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)   # learns mean & std from train
X_test_scaled = scaler.transform(X_test)         # uses same mean & std

## model selection

In [87]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [89]:
models_params = {
    "SVM": {
        "model": SVC(),
        "params": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto"]
        }
    },
    "LogisticRegression": {
        "model": LogisticRegression(max_iter=5000),
        "params": {
            "C": [0.1, 1, 10],
            "solver": ["liblinear", "lbfgs"]
        }
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 10, 20],
            "criterion": ["gini", "entropy"]
        }
    }
}

# Run GridSearchCV for each model
best_models = {}
for name, mp in models_params.items():
    print(f"\n🔍 Running GridSearch for {name}...")
    grid = GridSearchCV(mp["model"], mp["params"], cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train_scaled, y_train)
    best_models[name] = grid.best_estimator_
    
    print(f"✅ Best params for {name}: {grid.best_params_}")
    print(f"📊 Best CV score: {grid.best_score_:.4f}")
    print(f"🧪 Test score: {grid.score(X_test_scaled, y_test):.4f}")
    print(classification_report(y_test, grid.predict(X_test_scaled)))


🔍 Running GridSearch for SVM...
✅ Best params for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
📊 Best CV score: 0.8484
🧪 Test score: 0.8944
              precision    recall  f1-score   support

           0       0.97      0.80      0.88        86
           1       0.84      0.98      0.91        94

    accuracy                           0.89       180
   macro avg       0.91      0.89      0.89       180
weighted avg       0.91      0.89      0.89       180


🔍 Running GridSearch for LogisticRegression...
✅ Best params for LogisticRegression: {'C': 0.1, 'solver': 'liblinear'}
📊 Best CV score: 0.8498
🧪 Test score: 0.8944
              precision    recall  f1-score   support

           0       0.96      0.81      0.88        86
           1       0.85      0.97      0.91        94

    accuracy                           0.89       180
   macro avg       0.90      0.89      0.89       180
weighted avg       0.90      0.89      0.89       180


🔍 Running GridSearch for Rando

**RandomForest score looks better**

In [265]:
model = RandomForestClassifier( criterion='entropy',
    max_depth=20,
    n_estimators=50,
    random_state=42)
model.fit(X_train_scaled, y_train)
model.score(X_test_scaled, y_test)

0.8888888888888888

### Use of PCA to avoid unnecessary components

In [109]:
from sklearn.decomposition import PCA

In [259]:
pc = PCA(0.95)
X_pc = pc.fit_transform(X)
X_pc.shape

(899, 2)

In [261]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pc, y, test_size=0.2, random_state=42)
sc = StandardScaler()
X_train_pca_scaled = sc.fit_transform(X_train_pca)   # learns mean & std from train
X_test_pca_scaled = sc.transform(X_test_pca)   



In [263]:
model = RandomForestClassifier(criterion='entropy',
    max_depth=20,
    n_estimators=50,
    random_state=42)
model.fit(X_train_pca_scaled, y_train)
model.score(X_test_pca_scaled, y_test)


0.7055555555555556