## Import and Load Data

In [1]:
import numpy as np
import pandas as pd

heart_df = pd.read_csv('dataset/heart.csv')
print(heart_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
None


## Data processing

In [2]:
heart_df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [3]:
heart_df.duplicated().sum()

np.int64(0)

In [4]:
heart_df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [5]:
cat_col = heart_df.select_dtypes(include='object').columns

### converting categorical variables into numeric
- Sex: M=0 , F=1
- ChecstPainType: ATA= 0, NAP=1, ASY=2, TA=3
- RestingECG: Normal=0, ST=1, LVH=2
- ExerciseAngina: N=0, Y=1
- ST_Slope: Up=0, Flat=1, Down=2











In [6]:
for col in cat_col:
    print(col)
    print((heart_df[col].unique()), list(range(heart_df[col].nunique())))
    heart_df[col] = heart_df[col].replace((heart_df[col].unique()), range(heart_df[col].nunique()))
    print('*'*90)
    print()

Sex
['M' 'F'] [0, 1]
******************************************************************************************

ChestPainType
['ATA' 'NAP' 'ASY' 'TA'] [0, 1, 2, 3]
******************************************************************************************

RestingECG
['Normal' 'ST' 'LVH'] [0, 1, 2]
******************************************************************************************

ExerciseAngina
['N' 'Y'] [0, 1]
******************************************************************************************

ST_Slope
['Up' 'Flat' 'Down'] [0, 1, 2]
******************************************************************************************



  heart_df[col] = heart_df[col].replace((heart_df[col].unique()), range(heart_df[col].nunique()))


In [7]:
heart_df['Cholesterol'].value_counts()

Cholesterol
0      172
254     11
223     10
220     10
230      9
      ... 
392      1
316      1
153      1
466      1
131      1
Name: count, Length: 222, dtype: int64

Imputing the 0 values in cholesterol column with KNN Imputer

In [8]:
heart_df['Cholesterol'] = heart_df['Cholesterol'].replace(0, np.nan)

In [9]:
import sys
!{sys.executable} -m pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [10]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(heart_df)
heart_df = pd.DataFrame(after_impute, columns=heart_df.columns)

In [11]:
heart_df["Cholesterol"].isna().sum()

np.int64(0)

### Doing the same for resting blood pressure

In [12]:
from sklearn.impute import KNNImputer
heart_df['RestingBP'] = heart_df['RestingBP'].replace(0, np.nan)
imputer = KNNImputer(n_neighbors=3)
after_impute = imputer.fit_transform(heart_df)
heart_df = pd.DataFrame(after_impute, columns=heart_df.columns)

In [13]:
heart_df["RestingBP"].isna().sum()

np.int64(0)

### Change columns type to int

In [14]:
withoutOldPeak = heart_df.columns
withoutOldPeak = withoutOldPeak.drop('Oldpeak')
heart_df[withoutOldPeak] = heart_df[withoutOldPeak].astype('int32')

In [15]:
pip install plotly

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Train Test split

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    heart_df.drop('HeartDisease', axis=1),
    heart_df['HeartDisease'],
    test_size=0.2,
    random_state=42,
    stratify=heart_df['HeartDisease']
)

## Model Training

### Logistic regression (Great for binary classification)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Scale the features first
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform hyper parameter tuning to find the best solver for a Logistic Regression model
solver = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
best_solver = ''
test_score = np.zeros(6) # Array to store each solver's score

for i, n in enumerate(solver):
    lr = LogisticRegression(solver=n, max_iter=1000).fit(X_train_scaled, y_train)  # Increased max_iter
    test_score[i] = lr.score(X_test_scaled, y_test)
    if lr.score(X_test_scaled, y_test) == test_score.max():
        best_solver = n

lr = LogisticRegression(solver=best_solver, max_iter=1000)
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)
print(f'Logistic regression score: {accuracy_score(y_test, lr_pred)}')

Logistic regression score: 0.8586956521739131


### Support Vector Machine

In [18]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score

kernels = {'linear': 0, 'poly': 0, 'rbf': 0, 'sigmoid': 0}
best = ''

for i in kernels:
    svm = SVC(kernel=i)
    svm.fit(X_train, y_train)
    y_hat = svm.predict(X_test)
    kernels[i] = f1_score(y_test, y_hat, average="weighted")
    if kernels[i] == max(kernels.values()):
        best = i

svm = SVC(kernel=best)
svm.fit(X_train, y_train)
svm_pred = svm.predict(X_test)
print(f"SVM f1_score kernel({best}): {f1_score(y_test, svm_pred, average='weighted')}")

SVM f1_score kernel(linear): 0.8422922535440344


### Decision tree classifier

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtree = DecisionTreeClassifier(class_weight='balanced')
param_grid = {
    'max_depth': [3,4,5,6,7,8],
    'min_samples_split': [2,3,4],
    'min_samples_leaf': [1,2,3,4],
    'random_state': [0,42]
}

grid_search = GridSearchCV(dtree, param_grid, cv=5)
grid_search.fit(X_train, y_train)
Ctree = DecisionTreeClassifier(**grid_search.best_params_, class_weight='balanced')
Ctree.fit(X_train, y_train)
dtc_pred = Ctree.predict(X_test)
print("DecisionTrees's Accuracy: ", accuracy_score(y_test, dtc_pred))

DecisionTrees's Accuracy:  0.8097826086956522


### Random Forest Classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier()
param_grid = {
    'n_estimators': [50, 100, 150, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3,5,9,19],
    'max_leaf_nodes': [3,6,9]
}
grid_search = GridSearchCV(rfc, param_grid)
grid_search.fit(X_train, y_train)
rfctree = RandomForestClassifier(**grid_search.best_params_)
rfctree.fit(X_train, y_train)
rfc_pred = rfctree.predict(X_test)
print("Random forest classifier's accuracy: ", accuracy_score(y_test, rfc_pred))

Random forest classifier's accuracy:  0.8478260869565217


## Dumping and Loading the model via Pickle object

### Logistic Regression model

In [38]:
import pickle
import os

# Save the model properly
with open('LR_model.pkl', 'wb') as file:
    pickle.dump(lr, file)

# Check file size
file_size = os.path.getsize('LogisticRegression_model.pkl')
print(f"Pickle file size: {file_size} bytes")

Pickle file size: 761 bytes


### SVM model

In [None]:
# Save the model properly
with open('SVM_model.pkl', 'wb') as file:
    pickle.dump(svm, file)

# Check file size
file_size = os.path.getsize('SVM_model.pkl')
print(f"Pickle file size: {file_size} bytes")

Pickle file size: 31537 bytes


### Decision Tree Classifier model

In [None]:
# Save the model properly
with open('DTC_model.pkl', 'wb') as file:
    pickle.dump(Ctree, file)

# Check file size
file_size = os.path.getsize('DTC_model.pkl')
print(f"Pickle file size: {file_size} bytes")

Pickle file size: 2588 bytes


### Random Forest Classifier model

In [37]:
# Save the model properly
with open('RFC_model.pkl', 'wb') as file:
    pickle.dump(rfctree, file)

# Check file size
file_size = os.path.getsize('RFC_model.pkl')
print(f"Pickle file size: {file_size} bytes")

Pickle file size: 119892 bytes
