In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
from scipy import stats

In [2]:
df=pd.read_csv('heart_cleveland_upload.csv')

In [3]:
df.shape

(297, 14)

In [4]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,?,0,2,0,0
2,66,0,0,?,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,?,1,1.8,1,0,0,0


In [5]:
df.replace('?', np.nan, inplace=True)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160.0,234,1,2,131.0,0,0.1,1,1,0,0
1,69,0,0,140.0,239,0,0,151.0,0,,0,2,0,0
2,66,0,0,,226,0,0,114.0,0,2.6,2,0,0,0
3,65,1,0,138.0,282,1,2,174.0,0,1.4,1,1,0,1
4,64,1,0,110.0,211,0,2,,1,1.8,1,0,0,0


In [6]:
df = df.apply(pd.to_numeric, errors='coerce')

In [7]:
for col in df.columns:
    if df[col].min() < 0:
        df = df[df[col] >= 0]

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160.0,234,1,2,131.0,0,0.1,1,1,0,0
1,69,0,0,140.0,239,0,0,151.0,0,,0,2,0,0
2,66,0,0,,226,0,0,114.0,0,2.6,2,0,0,0
3,65,1,0,138.0,282,1,2,174.0,0,1.4,1,1,0,1
4,64,1,0,110.0,211,0,2,,1,1.8,1,0,0,0


In [8]:
df.dropna(inplace=True)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160.0,234,1,2,131.0,0,0.1,1,1,0,0
3,65,1,0,138.0,282,1,2,174.0,0,1.4,1,1,0,1
5,64,1,0,170.0,227,0,2,155.0,0,0.6,1,0,2,0
6,63,1,0,145.0,233,1,2,150.0,0,2.3,2,0,1,0
7,61,1,0,134.0,234,0,0,145.0,0,2.6,1,2,0,1


In [9]:
Q1 = df.quantile(0.25)   # 25th percentile
Q3 = df.quantile(0.75)   # 75th percentile
IQR = Q3 - Q1            # Interquartile Range (middle 50% of data)

# Outlier bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove rows where any column has a value outside the IQR range
df = df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
23,74,0,1,120.0,269,0,2,121.0,1,0.2,0,1,0,0
24,71,0,1,160.0,302,0,0,162.0,0,0.4,0,2,0,0
25,70,1,1,156.0,245,0,2,143.0,0,0.0,0,0,0,0
27,63,0,1,140.0,195,0,0,179.0,0,0.0,0,2,0,0
28,62,1,1,120.0,281,0,2,103.0,0,1.4,1,1,2,1


In [10]:
X = df.drop(columns='condition')  # Assuming 'target' column exists
y = df['condition']

In [11]:
imputer = SimpleImputer(strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,74.0,0.0,1.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,0.0,1.0,0.0
1,71.0,0.0,1.0,160.0,302.0,0.0,0.0,162.0,0.0,0.4,0.0,2.0,0.0
2,70.0,1.0,1.0,156.0,245.0,0.0,2.0,143.0,0.0,0.0,0.0,0.0,0.0
3,63.0,0.0,1.0,140.0,195.0,0.0,0.0,179.0,0.0,0.0,0.0,2.0,0.0
4,62.0,1.0,1.0,120.0,281.0,0.0,2.0,103.0,0.0,1.4,1.0,1.0,2.0


In [12]:
pt = PowerTransformer()
X = pd.DataFrame(pt.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,2.322534,-1.394433,-1.526609,-0.522715,0.5833,0.0,1.079343,-1.30648,1.455214,-0.619074,-0.976391,1.22071,-0.830014
1,1.977902,-1.394433,-1.526609,2.014902,1.26465,0.0,-0.934204,0.444922,-0.687184,-0.241766,-0.976391,1.548674,-0.830014
2,1.86346,0.717137,-1.526609,1.781343,0.064477,0.0,1.079343,-0.45395,-0.687184,-1.116175,-0.976391,-0.752132,-0.830014
3,1.068757,-1.394433,-1.526609,0.805702,-1.093823,0.0,-0.934204,1.38348,-0.687184,-1.116175,-0.976391,1.548674,-0.830014
4,0.956176,0.717137,-1.526609,-0.522715,0.835088,0.0,1.079343,-1.862246,-0.687184,0.819762,0.91716,1.22071,1.249272


In [13]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,2.322534,-1.394433,-1.526609,-0.522715,0.5833,0.0,1.079343,-1.30648,1.455214,-0.619074,-0.976391,1.22071,-0.830014
1,1.977902,-1.394433,-1.526609,2.014902,1.26465,0.0,-0.934204,0.444922,-0.687184,-0.241766,-0.976391,1.548674,-0.830014
2,1.86346,0.717137,-1.526609,1.781343,0.064477,0.0,1.079343,-0.45395,-0.687184,-1.116175,-0.976391,-0.752132,-0.830014
3,1.068757,-1.394433,-1.526609,0.805702,-1.093823,0.0,-0.934204,1.38348,-0.687184,-1.116175,-0.976391,1.548674,-0.830014
4,0.956176,0.717137,-1.526609,-0.522715,0.835088,0.0,1.079343,-1.862246,-0.687184,0.819762,0.91716,1.22071,1.249272


In [14]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# 1. Logistic Regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)

acc_lr

0.8372093023255814

In [16]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84        22
           1       0.85      0.81      0.83        21

    accuracy                           0.84        43
   macro avg       0.84      0.84      0.84        43
weighted avg       0.84      0.84      0.84        43



In [17]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)

acc_knn

0.8604651162790697

In [19]:
print(classification_report(y_test, y_pred_knn))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        22
           1       0.86      0.86      0.86        21

    accuracy                           0.86        43
   macro avg       0.86      0.86      0.86        43
weighted avg       0.86      0.86      0.86        43



In [20]:
# 3. Naive-Bayes model
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
acc_nb = accuracy_score(y_test, y_pred_nb)

acc_nb

0.8372093023255814

In [21]:
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84        22
           1       0.85      0.81      0.83        21

    accuracy                           0.84        43
   macro avg       0.84      0.84      0.84        43
weighted avg       0.84      0.84      0.84        43

