In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('Heart_disease 2.csv')

In [3]:
df.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CVD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4240 entries, 0 to 4239
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   male             4240 non-null   int64  
 1   age              4240 non-null   int64  
 2   education        4135 non-null   float64
 3   currentSmoker    4240 non-null   int64  
 4   cigsPerDay       4211 non-null   float64
 5   BPMeds           4187 non-null   float64
 6   prevalentStroke  4240 non-null   int64  
 7   prevalentHyp     4240 non-null   int64  
 8   diabetes         4240 non-null   int64  
 9   totChol          4190 non-null   float64
 10  sysBP            4240 non-null   float64
 11  diaBP            4240 non-null   float64
 12  BMI              4221 non-null   float64
 13  heartRate        4239 non-null   float64
 14  glucose          3852 non-null   float64
 15  CVD              4240 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 530.1 KB


In [5]:
df.columns

Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'CVD'],
      dtype='object')

In [6]:
df.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,CVD
count,4240.0,4240.0,4135.0,4240.0,4211.0,4187.0,4240.0,4240.0,4240.0,4190.0,4240.0,4240.0,4221.0,4239.0,3852.0,4240.0
mean,0.429245,49.580189,1.979444,0.494104,9.005937,0.029615,0.005896,0.310613,0.025708,236.699523,132.354599,82.897759,25.800801,75.878981,81.963655,0.151887
std,0.495027,8.572942,1.019791,0.500024,11.922462,0.169544,0.076569,0.462799,0.15828,44.591284,22.0333,11.910394,4.07984,12.025348,23.954335,0.358953
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,90.0,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [7]:
df.ndim

2

## Data preprocessing

In [8]:
df.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
CVD                  0
dtype: int64

In [9]:
df.dropna(axis=0,inplace=True)

In [10]:
df.isnull().sum()

male               0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
CVD                0
dtype: int64

In [11]:
X = df.drop(columns=['CVD'])
y = df['CVD']

In [12]:
display(X.head())
display(y.head())

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0


0    0
1    0
2    0
3    1
4    0
Name: CVD, dtype: int64

In [13]:
ss = StandardScaler()
ss.fit(X)
df_scaled = ss.transform(X)

In [14]:
df_scaled

array([[ 1.11975484, -1.23257968,  1.97520875, ...,  0.29205038,
         0.35637003, -0.20304365],
       [-0.89305263, -0.41490473,  0.01924946, ...,  0.72500993,
         1.60846859, -0.24488308],
       [ 1.11975484, -0.18128332, -0.95873018, ..., -0.1089292 ,
        -0.06099616, -0.49591969],
       ...,
       [-0.89305263,  0.28595951,  0.01924946, ..., -1.06094821,
         0.35637003,  1.05213942],
       [ 1.11975484, -1.11576897,  0.99722911, ..., -0.04496926,
        -0.72878206, -0.41224082],
       [-0.89305263, -1.23257968,  0.99722911, ..., -1.19870806,
         0.77373621, -0.07752534]])

## Building Decision tree and Logistic models

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
l_model = LogisticRegression()

In [27]:
l_model.fit(X_train, y_train)

LogisticRegression()

In [19]:
dt_model = DecisionTreeClassifier()

In [21]:
dt_model.fit(X_train, y_train)

DecisionTreeClassifier()

In [31]:
l_pred = l_model.predict(X_test)
dt_pred = dt_model.predict(X_test)

In [32]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, l_pred))
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))

Logistic Regression Accuracy: 0.8319672131147541
Decision Tree Accuracy: 0.7349726775956285


## Comapring results after tuning using GridSearchCV

In [33]:
logistic_param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Define parameter grid for Decision Tree
tree_param_grid = {'max_depth': [None, 5, 10, 15, 20],
                   'min_samples_split': [2, 5, 10],
                   'min_samples_leaf': [1, 2, 4]}


In [36]:
# For Logistic 
logistic_grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=logistic_param_grid, cv=5)
logistic_grid_search.fit(X_train, y_train)
best_logistic_model = logistic_grid_search.best_estimator_
best_logistic_pred = best_logistic_model.predict(X_test)

In [38]:
# For decisio tree
tree_grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=tree_param_grid, cv=5)
tree_grid_search.fit(X_train, y_train)
best_tree_model = tree_grid_search.best_estimator_
best_tree_pred = best_tree_model.predict(X_test)

In [42]:
print("Logistic Regression Accuracy after tuning:", accuracy_score(y_test, best_logistic_pred))
print("Decision Tree Accuracy after tuning:", accuracy_score(y_test, best_tree_pred))

Logistic Regression Accuracy after tuning: 0.8333333333333334
Decision Tree Accuracy after tuning: 0.8306010928961749


## Interpretation

In [43]:
print("Classification Report for Logistic Regression after tuning:")
print(classification_report(y_test, best_logistic_pred))

Classification Report for Logistic Regression after tuning:
              precision    recall  f1-score   support

           0       0.84      1.00      0.91       610
           1       0.50      0.02      0.03       122

    accuracy                           0.83       732
   macro avg       0.67      0.51      0.47       732
weighted avg       0.78      0.83      0.76       732



In [44]:
print("Classification Report for Decision Tree after tuning:")
print(classification_report(y_test, best_tree_pred))

Classification Report for Decision Tree after tuning:
              precision    recall  f1-score   support

           0       0.84      0.99      0.91       610
           1       0.44      0.06      0.10       122

    accuracy                           0.83       732
   macro avg       0.64      0.52      0.50       732
weighted avg       0.77      0.83      0.77       732

