In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, precision_score, recall_score,
f1_score, confusion_matrix, classification_report
)

In [67]:
df = pd.read_csv('Cardiovascular_Disease_Dataset.csv')

In [68]:
df.head()

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


In [69]:
df.isnull().sum()

Unnamed: 0,0
patientid,0
age,0
gender,0
chestpain,0
restingBP,0
serumcholestrol,0
fastingbloodsugar,0
restingrelectro,0
maxheartrate,0
exerciseangia,0


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientid          1000 non-null   int64  
 1   age                1000 non-null   int64  
 2   gender             1000 non-null   int64  
 3   chestpain          1000 non-null   int64  
 4   restingBP          1000 non-null   int64  
 5   serumcholestrol    1000 non-null   int64  
 6   fastingbloodsugar  1000 non-null   int64  
 7   restingrelectro    1000 non-null   int64  
 8   maxheartrate       1000 non-null   int64  
 9   exerciseangia      1000 non-null   int64  
 10  oldpeak            1000 non-null   float64
 11  slope              1000 non-null   int64  
 12  noofmajorvessels   1000 non-null   int64  
 13  target             1000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 109.5 KB


In [71]:
df.shape

(1000, 14)

In [72]:
df.dtypes

Unnamed: 0,0
patientid,int64
age,int64
gender,int64
chestpain,int64
restingBP,int64
serumcholestrol,int64
fastingbloodsugar,int64
restingrelectro,int64
maxheartrate,int64
exerciseangia,int64


In [73]:
df.isnull().sum()

Unnamed: 0,0
patientid,0
age,0
gender,0
chestpain,0
restingBP,0
serumcholestrol,0
fastingbloodsugar,0
restingrelectro,0
maxheartrate,0
exerciseangia,0


In [74]:
df.duplicated().sum()

np.int64(0)

In [75]:
df = df.drop_duplicates()

In [76]:
df.describe()

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5048704.0,49.242,0.765,0.98,151.747,311.447,0.296,0.748,145.477,0.498,2.7077,1.54,1.222,0.58
std,2895905.0,17.86473,0.424211,0.953157,29.965228,132.443801,0.456719,0.770123,34.190268,0.500246,1.720753,1.003697,0.977585,0.493805
min,103368.0,20.0,0.0,0.0,94.0,0.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,2536440.0,34.0,1.0,0.0,129.0,235.75,0.0,0.0,119.75,0.0,1.3,1.0,0.0,0.0
50%,4952508.0,49.0,1.0,1.0,147.0,318.0,0.0,1.0,146.0,0.0,2.4,2.0,1.0,1.0
75%,7681877.0,64.25,1.0,2.0,181.0,404.25,1.0,1.0,175.0,1.0,4.1,2.0,2.0,1.0
max,9990855.0,80.0,1.0,3.0,200.0,602.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0


In [77]:
x = df.drop('target', axis=1)
y = df['target']

In [78]:
y.head()

Unnamed: 0,target
0,1
1,0
2,0
3,1
4,1


In [79]:
scaler = StandardScaler()
scaled_x = scaler.fit_transform(x)

In [80]:
scaled_x

array([[-1.70855459,  0.21046388,  0.55424682, ...,  1.50724524,
         1.45535031,  1.81967847],
       [-1.70306755, -0.51759105,  0.55424682, ...,  0.57695462,
        -0.53828025, -0.22720395],
       [-1.7030254 , -0.01355302,  0.55424682, ...,  1.33281575,
        -0.53828025, -1.25064516],
       ...,
       [ 1.69881811,  1.10653148,  0.55424682, ..., -0.76033815,
         1.45535031, -0.22720395],
       [ 1.70664272, -0.23756992,  0.55424682, ...,  1.85610422,
         1.45535031,  0.79623726],
       [ 1.70745393, -1.35765443,  0.55424682, ...,  1.15838626,
        -1.53509553, -1.25064516]])

In [81]:
x_test,x_train,y_test,y_train = train_test_split(scaled_x, y, test_size=0.3, random_state=42)

In [82]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [83]:
logreg = LogisticRegression(max_iter=1000)

In [84]:
param_grid = {
    "C": [0.001, 0.01, 0.1, 1, 10, 100,],
    "penalty": ["l2"],
    "solver": ["liblinear"]
}

In [85]:
grid_lr = GridSearchCV(logreg, param_grid, cv=5, scoring="f1")
grid_lr.fit(x_train_scaled, y_train)

In [86]:
best_lr = grid_lr.best_estimator_
y_pred_lr = best_lr.predict(x_test_scaled)

In [87]:
# Evaluation
print("Accuracy : ", accuracy_score(y_test, y_pred_lr))
print("precision : ", precision_score(y_test, y_pred_lr))
print("recall : ", recall_score(y_test, y_pred_lr))
print("f1 score : ", f1_score(y_test, y_pred_lr))

confusion_matrix(y_test, y_pred_lr)

Accuracy :  0.95
precision :  0.9636803874092009
recall :  0.9521531100478469
f1 score :  0.9578820697954272


array([[267,  15],
       [ 20, 398]])

In [88]:
dt = DecisionTreeClassifier(random_state=42)
param_dt = {
    "max_depth": [3, 5, 6, 7, 10],
    "min_samples_split": [2, 3, 5, 6, 9,],
    "criterion": ["gini", "entropy"]
}

In [94]:
grid_dt = GridSearchCV(dt, param_dt, cv=5, scoring="f1")
grid_dt.fit(x_train, y_train)

In [95]:
best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(x_test)

In [96]:
print("classiication", classification_report(y_test, y_pred_dt))

classiication               precision    recall  f1-score   support

           0       0.84      0.94      0.89       282
           1       0.96      0.88      0.92       418

    accuracy                           0.90       700
   macro avg       0.90      0.91      0.90       700
weighted avg       0.91      0.90      0.90       700

