# Importing the Dependencies

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Heart Disease Dataset
- Dataset: Data Collection and Data Processing
- https://www.kaggle.com/datasets/johnsmith88/heart-disease-dataset?resource=download

In [7]:
# Load the heart-disease CSV (path is relative to the notebook's working
# directory) into a pandas DataFrame, then preview the first five rows.
heart_data = pd.read_csv("Dataset/heart.csv")
heart_data.head()

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  
(1025, 14)


In [9]:
# Dimensions of the dataset as (number of rows, number of columns)
heart_data.shape

(1025, 14)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
heart_data.describe()

target
1    526
0    499
Name: count, dtype: int64

In [10]:
# Class balance: count how many rows carry each value of the 'target' label.
# The column is selected by name (not by position) so the cell keeps working
# if the column order of the CSV ever changes.
heart_data['target'].value_counts()

target
1    526
0    499
Name: count, dtype: int64

In [19]:
# Group the rows by the 'target' label column and compute, for each class,
# the mean of every other column.
heart_data.groupby('target').mean()

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,56.569138,0.827655,0.482966,134.106212,251.292585,0.164329,0.456914,139.130261,0.549098,1.6002,1.166333,1.158317,2.539078
1,52.408745,0.570342,1.378327,129.245247,240.979087,0.134981,0.598859,158.585551,0.134981,0.569962,1.593156,0.370722,2.119772


In [30]:
# Separate the features from the labels:
#   X -> every column except 'target'
#   Y -> the 'target' column
# `columns=` already says which axis to drop from, so no `axis` argument is needed.
X = heart_data.drop(columns='target')
Y = heart_data['target']

In [35]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=1,
)

In [36]:
# Sanity-check the split: shape of the full feature matrix, then of the
# training and test partitions.
for features in (X, x_train, x_test):
    print(features.shape)

(1025, 13)
(922, 13)
(103, 13)


In [37]:
# Model Training --> Linear Regression (ordinary least squares), NOT Logistic Regression.
# Its predictions are continuous values, which is why the evaluation cells
# threshold them at 0.5 to obtain 0/1 class labels.
# NOTE(review): if a logistic-regression classifier was intended, this should be
# sklearn.linear_model.LogisticRegression instead -- confirm with the author.
model = LinearRegression()

In [39]:
# Fit the Linear Regression model on the training features and labels
model.fit(x_train, y_train)

# Model Evaluation on training data

In [59]:
# Predict on the training features. LinearRegression returns continuous
# scores rather than 0/1 class labels; preview the first seven raw values.
pred_training_data = model.predict(x_train)
pred_training_data[:7]    # continuous predictions --> no binary labels (0, 1)


array([0.11544524, 0.55117417, 0.53511774, 0.06893955, 0.90253445,
       0.72269639, 0.53476819])

In [48]:
# Turn the continuous scores into class labels (score above 0.5 -> 1, else 0),
# then report the fraction of training rows labelled correctly.
binary_pred_training_data = (pred_training_data > 0.5).astype(int)
acc_training_data = accuracy_score(
    y_true=y_train,
    y_pred=binary_pred_training_data,
)
print(f'Accuracy on training data: {acc_training_data * 100:.2f}%')

Accuracy on training data: 83.73%


# Model Evaluation on testing data

In [67]:
# Predict on the held-out test features. These are continuous scores from the
# regression model, not 0/1 class labels; preview the first seven raw values.
pred_testing_data = model.predict(x_test)
pred_testing_data[:7]

array([0.16194161, 0.99379571, 0.80410445, 0.69933411, 0.11540759,
       0.77372842, 0.70171291])

In [71]:
# Turn the continuous scores into class labels (score above 0.5 -> 1, else 0),
# then report the fraction of held-out test rows labelled correctly.
binary_pred_testing_data = (pred_testing_data > 0.5).astype(int)
acc_testing_data = accuracy_score(
    y_true=y_test,
    y_pred=binary_pred_testing_data,
)
print(f'Accuracy on testing data: {acc_testing_data * 100:.2f}%')

Accuracy on testing data: 80.58%
