In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings

# Silence the scikit-learn warning whose text starts with the message below.
# NOTE(review): it is presumably raised when an estimator fitted on a DataFrame
# is later given input without column names - confirm this is still needed.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [4]:
# Read the heart dataset from a CSV file located in the working directory,
# then preview its first five rows.
file = "heart.csv"
data_heart = pd.read_csv(filepath_or_buffer=file)
data_heart.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [160]:
# Dimensions of the loaded table as (rows, columns).
data_heart.shape

(1025, 14)

In [6]:
# Inspect the dtype pandas inferred for each column.
data_heart.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [161]:
# Count the missing values in each column.
data_heart.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [162]:
# duplicated() flags every row that repeats an earlier one across all columns;
# the first occurrence of each row is left unflagged.
duplicate_data = data_heart[data_heart.duplicated()]
# 723 rows are flagged as repeats, so keep one copy of each row and renumber
# the index from 0.
data_heart = data_heart.drop_duplicates().reset_index(drop=True)
data_heart.shape

(302, 14)

In [163]:
# Check the class balance: people with a healthy heart (0) vs. a defective heart (1).
data_heart["target"].value_counts()

target
1    164
0    138
Name: count, dtype: int64

In [164]:
# Separate the label column (Y) from the remaining feature columns (X).
Y = data_heart["target"]
X = data_heart.drop(columns=["target"])

In [165]:
# Display the feature matrix (every column except "target").
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,2,120,211,0,0,115,0,1.5,1,0,2
298,44,0,2,108,141,0,1,175,0,0.6,1,0,2
299,52,1,0,128,255,0,1,161,1,0.0,2,1,3
300,59,1,3,160,273,0,0,125,0,0.0,2,0,2


In [166]:
# Display the label series (the "target" column).
Y

0      0
1      0
2      0
3      0
4      0
      ..
297    1
298    1
299    0
300    0
301    0
Name: target, Length: 302, dtype: int64

In [167]:
# Split into training and test sets: 20% of the rows are held out for testing,
# stratifying on Y keeps the class proportions close to those of the full data,
# and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)
print(X.shape, X_train.shape, X_test.shape)

(302, 13) (241, 13) (61, 13)


In [168]:
# Model training

# Standardize the features. The scaler's statistics are learned from the
# training split only, so the same fitted scaler can be applied to other data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

model = LogisticRegression()
model.fit(X_train_scaled, Y_train)

# Accuracy on the data the model was fitted on. accuracy_score takes the true
# labels first and the predictions second; keeping that order matters as soon
# as the call is reused for a metric that is not symmetric.
X_train_prediction = model.predict(X_train_scaled)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
training_data_accuracy

0.8547717842323651

In [169]:
# Model testing

# Apply the scaling that was fitted on the training split; the scaler is only
# transformed here, not refitted on the test data.
X_test_scaled = scaler.transform(X_test)

# Accuracy on the held-out split. accuracy_score takes the true labels first
# and the predictions second.
X_test_prediction = model.predict(X_test_scaled)
testing_data_accuracy = accuracy_score(Y_test, X_test_prediction)
testing_data_accuracy

0.819672131147541

In [170]:
# Predicting for a single new person

# Feature values, listed in the same order as the training columns:
#   age      - age
#   sex      - 0 = female, 1 = male
#   cp       - chest pain type (0-3)
#   trestbps - blood pressure
#   chol     - cholesterol level
#   fbs      - blood sugar (0 = <120 mg/dl, 1 = >=120 mg/dl)
#   restecg  - electrocardiography result (0-2)
#   thalach  - maximum heart rate
#   exang    - exercise test result (0 = absent, 1 = present)
#   oldpeak  - ST value (ST change on the ECG)
#   slope    - slope of the ST segment (0-2)
#   ca       - number of major vessels (0-4)
#   thal     - thal test result (1 = normal, 2 = stabilized defect, 3 = permanent defect)
# Each value can be collected interactively with int(input(...)) (float(input(...))
# for oldpeak) instead of being hard-coded.
input_data = (60, 1, 0, 117, 230, 1, 1, 160, 1, 1.4, 2, 2, 3)
input_data_as_numpy_array = np.asarray(input_data)

# Label the single row with the column names the scaler recorded during fit.
# This lets scikit-learn validate the features instead of warning about missing
# names, and a wrong number of values fails loudly when the frame is built.
input_data_frame = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1),
    columns=scaler.feature_names_in_,
)
input_data_scaled = scaler.transform(input_data_frame)

prediction = model.predict(input_data_scaled)

# Class 0 prints the "no heart disease" message, any other class the
# "diseased heart" message (messages are shown to the user in Polish).
if prediction[0] == 0:
    print("Osoba o podanych parametrach nie ma choroby serca")
else:
    print("Osoba o podanych parametrach ma chore serce")

Osoba o podanych parametrach nie ma choroby serca
