In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [15]:
# X = np.array([[1,10,2],[1,15,3],[1,12,2],[1,20,4],[1,8,1]])
# Y = np.array([[130],[180],[140],[250],[100]])
# Z = X.T.dot(X)
# p = np.linalg.inv(Z).dot(X.T)
# p.dot(Y)

In [17]:

# Load the dataset
file_path = "Heart.csv"
print(f"Loading data from {file_path}")
df = pd.read_csv(file_path)


# Display the first few rows
print("First few rows of the dataset:")
df.head()



Loading data from Heart.csv
First few rows of the dataset:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
# Display the last few rows
print("\nLast few rows of the dataset:")
df.tail(3)


Last few rows of the dataset:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
296,45.0,0,2060,1,60,0,742000.0,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.0,1.4,140,1,1,280,0
298,50.0,0,196,0,45,0,395000.0,1.6,136,1,1,285,0


In [6]:
# Check info
print("\ninfo of this dataset:")
df.info()


info of this dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.

In [18]:
# In this dataset, missing values are represented as zeros in certain columns
# columns_for_replacing = ['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']
# for column in columns_for_replacing:
#     df[column].replace(0,df[column].mean(),inplace = True)

# Mean imputation for numerical columns
imputer_mean = SimpleImputer(strategy='mean')
data_mean_imputed = df.copy()
data_mean_imputed[['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']] = imputer_mean.fit_transform(df[['age','creatinine_phosphokinase','ejection_fraction','platelets','serum_creatinine','serum_sodium','time']])

print('Dataset after mean imputation:')
data_mean_imputed.head()

Dataset after mean imputation:


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582.0,0,20.0,1,265000.0,1.9,130.0,1,0,4.0,1
1,55.0,0,7861.0,0,38.0,0,263358.03,1.1,136.0,1,0,6.0,1
2,65.0,0,146.0,0,20.0,0,162000.0,1.3,129.0,1,1,7.0,1
3,50.0,1,111.0,0,20.0,0,210000.0,1.9,137.0,1,0,7.0,1
4,65.0,1,160.0,1,20.0,0,327000.0,2.7,116.0,0,0,8.0,1


In [10]:
# Normalize the data
scaler = StandardScaler()


#drop target variable from X
X = df.drop('diabetes',axis = 1)

#create "y" containing outcome target variable
y = df['diabetes']

X_scaled = scaler.fit_transform(X)

In [22]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=34)

In [23]:
# Train logistic regression model using Scikit-Learn
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model

y_pred = model.predict(X_test)

In [24]:
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f1 = f1_score(y_test,y_pred)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.6000
Precision: 0.5484
Recall: 0.4359
F1-score: 0.4857


In [34]:
X_new = np.array([[45,0,582,0,14,166000,0.8,127,1,0,14,1],[65,1,52,0,25,276000,1.3,137,0,0,16,0],[60,0,2656,1,30,305000,2.3,137,1,0,30,0],[85,0,212,0,38,186000,0.9,136,1,0,187,0]])

In [35]:
# Normalize the new data
new_data_scaled = scaler.fit_transform(X_new)

# Make predictions
new_predictions = model.predict(new_data_scaled)

print("Predictions for new data points:", new_predictions)

Predictions for new data points: [1 1 0 0]
