In [162]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

[Chosen Dataset](https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)

In [163]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Kaggle dataset handle and the CSV file to read from it.
dataset_handle = "fedesoriano/heart-failure-prediction"
dataset_file = "heart.csv"

# Load the file straight into a pandas DataFrame via kagglehub's pandas adapter.
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, dataset_file)

Using Colab cache for faster access to the 'heart-failure-prediction' dataset.


In [164]:
# Preview the first rows of the freshly loaded dataset.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


Let's check the column dtypes to see which columns need to be encoded.

In [165]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


Encode every column of dtype `object` (i.e. the string columns) as integer codes.

In [166]:
# Alternative (not used): an ordinal encoding that keeps the natural order of ST_Slope.
# df['ST_Slope'] = df['ST_Slope'].replace({'Up': 1, 'Flat': 0, 'Down': -1})

In [167]:
# Integer-encode every text column in place so that the numeric steps that
# follow (correlation, model fitting) can consume it. pd.factorize numbers the
# categories in order of first appearance, so the codes carry no ordinal meaning.
for column in df.columns:
  values = df[column]
  # Match both the legacy `object` dtype and pandas' dedicated string dtypes;
  # comparing against 'object' alone would silently skip the latter.
  if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
    codes, _ = pd.factorize(values)
    df[column] = codes

In [168]:
# Preview the first rows again to inspect the result of the encoding step.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0


In [169]:
# Summary statistics for every column of the frame.
# NOTE(review): the reported minimum of 0 for RestingBP and Cholesterol looks like
# a missing-value placeholder rather than a real measurement — confirm before modelling.
df.describe()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,0.21024,1.45207,132.396514,198.799564,0.233115,0.603486,136.809368,0.404139,0.887364,0.638344,0.553377
std,9.432617,0.407701,0.851832,18.514154,109.384145,0.423046,0.805968,25.460334,0.490992,1.06657,0.607056,0.497414
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,0.0,-2.6,0.0,0.0
25%,47.0,0.0,1.0,120.0,173.25,0.0,0.0,120.0,0.0,0.0,0.0,0.0
50%,54.0,0.0,2.0,130.0,223.0,0.0,0.0,138.0,0.0,0.6,1.0,1.0
75%,60.0,0.0,2.0,140.0,267.0,0.0,1.0,156.0,1.0,1.5,1.0,1.0
max,77.0,1.0,3.0,200.0,603.0,1.0,2.0,202.0,1.0,6.2,2.0,1.0


Studying the absolute correlation of each feature with the target (`HeartDisease`).

In [170]:
# Absolute correlation of every column with the target, strongest first.
target_corr = df.corr()['HeartDisease'].abs()
target_corr.sort_values(ascending=False)

Unnamed: 0,HeartDisease
HeartDisease,1.0
ST_Slope,0.558771
ExerciseAngina,0.494282
ChestPainType,0.459017
Oldpeak,0.403951
MaxHR,0.400421
Sex,0.305445
Age,0.282039
FastingBS,0.267291
Cholesterol,0.232741


In [171]:
# Reduced frame for the second experiment: the single feature most correlated
# with the target, plus the target itself.
df_second = df.loc[:, ['ST_Slope', 'HeartDisease']]

In [172]:
# Separate the predictors from the target column.
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

Again, standardizing the features (using the mean and standard deviation of the training set) to prevent overflow in the sigmoid's exponential.

In [173]:
# Standardize with statistics taken from the training split only, so that
# nothing about the test rows leaks into the preprocessing.
X_mean = X_train.mean(axis=0)
# A constant column has zero spread; dividing by it would fill the column with
# NaN/inf, so such columns are scaled by 1 instead (they become all zeros in
# the training split).
X_std = X_train.std(axis=0).replace(0, 1)

X_train = (X_train - X_mean) / X_std
X_test  = (X_test  - X_mean) / X_std

The Logistic Regression:

In [174]:
class Log_reg:
    """Binary logistic regression trained with full-batch gradient descent.

    A bias term is learned by appending a column of ones to the feature
    matrix, so ``weights`` (set by ``fit``) holds one coefficient per feature
    followed by the intercept as its last entry.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradient at every iteration.
    max_iter : int or float
        Number of gradient-descent iterations run by ``fit``. Converted with
        ``int()``, so a float such as ``1e+2`` is accepted.
    """

    def __init__(self, learning_rate = 5e-2, max_iter = 1e+2):
        self.__learning_rate = learning_rate
        # The default (1e+2) is a float, which range() rejects; store an int.
        self.__max_iter = int(max_iter)

    def sigmoid(self, y):
        """Return the logistic function ``1 / (1 + exp(-y))`` element-wise."""
        # Clip the argument so np.exp cannot overflow float64. Inside the
        # clipping range the result is unchanged; outside it the sigmoid is
        # already within 1e-200 of its limit (0 or 1).
        return 1 / (1 + np.exp(-np.clip(y, -500, 500)))

    def _add_bias(self, X):
        """Return ``X`` as a 2-D float array with a trailing column of ones."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")
        return np.hstack((X, np.ones((X.shape[0], 1))))

    def _positive_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self.sigmoid(np.dot(self._add_bias(X), self.weights))

    def fit(self, X, y):
        """Learn the weights from features ``X`` and 0/1 labels ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Anything ``np.asarray`` can convert (ndarray, DataFrame, nested list).
        y : array-like of n_samples labels
            Flattened to one dimension before use.

        Returns
        -------
        Log_reg
            The fitted instance, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = self._add_bias(X)
        # Flatten so a column vector cannot broadcast (pred - y) into a matrix.
        y = np.asarray(y, dtype=float).ravel()
        if X.shape[0] == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.size != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} rows but y has {y.size} labels"
            )

        # One weight per feature plus the bias (the appended ones column).
        self.weights = np.zeros(X.shape[1])

        for _ in range(self.__max_iter):
            pred = self.sigmoid(np.dot(X, self.weights))
            # Gradient of the mean log-loss with respect to the weights.
            gradient = np.dot(X.T, (pred - y)) / y.size
            self.weights -= gradient * self.__learning_rate

        return self

    def predict_proba(self, X):
        """Return an (n_samples, 2) array: column 0 is P(class 0), column 1 is P(class 1)."""
        prob = self._positive_proba(X)
        return np.column_stack((1 - prob, prob))

    def predict(self, X):
        """Return 0/1 labels; a row is labelled 1 when its P(class 1) exceeds 0.5."""
        return (self._positive_proba(X) > 0.5) * 1

In [179]:
# Reference model: scikit-learn's logistic regression, fitted on the training split.
logr1 = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = logr1.predict(X_test)
accuracy_score(y_test.values, y_pred)

0.875

In [183]:
# Same experiment with the from-scratch model, which is fed plain NumPy arrays
# (hence the .values conversions).
logr2 = Log_reg(max_iter=500).fit(X_train.values, y_train.values)

# Accuracy on the held-out split.
y_pred = logr2.predict(X_test.values)
accuracy_score(y_test.values, y_pred)

0.875

The score is exactly the same for both models: the sklearn one and the from-scratch one.

In [185]:
# Repeat the split for the reduced frame: predictors first, then the target.
X = df_second.drop(columns=['HeartDisease'])
y = df_second['HeartDisease']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)

In [186]:
# Standardize with statistics taken from the training split only, so that
# nothing about the test rows leaks into the preprocessing.
X_mean = X_train.mean(axis=0)
# A constant column has zero spread; dividing by it would fill the column with
# NaN/inf, so such columns are scaled by 1 instead (they become all zeros in
# the training split).
X_std = X_train.std(axis=0).replace(0, 1)

X_train = (X_train - X_mean) / X_std
X_test  = (X_test  - X_mean) / X_std

In [187]:
# scikit-learn baseline, refitted on the current training split.
logr1 = LogisticRegression(max_iter=500).fit(X_train, y_train)

# Accuracy on the held-out split.
y_pred = logr1.predict(X_test)
accuracy_score(y_test.values, y_pred)

0.7771739130434783

In [188]:
# From-scratch model, refitted on the current training split (as NumPy arrays).
logr2 = Log_reg(max_iter=500).fit(X_train.values, y_train.values)

# Accuracy on the held-out split.
y_pred = logr2.predict(X_test.values)
accuracy_score(y_test.values, y_pred)

0.7771739130434783

Overall, the from-scratch implementation is as good as the sklearn one!

After keeping only the most highly correlated feature (`ST_Slope`), the performance has dropped. This is because the remaining features, even though they correlate less strongly with the target, still provide useful information when combined with other features.