# Logistic regression Step by Step for predicting Heart disease

##  Importing Libraries

In [1]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Importing dataset

In [2]:
# Location of the Framingham CSV. Set the FRAMINGHAM_CSV environment variable to
# point at your own copy; if it is unset, the original author's local path is used.
DATA_PATH = os.environ.get(
    'FRAMINGHAM_CSV',
    r'C:\Users\ankus\OneDrive\Desktop\Naresh IT\April\28th,29th_April\framingham.csv',
)
dataset = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first three rows to confirm the file loaded with the expected columns.
dataset.head(3)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0


# Variables
Response
* TenYearCHD: 10-year risk of coronary heart disease (1/0)

Explanatory (a selection; the model below uses every column except the response)
* male: 1: male, 0: female
* age: age of participant
* sysBP: systolic blood pressure
* totChol: total cholesterol level

# Update some column names
* male -> gender (1 = male, 0 = female)
* TenYearCHD -> HD (heart disease)

In [4]:
# Give two columns clearer names: 'male' becomes 'gender', 'TenYearCHD' becomes 'HD'.
new_names = {'male': 'gender', 'TenYearCHD': 'HD'}
dataset = dataset.rename(columns=new_names)
dataset.head(1)

Unnamed: 0,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,HD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0


In [5]:
# (rows, columns) of the loaded table.
dataset.shape

(4238, 16)

In [6]:
# Summary statistics per column; a 'count' below the total row count flags missing values.
dataset.describe()

Unnamed: 0,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,HD
count,4238.0,4238.0,4133.0,4238.0,4209.0,4185.0,4238.0,4238.0,4238.0,4188.0,4238.0,4238.0,4219.0,4237.0,3850.0,4238.0
mean,0.429212,49.584946,1.97895,0.494101,9.003089,0.02963,0.005899,0.310524,0.02572,236.721585,132.352407,82.893464,25.802008,75.878924,81.966753,0.151958
std,0.495022,8.57216,1.019791,0.500024,11.920094,0.169584,0.076587,0.462763,0.158316,44.590334,22.038097,11.91085,4.080111,12.026596,23.959998,0.359023
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,107.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.07,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.4,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,144.0,89.875,28.04,83.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,696.0,295.0,142.5,56.8,143.0,394.0,1.0


In [7]:
# Column dtypes and non-null counts.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4238 entries, 0 to 4237
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   gender           4238 non-null   int64  
 1   age              4238 non-null   int64  
 2   education        4133 non-null   float64
 3   currentSmoker    4238 non-null   int64  
 4   cigsPerDay       4209 non-null   float64
 5   BPMeds           4185 non-null   float64
 6   prevalentStroke  4238 non-null   int64  
 7   prevalentHyp     4238 non-null   int64  
 8   diabetes         4238 non-null   int64  
 9   totChol          4188 non-null   float64
 10  sysBP            4238 non-null   float64
 11  diaBP            4238 non-null   float64
 12  BMI              4219 non-null   float64
 13  heartRate        4237 non-null   float64
 14  glucose          3850 non-null   float64
 15  HD               4238 non-null   int64  
dtypes: float64(9), int64(7)
memory usage: 529.9 KB


In [8]:
# Show the first row again as a reminder of the column layout before splitting.
dataset.head(1)

Unnamed: 0,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,HD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0


## Split the data

In [9]:
# We have to predict the HD column given the features.
X = dataset.drop(['HD'], axis = 1) # independent variables (every column except the HD target)
y = dataset[['HD']] # dependent variable; the double brackets keep it as a one-column DataFrame

In [10]:
# First two rows of the feature table.
X.head(2)

Unnamed: 0,gender,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0


In [11]:
# First two rows of the target.
y.head(2)

Unnamed: 0,HD
0,0
1,0


# Taking care of Missing Values and Null values

In [12]:
# Number of missing values in each feature column.
X.isnull().sum()

gender               0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
dtype: int64

In [13]:
# Total number of missing cells across all feature columns.
X.isnull().sum().sum()

645

# Fill missing (NaN) values column by column using the column mean

In [14]:
# Impute missing 'education' values with the column mean.
# The result is assigned back rather than calling fillna(inplace=True) on
# X['education']: the in-place form operates on an intermediate Series, which
# pandas warns about and which leaves X unchanged once Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows, i.e. before the train/test split.
X['education'] = X['education'].fillna(X['education'].mean())

In [15]:
# Re-check the per-column missing counts after imputing 'education'.
X.isnull().sum()

gender               0
age                  0
education            0
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
dtype: int64

In [16]:
# Mean-impute the remaining columns that still contain NaNs.
# fillna() receives a Series of per-column means, so each column is filled with
# its own mean. The result is assigned back to X instead of using
# X[col].fillna(..., inplace=True), which acts on an intermediate Series and
# leaves X unchanged once pandas Copy-on-Write is enabled.
# NOTE(review): the means are computed over all rows, i.e. before the train/test split.
mean_filled_cols = ['cigsPerDay', 'BPMeds', 'totChol', 'BMI', 'heartRate', 'glucose']
X[mean_filled_cols] = X[mean_filled_cols].fillna(X[mean_filled_cols].mean())

# Confirm that no missing values remain.
X.isnull().sum()

gender             0
age                0
education          0
currentSmoker      0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
dtype: int64

# Splitting the dataset into training and test sets

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Feature Scaling for Improving model Performance

In [18]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both the training and the test features.
sc_x = StandardScaler()
sc_x.fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

# Model building with logistic regression

In [19]:
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression()
# y_train derives from y = dataset[['HD']], a one-column table. Flatten it to a
# 1-D array so fit() receives labels of shape (n_samples,) and does not emit a
# DataConversionWarning about a column-vector y.
classifier.fit(x_train, np.ravel(y_train))

  return f(**kwargs)


LogisticRegression()

# Predicting test set results

In [20]:
# Predicted class label for each row of the scaled test features.
y_pred = classifier.predict(x_test)

# Evaluating Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix

# Rows correspond to the true classes and columns to the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[708   2]
 [129   9]]


# Accuracy of Model

In [22]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the true label.
ac = accuracy_score(y_true=y_test, y_pred=y_pred)
print(ac)

0.8455188679245284


# Training vs. test accuracy (overfitting check)

In [23]:
# classifier.score() returns the mean accuracy on the data it is given, so these
# two numbers are the training-set and test-set accuracies. They are not the
# statistical bias and variance of the model; a large gap between them would
# point to overfitting.
train_accuracy = classifier.score(x_train, y_train)
test_accuracy = classifier.score(x_test, y_test)

# Legacy names kept so any later cell that reads `bias` / `variance` still runs.
bias = train_accuracy
variance = test_accuracy

print(f"Train accuracy: {train_accuracy}")
print(f"Test accuracy:  {test_accuracy}")

0.8592920353982301
0.8455188679245284
