## Importing Relevant Libraries

In [74]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Loading the data

In [12]:
raw_data = pd.read_csv('C:\\Users\\yetne\\Desktop\\heart1.csv')

In [13]:
raw_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [179]:
## Attribute Information
#1.age
#2.sex
#3.chest pain type (4 values)
#4.resting blood pressure
#5.serum cholestoral in mg/dl
#6.fasting blood sugar > 120 mg/dl
#7.resting electrocardiographic results (values 0,1,2)
#8.maximum heart rate achieved
#9.exercise induced angina(Angina is a type of pain that occurs when not enough blood flows to the heart muscle.)
#10.oldpeak = ST depression induced by exercise relative to rest
#11.the slope of the peak exercise ST segment
#12.number of major vessels (0-3) colored by flourosopy
#13.thal(A blood disorder involving lower-than-normal amounts of an oxygen-carrying protein.): 3 = normal; 6 = fixed defect; 7 = reversable defect}

In [18]:
data = raw_data.copy()

## Checking for missing data

In [165]:
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

This means that there are no missing values in the dataset

In [19]:
data.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## Getting dummies of categorical variables

In [23]:
cp_type = pd.get_dummies(data['cp'],drop_first = True)

In [24]:
cp_type

Unnamed: 0,1,2,3
0,0,0,1
1,0,1,0
2,1,0,0
3,1,0,0
4,0,0,0
...,...,...,...
298,0,0,0
299,0,0,1
300,0,0,0
301,0,0,0


In [39]:
slope_dummies = pd.get_dummies(data['slope'], drop_first = True)
slope_dummies

Unnamed: 0,1,2
0,0,0
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
298,1,0
299,1,0
300,1,0
301,1,0


In [40]:
thal_dummies = pd.get_dummies(data['thal'], drop_first = True)
thal_dummies

Unnamed: 0,1,2,3
0,1,0,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
298,0,0,1
299,0,0,1
300,0,0,1
301,0,0,1


## Adding dummies

In [120]:
data_with_dummies = data.copy()

In [121]:
data_with_dummies =pd.concat([data_with_dummies,cp_type,slope_dummies,thal_dummies],axis=1)
data_with_dummies

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,thal,target,1,2,3,1.1,2.1,1.2,2.2,3.1
0,63,1,3,145,233,1,0,150,0,2.3,...,1,1,0,0,1,0,0,1,0,0
1,37,1,2,130,250,0,1,187,0,3.5,...,2,1,0,1,0,0,0,0,1,0
2,41,0,1,130,204,0,0,172,0,1.4,...,2,1,1,0,0,0,1,0,1,0
3,56,1,1,120,236,0,1,178,0,0.8,...,2,1,1,0,0,0,1,0,1,0
4,57,0,0,120,354,0,1,163,1,0.6,...,2,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,...,3,0,0,0,0,1,0,0,0,1
299,45,1,3,110,264,0,1,132,0,1.2,...,3,0,0,0,1,1,0,0,0,1
300,68,1,0,144,193,1,1,141,0,3.4,...,3,0,0,0,0,1,0,0,0,1
301,57,1,0,130,131,0,1,115,1,1.2,...,3,0,0,0,0,1,0,0,0,1


## Renaming dummy variables

In [122]:
data_with_dummies.columns.values

array(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target', 1,
       2, 3, 1, 2, 1, 2, 3], dtype=object)

In [123]:
new_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal','target', 'cp1',
       'cp2', 'cp3', 'slope1', 'slope2', 'thal1', 'thal2', 'thal3']
data_with_dummies.columns = new_columns
data_with_dummies

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,thal,target,cp1,cp2,cp3,slope1,slope2,thal1,thal2,thal3
0,63,1,3,145,233,1,0,150,0,2.3,...,1,1,0,0,1,0,0,1,0,0
1,37,1,2,130,250,0,1,187,0,3.5,...,2,1,0,1,0,0,0,0,1,0
2,41,0,1,130,204,0,0,172,0,1.4,...,2,1,1,0,0,0,1,0,1,0
3,56,1,1,120,236,0,1,178,0,0.8,...,2,1,1,0,0,0,1,0,1,0
4,57,0,0,120,354,0,1,163,1,0.6,...,2,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,...,3,0,0,0,0,1,0,0,0,1
299,45,1,3,110,264,0,1,132,0,1.2,...,3,0,0,0,1,1,0,0,0,1
300,68,1,0,144,193,1,1,141,0,3.4,...,3,0,0,0,0,1,0,0,0,1
301,57,1,0,130,131,0,1,115,1,1.2,...,3,0,0,0,0,1,0,0,0,1


## Dropping old variables

In [124]:
data_with_dummies = data_with_dummies.drop('cp', axis = 1)
data_with_dummies = data_with_dummies.drop('slope', axis = 1)
data_with_dummies = data_with_dummies.drop('thal', axis = 1)
data_with_dummies

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,target,cp1,cp2,cp3,slope1,slope2,thal1,thal2,thal3
0,63,1,145,233,1,0,150,0,2.3,0,1,0,0,1,0,0,1,0,0
1,37,1,130,250,0,1,187,0,3.5,0,1,0,1,0,0,0,0,1,0
2,41,0,130,204,0,0,172,0,1.4,0,1,1,0,0,0,1,0,1,0
3,56,1,120,236,0,1,178,0,0.8,0,1,1,0,0,0,1,0,1,0
4,57,0,120,354,0,1,163,1,0.6,0,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,1,123,1,0.2,0,0,0,0,0,1,0,0,0,1
299,45,1,110,264,0,1,132,0,1.2,0,0,0,0,1,1,0,0,0,1
300,68,1,144,193,1,1,141,0,3.4,2,0,0,0,0,1,0,0,0,1
301,57,1,130,131,0,1,115,1,1.2,1,0,0,0,0,1,0,0,0,1


## Rearranging target feature

In [125]:
data_with_dummies.columns.values

array(['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'ca', 'target', 'cp1', 'cp2', 'cp3', 'slope1',
       'slope2', 'thal1', 'thal2', 'thal3'], dtype=object)

In [126]:
cols = ['age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'ca','cp1', 'cp2', 'cp3', 'slope1',
       'slope2', 'thal1', 'thal2', 'thal3', 'target']
data_with_dummies = data_with_dummies[cols]

In [127]:
data_with_dummies

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,cp1,cp2,cp3,slope1,slope2,thal1,thal2,thal3,target
0,63,1,145,233,1,0,150,0,2.3,0,0,0,1,0,0,1,0,0,1
1,37,1,130,250,0,1,187,0,3.5,0,0,1,0,0,0,0,1,0,1
2,41,0,130,204,0,0,172,0,1.4,0,1,0,0,0,1,0,1,0,1
3,56,1,120,236,0,1,178,0,0.8,0,1,0,0,0,1,0,1,0,1
4,57,0,120,354,0,1,163,1,0.6,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,140,241,0,1,123,1,0.2,0,0,0,0,1,0,0,0,1,0
299,45,1,110,264,0,1,132,0,1.2,0,0,0,1,1,0,0,0,1,0
300,68,1,144,193,1,1,141,0,3.4,2,0,0,0,1,0,0,0,1,0
301,57,1,130,131,0,1,115,1,1.2,1,0,0,0,1,0,0,0,1,0


## Saving the preprocessed file as csv

In [128]:
data_with_dummies.to_csv('heart_disease_preprocessed',index = False)

## Creating inputs and targets for model

In [129]:
data_preprocessed = data_with_dummies

In [130]:
inputs = data_preprocessed.iloc[:,1:-1]
inputs

Unnamed: 0,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,ca,cp1,cp2,cp3,slope1,slope2,thal1,thal2,thal3
0,1,145,233,1,0,150,0,2.3,0,0,0,1,0,0,1,0,0
1,1,130,250,0,1,187,0,3.5,0,0,1,0,0,0,0,1,0
2,0,130,204,0,0,172,0,1.4,0,1,0,0,0,1,0,1,0
3,1,120,236,0,1,178,0,0.8,0,1,0,0,0,1,0,1,0
4,0,120,354,0,1,163,1,0.6,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0,140,241,0,1,123,1,0.2,0,0,0,0,1,0,0,0,1
299,1,110,264,0,1,132,0,1.2,0,0,0,1,1,0,0,0,1
300,1,144,193,1,1,141,0,3.4,2,0,0,0,1,0,0,0,1
301,1,130,131,0,1,115,1,1.2,1,0,0,0,1,0,0,0,1


In [131]:
targets = data_preprocessed.iloc[:,-1]
targets

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

## Scaling the inputs

In [132]:
scaler = StandardScaler()

In [133]:
scaler.fit(inputs)
scaled_inputs = scaler.transform(inputs)

In [134]:
scaled_inputs

array([[ 0.68100522,  0.76395577, -0.25633371, ...,  3.97911213,
        -1.10076284, -0.79311554],
       [ 0.68100522, -0.09273778,  0.07219949, ..., -0.25131234,
         0.9084609 , -0.79311554],
       [-1.46841752, -0.09273778, -0.81677269, ..., -0.25131234,
         0.9084609 , -0.79311554],
       ...,
       [ 0.68100522,  0.70684287, -1.029353  , ..., -0.25131234,
        -1.10076284,  1.26085034],
       [ 0.68100522, -0.09273778, -2.2275329 , ..., -0.25131234,
        -1.10076284,  1.26085034],
       [-1.46841752, -0.09273778, -0.19835726, ..., -0.25131234,
         0.9084609 , -0.79311554]])

## Creating test train split

In [135]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs,targets)

## Creating logistic regression with SK learn

In [151]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [152]:
reg = LogisticRegression()

In [153]:
reg.fit(x_train,y_train)

LogisticRegression()

In [154]:
reg.score(x_train,y_train)

0.9074889867841409

In [155]:
reg.intercept_

array([0.22869689])

In [156]:
reg.coef_

array([[-0.64986622, -0.32227397, -0.17347065,  0.13898823,  0.28170891,
         0.28390157, -0.46552469, -0.75731012, -1.01075243,  0.1670437 ,
         1.21648575,  0.63563865, -0.09832364,  0.47774704,  0.47827789,
         0.54145361, -0.36726529]])

In [172]:
predicted_probability = reg.predict_proba(x_train)
predicted_probability[:,1]

array([8.36040756e-01, 9.95242180e-01, 9.68128914e-01, 7.60604099e-01,
       9.69455757e-01, 6.54350974e-03, 9.77448386e-01, 2.82858091e-03,
       1.88186433e-01, 3.52333057e-02, 9.23030454e-01, 3.87437741e-02,
       1.35605945e-03, 8.31970604e-03, 1.03589722e-01, 9.98639285e-01,
       1.15782929e-01, 7.16598859e-01, 3.37305455e-03, 5.57685969e-01,
       4.99636041e-02, 6.63472213e-01, 7.46760139e-01, 1.67035918e-01,
       9.13057979e-01, 9.58485861e-01, 2.94371978e-02, 9.89483036e-01,
       6.73149920e-03, 8.77300765e-01, 2.85124464e-02, 4.36765810e-01,
       9.95934516e-01, 7.75381748e-02, 9.41912397e-01, 2.22820821e-01,
       2.85875078e-02, 2.36308903e-03, 9.93390932e-01, 2.84881526e-01,
       8.62347388e-01, 6.40196746e-01, 4.60212672e-01, 7.27186532e-01,
       2.03227945e-01, 7.88746487e-01, 9.64639332e-01, 8.58645783e-01,
       9.90034820e-01, 1.17570865e-03, 9.02746298e-01, 8.80603769e-01,
       1.86213176e-02, 5.47660100e-01, 9.52750810e-01, 3.04582522e-01,
      

## Creating Summary Table

In [157]:
summary_table = pd.DataFrame(columns = ['Features'], data = inputs.columns.values)

In [158]:
summary_table['coefficients'] = np.transpose(reg.coef_)
summary_table

Unnamed: 0,Features,coefficients
0,sex,-0.649866
1,trestbps,-0.322274
2,chol,-0.173471
3,fbs,0.138988
4,restecg,0.281709
5,thalach,0.283902
6,exang,-0.465525
7,oldpeak,-0.75731
8,ca,-1.010752
9,cp1,0.167044


In [159]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['intercept',reg.intercept_[0]]
summary_table = summary_table.sort_index()

In [160]:
summary_table

Unnamed: 0,Features,coefficients
0,intercept,0.228697
1,sex,-0.649866
2,trestbps,-0.322274
3,chol,-0.173471
4,fbs,0.138988
5,restecg,0.281709
6,thalach,0.283902
7,exang,-0.465525
8,oldpeak,-0.75731
9,ca,-1.010752


In [161]:
summary_table['odds ratio'] = np.exp(summary_table.coefficients)
summary_table

Unnamed: 0,Features,coefficients,odds ratio
0,intercept,0.228697,1.256961
1,sex,-0.649866,0.522116
2,trestbps,-0.322274,0.7245
3,chol,-0.173471,0.840742
4,fbs,0.138988,1.149111
5,restecg,0.281709,1.325393
6,thalach,0.283902,1.328302
7,exang,-0.465525,0.627806
8,oldpeak,-0.75731,0.468926
9,ca,-1.010752,0.363945


In [162]:
summary_table.sort_values('odds ratio', ascending = False)

Unnamed: 0,Features,coefficients,odds ratio
11,cp2,1.216486,3.375305
12,cp3,0.635639,1.888228
16,thal2,0.541454,1.718503
15,thal1,0.478278,1.613294
14,slope2,0.477747,1.612438
6,thalach,0.283902,1.328302
5,restecg,0.281709,1.325393
0,intercept,0.228697,1.256961
10,cp1,0.167044,1.181806
4,fbs,0.138988,1.149111


## Testing the model

In [163]:
reg.score(x_test,y_test)

0.7763157894736842

In [167]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[8.21916065e-01, 1.78083935e-01],
       [9.97645637e-01, 2.35436251e-03],
       [5.64615913e-01, 4.35384087e-01],
       [5.00073035e-01, 4.99926965e-01],
       [9.38762586e-01, 6.12374140e-02],
       [5.13244723e-01, 4.86755277e-01],
       [9.95236590e-01, 4.76340958e-03],
       [9.22518553e-01, 7.74814475e-02],
       [1.36246580e-02, 9.86375342e-01],
       [3.86060465e-01, 6.13939535e-01],
       [9.78723272e-01, 2.12767285e-02],
       [9.68440738e-01, 3.15592622e-02],
       [1.15483021e-01, 8.84516979e-01],
       [2.03637505e-03, 9.97963625e-01],
       [9.93417154e-01, 6.58284585e-03],
       [5.75305530e-03, 9.94246945e-01],
       [2.09533796e-01, 7.90466204e-01],
       [9.94442383e-01, 5.55761714e-03],
       [7.12273859e-01, 2.87726141e-01],
       [8.89552680e-01, 1.10447320e-01],
       [2.98926540e-01, 7.01073460e-01],
       [8.48339414e-02, 9.15166059e-01],
       [1.49800022e-01, 8.50199978e-01],
       [5.28123938e-02, 9.47187606e-01],
       [9.997824

## Saving the model using Pickle

In [176]:
import pickle

In [177]:
with open('heart_disease_model','wb') as file:
    pickle.dump(reg,file)
    
with open('scaler','wb') as file:
    pickle.dump(scaler,file)