In [None]:
'''About this dataset
Age : Age of the patient

Sex : Sex of the patient

exang: exercise induced angina (1 = yes; 0 = no)

ca: number of major vessels (0-3)

cp : chest pain type

Value 1: typical angina
Value 2: atypical angina
Value 3: non-anginal pain
Value 4: asymptomatic
trtbps : resting blood pressure (in mm Hg)

chol : cholesterol in mg/dl fetched via BMI sensor

fbs : (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)

rest_ecg : resting electrocardiographic results

Value 0: normal
Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
thalach : maximum heart rate achieved

target : 0= less chance of heart attack 1= more chance of heart attack

age - age in years
sex - sex (1 = male; 0 = female)
cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 4 = asymptomatic)
trestbps - resting blood pressure (in mm Hg on admission to the hospital)
chol - serum cholesterol in mg/dl
fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
restecg - resting electrocardiographic results (0 = normal; 1 = having ST-T; 2 = hypertrophy)
thalach - maximum heart rate achieved
exang - exercise induced angina (1 = yes; 0 = no)
oldpeak - ST depression induced by exercise relative to rest
slope - the slope of the peak exercise ST segment (1 = upsloping; 2 = flat; 3 = downsloping)
ca - number of major vessels (0-3) colored by fluoroscopy
thal - 3 = normal; 6 = fixed defect; 7 = reversible defect
num - the predicted attribute - diagnosis of heart disease (angiographic disease status) 
(Value 0 = < 50% diameter narrowing; Value 1 = > 50% diameter narrowing)

'''

In [3]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
#solvers default:'lbfgs','newton-cg','liblinear','sag','saga'
import matplotlib.pyplot as plt
import matplotlib
# Apply the ggplot theme to every figure drawn later in the notebook.
plt.style.use('ggplot')

# Load the dataset; the file is parsed as CSV even though its extension is .xls.
df = pd.read_csv('heart.xls')

# Narrow views of the frame: a three-column DataFrame and a four-column NumPy array.
small_df = df[
    ['age', 'sex', 'output']
]
arr = df[
    ['age', 'sex', 'cp', 'output']
].to_numpy()

# Preview the first rows as the cell's displayed result.
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
# Show the dtype pandas inferred for each column of the loaded frame.
df.dtypes

age           int64
sex           int64
cp            int64
trtbps        int64
chol          int64
fbs           int64
restecg       int64
thalachh      int64
exng          int64
oldpeak     float64
slp           int64
caa           int64
thall         int64
output        int64
dtype: object

In [4]:
# Report, for every column, the percentage of rows whose value is missing.
# isnull().mean() is a fraction in [0, 1]; it is scaled by 100 (and rounded to
# two decimals) so the number agrees with the '%' sign in the printed message.
for col in df.columns:
    missing = round(df[col].isnull().mean() * 100, 2)
    print('{}  -  {}%'.format(col, missing))

age  -  0.0%
sex  -  0.0%
cp  -  0.0%
trtbps  -  0.0%
chol  -  0.0%
fbs  -  0.0%
restecg  -  0.0%
thalachh  -  0.0%
exng  -  0.0%
oldpeak  -  0.0%
slp  -  0.0%
caa  -  0.0%
thall  -  0.0%
output  -  0.0%


In [7]:
# Oldest patients first; the sorted copy is only displayed, `df` itself is not reassigned.
df.sort_values('age', ascending=False)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
238,77,1,0,125,304,0,0,162,1,0.0,2,3,2,0
144,76,0,2,140,197,0,2,116,0,1.1,1,0,2,1
129,74,0,1,120,269,0,0,121,1,0.2,2,1,2,1
151,71,0,0,112,149,0,1,125,0,1.6,1,0,2,1
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,35,1,0,126,282,0,0,156,1,0.0,2,0,3,0
227,35,1,0,120,198,0,1,130,1,1.6,1,0,3,0
125,34,0,1,118,210,0,1,192,0,0.7,2,0,2,1
58,34,1,3,118,182,0,0,174,0,0.0,2,0,2,1


In [85]:

# Feature matrix: the thirteen predictor columns (everything except `output`).
x = df[['age','sex','cp','trtbps','chol','fbs','restecg','thalachh','exng','oldpeak','slp','caa','thall']]
# Label vector: the `output` column as a NumPy array.
y = df['output'].values

# Logistic regression with the liblinear solver selected explicitly.
# Other solver names accepted by LogisticRegression: 'lbfgs', 'newton-cg', 'sag', 'saga'.
model = LogisticRegression(solver='liblinear')
model.fit(x, y)

# Predict on the same rows the model was fitted on: every number printed below
# is therefore an in-sample (training) figure, not a held-out estimate.
y_pred = model.predict(x)

# Fitted coefficient per feature, followed by the intercept.
print(pd.DataFrame({"Feature":x.columns.tolist(),"Coefficients":model.coef_[0]}))
print(model.intercept_)
# Number of correctly classified rows, then the same quantity as mean accuracy.
print((y == y_pred).sum())
print(model.score(x,y))
# First twenty predictions next to the first twenty true labels.
print(y_pred[:20])
print(y[:20])


     Feature  Coefficients
0        age      0.009039
1        sex     -1.377614
2         cp      0.802448
3     trtbps     -0.015460
4       chol     -0.003290
5        fbs     -0.010099
6    restecg      0.459801
7   thalachh      0.029403
8       exng     -0.784187
9    oldpeak     -0.508439
10       slp      0.519474
11       caa     -0.729613
12     thall     -0.802394
[0.40801558]
258
0.8514851485148515
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [23]:
# Feature subset; these appear to be the columns whose coefficients in the
# table printed by the model cell have magnitude above 0.5.
# NOTE(review): the recorded output under this cell (an intercept and a score)
# is not produced by this statement - presumably left over from an earlier
# version of the cell that refitted a model on this subset; confirm and re-run.
high_coef = df[
    ['sex', 'cp', 'exng', 'oldpeak', 'slp', 'caa', 'thall']
]

[1.6053064]
0.8415841584158416


In [None]:
from sklearn.model_selection import train_test_split

