In [3]:
#Importing required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, jaccard_score
from sklearn.model_selection import KFold, cross_val_score

In [4]:
# Read the dataset from heart.csv; the path is relative, so the file must sit in
# the notebook's working directory.
df = pd.read_csv('heart.csv')

In [4]:
# Display the loaded frame; pandas elides the middle rows in its default repr.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


The columns represent:
1) Age
2) Sex
3) Chest Pain Type (4 values)
4) Resting Blood Pressure
5) Serum Cholesterol in mg/dl
6) Fasting blood sugar > 120 mg/dl
7) Resting electrocardiographic results (values 0,1,2)
8) Maximum heart rate achieved
9) Exercise induced angina
10) oldpeak: ST depression induced by exercise relative to rest
11) The slope of the peak exercise ST segment
12) Number of major vessels (0-3) colored by fluoroscopy
13) thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

# Determine Correlation between predictor and response

In [5]:
# Pairwise correlation matrix over all columns (pandas uses Pearson by default).
df.corr()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,1.0,-0.10324,-0.071966,0.271121,0.219823,0.121243,-0.132696,-0.390227,0.088163,0.208137,-0.169105,0.271551,0.072297,-0.229324
sex,-0.10324,1.0,-0.041119,-0.078974,-0.198258,0.0272,-0.055117,-0.049365,0.139157,0.084687,-0.026666,0.111729,0.198424,-0.279501
cp,-0.071966,-0.041119,1.0,0.038177,-0.081641,0.079294,0.043581,0.306839,-0.401513,-0.174733,0.131633,-0.176206,-0.163341,0.434854
trestbps,0.271121,-0.078974,0.038177,1.0,0.127977,0.181767,-0.123794,-0.039264,0.061197,0.187434,-0.120445,0.104554,0.059276,-0.138772
chol,0.219823,-0.198258,-0.081641,0.127977,1.0,0.026917,-0.14741,-0.021772,0.067382,0.06488,-0.014248,0.074259,0.100244,-0.099966
fbs,0.121243,0.0272,0.079294,0.181767,0.026917,1.0,-0.104051,-0.008866,0.049261,0.010859,-0.061902,0.137156,-0.042177,-0.041164
restecg,-0.132696,-0.055117,0.043581,-0.123794,-0.14741,-0.104051,1.0,0.048411,-0.065606,-0.050114,0.086086,-0.078072,-0.020504,0.134468
thalach,-0.390227,-0.049365,0.306839,-0.039264,-0.021772,-0.008866,0.048411,1.0,-0.380281,-0.349796,0.395308,-0.207888,-0.098068,0.422895
exang,0.088163,0.139157,-0.401513,0.061197,0.067382,0.049261,-0.065606,-0.380281,1.0,0.310844,-0.267335,0.107849,0.197201,-0.438029
oldpeak,0.208137,0.084687,-0.174733,0.187434,0.06488,0.010859,-0.050114,-0.349796,0.310844,1.0,-0.575189,0.221816,0.202672,-0.438441


In [5]:
# Correlation of every column with the response column `target`; the `target`
# entry itself is trivially 1.
df.corrwith(df['target'])

age        -0.229324
sex        -0.279501
cp          0.434854
trestbps   -0.138772
chol       -0.099966
fbs        -0.041164
restecg     0.134468
thalach     0.422895
exang      -0.438029
oldpeak    -0.438441
slope       0.345512
ca         -0.382085
thal       -0.337838
target      1.000000
dtype: float64

Thus, judging by the magnitude of each predictor's correlation with `target`, we expect that the most significant factors in our model will be:
* Chest Pain type (cp)
* Maximum heart rate achieved (thalach)
* Exercise induced angina (exang)
* ST depression induced by exercise relative to rest (oldpeak)
* Number of major vessels colored by fluoroscopy (ca)

The factors that will have some effect:
* Slope of the peak exercise ST segment
* thal
* Age
* Sex

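The factors unlikely to be significant:
* Resting Blood Pressure (trestbps)
* Serum cholesterol (chol)
* Resting electrocardiographic results (restecg)
* Fasting blood sugar (fbs), which has the weakest correlation with `target` (-0.04)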

# Todo: Examine collinearity

In [6]:
# Split the frame positionally: every column except the last is a predictor,
# and the last column is the response.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [17]:
#Implementing cross validation
def perform_cross_validation(num_folds, df):
    """Run k-fold cross-validation of a logistic-regression classifier on ``df``.

    The predictors are every column of ``df`` except the last, and the response
    is the last column. For each fold the model is fitted on the training
    partition and scored on the held-out partition with accuracy and the
    Jaccard score.

    Parameters
    ----------
    num_folds : int
        Number of folds, passed to ``KFold(n_splits=...)``.
    df : pandas.DataFrame
        Data to evaluate on; the last column is treated as the response.

    Returns
    -------
    None
        The per-fold scores and their averages are printed.
    """
    # Take the data from the argument instead of the notebook-level X / y, so
    # the function evaluates the frame it is actually given.
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    # No shuffling is requested, so folds are consecutive row ranges.
    kf = KFold(n_splits=num_folds, random_state=None)
    model = LogisticRegression(solver='liblinear')

    acc_score = []
    jacc_score = []

    for train_index, test_index in kf.split(features):
        # kf.split yields positional indices, so select with .iloc on both the
        # predictors and the response; label-based lookup would only be
        # correct when the frame has a default 0..n-1 index.
        X_train, X_test = features.iloc[train_index, :], features.iloc[test_index, :]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)
        acc_score.append(accuracy_score(y_test, pred_values))
        jacc_score.append(jaccard_score(y_test, pred_values))

    avg_acc_score = sum(acc_score)/num_folds
    avg_jacc_score = sum(jacc_score)/num_folds

    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))
    print()
    print('Jaccard Score of each fold - {}'.format(jacc_score))
    print('Avg Jaccard : {}'.format(avg_jacc_score))


In [19]:
# Evaluate the model with 10 folds.
# NOTE(review): the fold-size pairs in the saved output (922/103, 923/102) are not
# printed by the function as currently defined; they appear to come from an
# earlier version with debug prints. Re-run the cell to refresh the output.
perform_cross_validation(10, df)




922
103



922
103



922
103



922
103



922
103



923
102



923
102



923
102



923
102



923
102
accuracy of each fold - [0.8640776699029126, 0.9029126213592233, 0.8640776699029126, 0.8640776699029126, 0.8349514563106796, 0.9313725490196079, 0.7941176470588235, 0.8137254901960784, 0.7843137254901961, 0.803921568627451]
Avg accuracy : 0.8457548067770798

Jaccard Score of each fold - [0.7704918032786885, 0.8360655737704918, 0.7971014492753623, 0.7666666666666667, 0.7571428571428571, 0.8703703703703703, 0.676923076923077, 0.7076923076923077, 0.6333333333333333, 0.696969696969697]
Avg Jaccard : 0.7512757135422852


In [20]:
# Cross-check the manual loop with scikit-learn's built-in helper.
# cross_val_score is imported in the import cell at the top of the notebook.
# This run uses 5 folds, whereas the manual run above used 10, so the two
# averages are close but not expected to be identical.
kf = KFold(n_splits=5, random_state=None)
model = LogisticRegression(solver='liblinear')
# With no `scoring` argument the estimator's own score method is used.
result = cross_val_score(model, X, y, cv=kf)

print("Avg accuracy: {}".format(result.mean()))

Avg accuracy: 0.846829268292683


In [45]:
# Toy 2x3 arrays used to probe how the metrics treat 2-D input.
y_true = np.array([
    [0, 1, 1],
    [1, 1, 0],
])
y_pred = np.array([
    [1, 1, 1],
    [1, 0, 0],
])
# With 2-D input each row is compared as a whole. Neither predicted row equals
# its true row exactly, so both the fraction (normalize=True, the default) and
# the count (normalize=False) of matching rows are zero.
print(accuracy_score(y_true, y_pred, normalize=True))
print(accuracy_score(y_true, y_pred, normalize=False))

0.0
0


In [46]:
# Binary Jaccard on the first row only: true [0, 1, 1] vs predicted [1, 1, 1]
# share 2 positives out of 3 in the union, giving 2/3.
jaccard_score(y_true[0], y_pred[0])

0.6666666666666666

In [47]:
# average=None on the 2-D arrays returns one Jaccard score per column (label)
# instead of a single aggregated value.
jaccard_score(y_true, y_pred, average=None)

array([0.5, 0.5, 1. ])

In [48]:
# Element-wise accuracy on the first row only: 2 of the 3 entries agree.
print(accuracy_score(y_true[0], y_pred[0]))

0.6666666666666666


In [None]:
#Implementing cross validation
def perform_MPCV(num_folds, df):
    """Cross-validate logistic regression with the train/test roles swapped.

    The predictors are every column of ``df`` except the last, and the response
    is the last column. Unlike ordinary k-fold cross-validation, each iteration
    fits the model on the single fold that ``KFold.split`` yields as its second
    element and scores it on the remaining ``num_folds - 1`` folds.

    NOTE(review): the name suggests "multiple predicting cross-validation", in
    which training on one fold and predicting the rest is intended. If the
    swap is not intended, exchange the two loop variables.

    Parameters
    ----------
    num_folds : int
        Number of folds, passed to ``KFold(n_splits=...)``.
    df : pandas.DataFrame
        Data to evaluate on; the last column is treated as the response.

    Returns
    -------
    None
        The per-fold scores and their averages are printed.
    """
    # Take the data from the argument instead of the notebook-level X / y, so
    # the function evaluates the frame it is actually given.
    features = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    # No shuffling is requested, so folds are consecutive row ranges.
    kf = KFold(n_splits=num_folds, random_state=None)
    model = LogisticRegression(solver='liblinear')

    acc_score = []
    jacc_score = []

    # KFold.split yields (k-1 folds, 1 fold); here the larger part is used for
    # testing and the single fold for training.
    for test_index, train_index in kf.split(features):
        # The indices are positional, so select with .iloc on both the
        # predictors and the response; label-based lookup would only be
        # correct when the frame has a default 0..n-1 index.
        X_train, X_test = features.iloc[train_index, :], features.iloc[test_index, :]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)
        acc_score.append(accuracy_score(y_test, pred_values))
        jacc_score.append(jaccard_score(y_test, pred_values))

    avg_acc_score = sum(acc_score)/num_folds
    avg_jacc_score = sum(jacc_score)/num_folds

    print('accuracy of each fold - {}'.format(acc_score))
    print('Avg accuracy : {}'.format(avg_acc_score))
    print()
    print('Jaccard Score of each fold - {}'.format(jacc_score))
    print('Avg Jaccard : {}'.format(avg_jacc_score))
