In [1]:
#Importing required libraries
import importlib

import numpy as np
import pandas as pd
import seaborn as sns
from patsy import dmatrices
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, jaccard_score
from sklearn.model_selection import KFold, cross_val_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

import cross_vals as cv




In [None]:
# Re-import the local cross_vals module (bound to `cv` in the import cell) so
# that edits made to it are picked up without restarting the kernel.
importlib.reload(cv)

In [2]:
# Load the dataset into a DataFrame; the path is relative to the notebook's
# working directory.
df = pd.read_csv('heart.csv')

In [None]:
# Display the loaded frame as the cell output.
df

The columns represent:
1) Age
2) Sex
3) Chest Pain Type (4 values)
4) Resting Blood Pressure
5) Serum Cholesterol in mg/dl
6) Fasting blood sugar > 120 mg/dl
7) Resting electrocardiographic results (values 0,1,2)
8) maximum heart rate achieved
9) Exercise induced angina
10) oldpeak: ST depression induced by exercise relative to rest
11) The slope of the peak exercise ST segment
12) number of major vessels (0-3) colored by fluoroscopy
13) thal: 0 = normal; 1 = fixed defect; 2 = reversible defect

# Determine Correlation between predictor and response

In [None]:
# Pairwise correlation matrix across all columns (pandas' default method is
# Pearson).
df.corr()

In [None]:
# Correlation of every column with the response column `target`; the entry for
# `target` itself is the column's correlation with itself.
df.corrwith(df['target'])

Thus, we expect that the most significant factors in our model will be:
* Chest Pain type
* maximum heart rate achieved (thalach)
* Exercise induced angina (exang)
* ST depression induced by exercise relative to rest (oldpeak)
* Number of major vessels colored by fluoroscopy

The factors that will have some effect:
* Slope of the peak exercise ST segment
* thal
* Age
* Sex

The factors unlikely to be significant:
* Resting Blood Pressure
* Serum cholesterol
* Resting electrocardiographic results

## Check Data Balance

In [None]:
# Number of rows per value of `target`, to check whether the classes are
# balanced.
df['target'].value_counts()

# EDA

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
df.describe()

In [None]:
# Number of rows per value of `sex`.
df.sex.value_counts()

In [None]:
# Number of rows per value of `cp` (chest pain type).
df.cp.value_counts()

In [None]:
# Heatmap of the pairwise correlation matrix. The colour scale is pinned to
# [-1, 1], the full range a correlation coefficient can take, so colours are
# comparable across cells; annot=False keeps the numeric values off the plot.
corrs = df.corr()
sns.heatmap(corrs, annot=False, vmin=-1, vmax=1)

# Use VIF to examine collinearity

In [None]:
# Right-hand side of the patsy formula: every column except the response,
# joined with "+" (e.g. "age+sex+cp+...").
of_int = df.columns.to_list()
of_int.remove('target')
features = "+".join(of_int)

# Design matrices for the regression: y holds the response and X the
# predictors; return_type='dataframe' keeps the column labels on both.
y, X = dmatrices('target ~' + features, df, return_type='dataframe')

# One variance inflation factor per design-matrix column, stored next to the
# name of the column it belongs to.
vif = pd.DataFrame({
    'VIF_factor': [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(len(X.columns))
    ],
    'features': X.columns,
})

In [None]:
# Display the VIF table rounded to two decimal places.
vif.round(2)

# High Correlation

In [13]:
# "High correlation" feature set: the five predictors listed earlier as most
# significant (cp, thalach, exang, oldpeak, ca) plus the response `target`.
high_cor_df = df[['cp', 'thalach', 'exang', 'oldpeak', 'ca', 'target']]
# One-hot encode cp only; the remaining columns pass through unchanged.
df_dummies = pd.get_dummies(high_cor_df, columns=['cp'])

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


# Medium Correlation

In [15]:
# "Medium correlation" feature set: the high-correlation columns plus slope,
# thal, age and sex, together with the response `target`.
med_cor_df = df[['cp', 'thalach', 'exang', 'oldpeak', 'ca', 'slope', 'thal', 'age', 'sex', 'target']]
# One-hot encode cp and sex; the remaining columns pass through unchanged.
# Note that this rebinds df_dummies, replacing the value set by the previous cell.
df_dummies = pd.get_dummies(med_cor_df, columns=['cp', 'sex'])
df_dummies

Unnamed: 0,thalach,exang,oldpeak,ca,slope,thal,age,cp_0,cp_1,cp_2,cp_3,sex_0,sex_1
0,168,0,1.0,2,2,3,52,1,0,0,0,0,1
1,155,1,3.1,0,0,3,53,1,0,0,0,0,1
2,125,1,2.6,0,0,3,70,1,0,0,0,0,1
3,161,0,0.0,1,2,3,61,1,0,0,0,0,1
4,106,0,1.9,3,1,2,62,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,164,1,0.0,0,2,2,59,0,1,0,0,0,1
1021,141,1,2.8,1,1,3,60,1,0,0,0,0,1
1022,118,1,1.0,1,1,2,47,1,0,0,0,0,1
1023,159,0,0.0,0,2,2,50,1,0,0,0,1,0


# Low Correlation

In [None]:
# "Low correlation" feature set: the full frame with cp, sex, restecg and
# target one-hot encoded.
# NOTE(review): `target` is the response; one-hot encoding it replaces the
# single column with indicator columns, whereas the high/medium cells leave
# `target` untouched — confirm this is intended.
df_dummies = pd.get_dummies(df, columns=['cp', 'sex', 'restecg', 'target'])

In [None]:
# Call perform_cross_validation from the local cross_vals module on the full,
# un-encoded frame. Presumably the first argument (5) is the number of folds —
# confirm against cross_vals.
cv.perform_cross_validation(5, df)

In [None]:
# NOTE(review): same call, with the same arguments, as the previous cell —
# confirm whether the repeat is intentional (e.g. to compare runs) or leftover.
cv.perform_cross_validation(5, df)

In [None]:
# 5-fold cross-validated score of a logistic regression on the patsy design
# matrices (X, y) built in the VIF cell; cross_val_score is imported in the
# notebook's import cell.
kf = KFold(n_splits=5, random_state=None)
model = LogisticRegression(solver= 'liblinear')
# dmatrices returns y as a one-column DataFrame, but scikit-learn expects a
# 1-D label array and emits a DataConversionWarning on every fit otherwise,
# so flatten it before handing it over.
# NOTE(review): X still carries patsy's Intercept column while
# LogisticRegression fits its own intercept by default — confirm that the
# redundant constant column is acceptable.
result = cross_val_score(model, X, y.values.ravel(), cv=kf)

print("Avg accuracy: {}".format(result.mean()))

In [None]:
# Toy example: two rows of three binary labels each, used to try out the
# scikit-learn metrics in this and the following cells.
y_true = np.array([[0, 1, 1], [1, 1, 0]])
y_pred = np.array([[1, 1, 1], [1, 0, 0]])

# First the default (normalized) score, then the raw count obtained with
# normalize=False.
print(accuracy_score(y_true, y_pred))
print(accuracy_score(y_true, y_pred, normalize=False))

In [None]:
# Jaccard score using only the first row of each toy array (1-D inputs).
jaccard_score(y_true[0], y_pred[0])

In [None]:
# Jaccard score on the full 2-D toy arrays; average=None asks for the
# individual scores instead of a single averaged value.
jaccard_score(y_true, y_pred, average=None)

In [None]:
# Accuracy using only the first row of each toy array (1-D inputs), for
# comparison with the 2-D call above.
print(accuracy_score(y_true[0], y_pred[0]))

In [None]:
# Call perform_MPCV from the local cross_vals module on the full frame.
# NOTE(review): what "MPCV" stands for and what the first argument (10) means
# are not visible from this notebook — see cross_vals.
cv.perform_MPCV(10, df)

In [None]:
# Same helper as the earlier 5-argument runs, now with 10 as the first
# argument. The function lives in the local cross_vals module, which is only
# imported as `cv`, so it must be reached through that alias — the bare name
# is undefined on a fresh kernel.
cv.perform_cross_validation(10, df)

In [5]:
# One-hot encode `sex` into sex_0 / sex_1 indicator columns, leaving every
# other column (including `target`) as-is, and display the result.
# Note that this rebinds df_dummies, replacing any earlier value.
df_dummies = pd.get_dummies(df, columns=['sex'])
df_dummies

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,sex_0,sex_1
0,52,0,125,212,0,1,168,0,1.0,2,2,3,0,0,1
1,53,0,140,203,1,0,155,1,3.1,0,0,3,0,0,1
2,70,0,145,174,0,1,125,1,2.6,0,0,3,0,0,1
3,61,0,148,203,0,1,161,0,0.0,2,1,3,0,0,1
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,140,221,0,1,164,1,0.0,2,0,2,1,0,1
1021,60,0,125,258,0,0,141,1,2.8,1,1,3,0,0,1
1022,47,0,110,275,0,0,118,1,1.0,1,1,2,0,0,1
1023,50,0,110,254,0,0,159,0,0.0,2,0,2,1,1,0
