In [50]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import pandas as pd
import numpy as np
import copy
from sklearn.utils import resample
from sklearn.preprocessing import scale, StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, LeaveOneGroupOut, LeaveOneOut
from sklearn.metrics import r2_score
from imblearn.over_sampling import SMOTE

train_values = np.loadtxt("train_values.csv", delimiter=",", dtype=object)
train_labels = np.loadtxt("train_labels.csv", delimiter=",", dtype=object)

d = np.empty(15, dtype=object)

d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9],d[10],d[11],d[12],d[13] = np.hsplit(train_values, 14)
d[0],d[14] = np.hsplit(train_labels, 2)

atributes = {}

for n,obj in enumerate(d):
    atributes[n] = obj[0][0]
    d[n] = np.squeeze(d[n])[1:]

#NOTE: the columns lable in pd.df is numeric. For a given column N, its property corresponds to atributes[N+1]
data = pd.DataFrame(data=d[1],index=d[0])
for n in range(15):
    if n < 2:
        continue
    data[n-1] = d[n]

#### The 14 features are describedbelow:

- slope_of_peak_exercise_st_segment (type: int): the slope of the peak exercise ST segment, an electrocardiography read out indicating quality of blood flow to the heart
- thal (type: categorical): results of thallium stress test measuring blood flow to the heart, with possible values normal, fixed_defect, reversible_defect
- resting_blood_pressure (type: int): resting blood pressure
- chest_pain_type (type: int): chest pain type (4 values)
- num_major_vessels (type: int): number of major vessels (0-3) colored by flourosopy
- fasting_blood_sugar_gt_120_mg_per_dl (type: binary): fasting blood sugar > 120 mg/dl
- resting_ekg_results (type: int): resting electrocardiographic results (values 0,1,2)
- serum_cholesterol_mg_per_dl (type: int): serum cholestoral in mg/dl
- oldpeak_eq_st_depression (type: float): oldpeak = ST depression induced by exercise relative to rest, a measure of abnormality in electrocardiograms
- sex (type: binary): 0: female, 1: male
- age (type: int): age in years
- max_heart_rate_achieved (type: int): maximum heart rate achieved (beats per minute)
- exercise_induced_angina (type: binary): exercise-induced chest pain (0: False, 1: True)
- heart_disease_present (type: binary): 0: heart disease not present, 1: heart disease present 

NOTE: The index in data corresponds to patient ID

In [7]:
print(atributes)
data.head()

{0: 'patient_id', 1: 'slope_of_peak_exercise_st_segment', 2: 'thal', 3: 'resting_blood_pressure', 4: 'chest_pain_type', 5: 'num_major_vessels', 6: 'fasting_blood_sugar_gt_120_mg_per_dl', 7: 'resting_ekg_results', 8: 'serum_cholesterol_mg_per_dl', 9: 'oldpeak_eq_st_depression', 10: 'sex', 11: 'age', 12: 'max_heart_rate_achieved', 13: 'exercise_induced_angina', 14: 'heart_disease_present'}


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0,1
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0,0


In [18]:
# Counts for heart_disease_present in training data
data[13].value_counts()

0.0    100
1.0     80
Name: 13, dtype: int64

In [42]:
data = (data.astype({0: 'int64', 1: 'category', 2: 'int64', 3: 'int64', 4: 'int64', 5: 'int64', 6: 'int64', 7: 'int64',
                    8: 'float64', 9: 'int64', 10: 'int64', 11: 'int64', 12: 'int64', 13: 'int64'}))
print(data.dtypes)

# Average for each attribute whether or not heart_disease_present
data.groupby(13).mean()

0        int64
1     category
2        int64
3        int64
4        int64
5        int64
6        int64
7        int64
8      float64
9        int64
10       int64
11       int64
12       int64
13       int64
dtype: object


Unnamed: 0_level_0,0,2,3,4,5,6,7,8,9,10,11,12
13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1.36,130.12,2.81,0.33,0.16,0.92,245.46,0.627,0.55,53.66,156.87,0.13
1,1.7875,132.8,3.5875,1.15,0.1625,1.2125,253.9,1.48875,0.8625,56.25,140.25,0.55


In [43]:
# Averages as separated by thalium stress test
data.groupby(1).mean()

Unnamed: 0_level_0,0,2,3,4,5,6,7,8,9,10,11,12,13
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
fixed_defect,2.0,141.375,3.125,0.625,0.375,1.25,227.25,1.3,1.0,57.875,136.0,0.25,0.5
normal,1.377551,129.77551,2.897959,0.530612,0.153061,1.071429,250.255102,0.669388,0.510204,54.387755,154.938776,0.153061,0.204082
reversible_defect,1.72973,132.256757,3.5,0.918919,0.148649,1.0,250.202703,1.42973,0.891892,55.040541,143.716216,0.540541,0.756757


In [51]:
X = data.loc[:, data.columns != 13]
y = data.loc[:, data.columns == 13]

sm = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
sm_data_X, sm_data_y = sm.fit_sample(X_train, y_train)
sm_data_X = pd.DataFrame(data = sm_data_X, columns = columns )
sm_data_y= pd.DataFrame(data = sm_data_y, columns=['y'])


ValueError: could not convert string to float: 'normal'

In [52]:
logreg = LogisticRegression(multi_class = 'multionomial')

# fit the model with data
logreg.fit(X,y)
y_pred=logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: ' + logreg.score(X_test, y_test))



ValueError: could not convert string to float: 'normal'