In [34]:
import numpy as np

fheader = "headernames.data"
fdata = [
         "cleveland.data",
         "hungarian.data",
         "long-beach-va.data",
         "switzerland.data",
        ]

with open(fheader, 'r', encoding='ascii') as f:
    headers = [word for line in f for word in line.split()]

len(headers)

76

In [35]:
import pandas as pd

data_processed = []

for i in range(len(fdata)):
    with open(fdata[i], 'r', encoding='ascii') as f:
        values = [word for line in f for word in line.split()]
        data_processed.append(np.reshape(np.array(values), (-1, 76)))
    n = np.shape(data_processed[i])[0]
    data_processed[i] = np.column_stack((np.repeat(i, n), data_processed[i]))

for d in data_processed:
    print(np.shape(d))
    
data_processed = np.vstack(data_processed)

frame = pd.DataFrame.from_records(data_processed, columns=np.array(['dataset'] + headers))

age_series = frame['age']
sex_series = frame['sex']
# Added in two features, one corresponding to sex and one to age.
# Both are aggregate features corresponding to percentage frequency
# of coronary heart disease
sex_feature = [(7.2 if x == "1" else 4.3) for x in sex_series]
age_feature = []
for age in age_series:
    num = int(age)
    if (num < 18):
        age_feature.append(0)
    elif (num < 45):
        age_feature.append(0.8)
    elif (num < 65):
        age_feature.append(6.1)
    elif (num < 75):
        age_feature.append(16.4)
    else:
        age_feature.append(23.3)
frame['age_feature'] = age_feature
frame['sex_feature'] = sex_feature
frame

(282, 77)
(294, 77)
(200, 77)
(123, 77)


Unnamed: 0,dataset,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,...,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name,age_feature,sex_feature
0,0,1,0,63,1,-9,-9,-9,-9,1,...,1,1,1,1,1,-9,-9,name,6.1,7.2
1,0,2,0,67,1,-9,-9,-9,-9,4,...,1,1,1,1,1,-9,-9,name,16.4,7.2
2,0,3,0,67,1,-9,-9,-9,-9,4,...,1,1,1,7,3,-9,-9,name,16.4,7.2
3,0,4,0,37,1,-9,-9,-9,-9,3,...,1,1,1,1,1,-9,-9,name,0.8,7.2
4,0,6,0,41,0,-9,-9,-9,-9,2,...,1,1,1,1,1,-9,-9,name,0.8,4.3
5,0,7,0,56,1,-9,-9,-9,-9,2,...,1,1,1,1,1,-9,-9,name,6.1,7.2
6,0,8,0,62,0,-9,-9,-9,-9,4,...,1,1,1,7,1,-9,-9,name,6.1,4.3
7,0,9,0,57,0,-9,-9,-9,-9,4,...,1,1,1,1,1,-9,-9,name,6.1,4.3
8,0,10,0,63,1,-9,-9,-9,-9,4,...,1,1,6,7,2,-9,-9,name,6.1,7.2
9,0,11,0,53,1,-9,-9,-9,-9,4,...,1,1,1,1,1,-9,-9,name,6.1,7.2


In [36]:
# delete names
del frame['name']

In [37]:
# convert to numbers
frame = frame.applymap(float)
# responses
# Y = frame['num'].map(lambda x: 1 if x != 0 else 0)

# delete "garbage columns" that only have one value for the healthy
# as they probably just describe the response

#for k in frame.keys():
#    if frame[k][Y == 0].nunique() == 1:
#        print('frame ' + k)
#        del frame[k]

In [38]:
# break down categorical variables
flatten = lambda x: 1 if x else 0
factors = ['cp', 'restecg', 'proto', 'slope', 'restwm', 'thal', 'dataset']

for factor in factors:
    if factor in frame:
        for value in frame[factor].unique():
            frame[factor + str(int(value))] = (frame[factor] == value).map(flatten)
        del frame[factor]

In [39]:
problematic_cols = [k for k in frame.keys() if any(frame[k] == -9)]
print('Formerly problematic:', problematic_cols)

# fill in invalid data with the mean
for k in frame.keys():
    if frame[k].isin([-9])[0]:
        frame[k + '_invalid'] = frame[k].map(lambda x: 1 if x == -9 else 0)
    
problematic_cols = [k for k in frame.keys() if any(frame[k] == -9)]
print('Now problematic:', problematic_cols)

Formerly problematic: ['painloc', 'painexer', 'relrest', 'pncaden', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years', 'fbs', 'dm', 'famhist', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thaltime', 'met', 'thalach', 'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang', 'xhypo', 'oldpeak', 'rldv5', 'rldv5e', 'ca', 'restckm', 'exerckm', 'restef', 'exeref', 'exerwm', 'thalsev', 'thalpul', 'earlobe', 'cmo', 'cday', 'cyr', 'lmt', 'ladprox', 'laddist', 'diag', 'cxmain', 'ramus', 'om1', 'om2', 'rcaprox', 'rcadist', 'lvx1', 'lvx2', 'lvx3', 'lvx4', 'lvf', 'cathef', 'junk']
Now problematic: ['painloc', 'painexer', 'relrest', 'pncaden', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years', 'fbs', 'dm', 'famhist', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thaltime', 'met', 'thalach', 'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang', 'xhypo', 'oldpeak', 'rldv5', 'rldv5e', 'ca', 'restckm', 'exe

In [40]:
frame.to_csv('./229_processed_full.data')

In [41]:
Y.to_csv('./229_processed_Y_full.data')

NameError: name 'Y' is not defined

In [42]:
Y

NameError: name 'Y' is not defined

In [43]:
N = len(Y)
test_i = np.random.permutation(np.arange(N))[0:(N//6)]
train_i = np.random.permutation(np.arange(N))[(N//6):]

test_frame = frame[frame.index.isin(test_i)]
train_frame = frame[frame.index.isin(train_i)]

Y_test_frame = Y[frame.index.isin(test_i)]
Y_train_frame = Y[frame.index.isin(train_i)]

NameError: name 'Y' is not defined

In [44]:
test_frame.to_csv('./new_test_X.data')
train_frame.to_csv('./new_train_X.data')

Y_test_frame.to_csv('./new_test_Y.data')
Y_train_frame.to_csv('./new_train_Y.data')

NameError: name 'test_frame' is not defined

In [45]:
Y_train_frame

NameError: name 'Y_train_frame' is not defined