In [34]:
import numpy as np

fheader = "headernames.data"
fdata = [
         "cleveland.data",
         "hungarian.data",
         "long-beach-va.data",
         "switzerland.data",
        ]

with open(fheader, 'r', encoding='ascii') as f:
    headers = [word for line in f for word in line.split()]

len(headers)

76

In [35]:
import pandas as pd

data_processed = []

for i in range(len(fdata)):
    with open(fdata[i], 'r', encoding='ascii') as f:
        values = [word for line in f for word in line.split()]
        data_processed.append(np.reshape(np.array(values), (-1, 76)))
    n = np.shape(data_processed[i])[0]
    data_processed[i] = np.column_stack((np.repeat(i, n), data_processed[i]))

for d in data_processed:
    print(np.shape(d))
    
data_processed = np.vstack(data_processed)

frame = pd.DataFrame.from_records(data_processed, columns=np.array(['dataset'] + headers))
frame

(282, 77)
(294, 77)
(200, 77)
(123, 77)


Unnamed: 0,dataset,id,ccf,age,sex,painloc,painexer,relrest,pncaden,cp,...,rcaprox,rcadist,lvx1,lvx2,lvx3,lvx4,lvf,cathef,junk,name
0,0,1,0,63,1,-9,-9,-9,-9,1,...,1,1,1,1,1,1,1,-9,-9,name
1,0,2,0,67,1,-9,-9,-9,-9,4,...,1,1,1,1,1,1,1,-9,-9,name
2,0,3,0,67,1,-9,-9,-9,-9,4,...,2,2,1,1,1,7,3,-9,-9,name
3,0,4,0,37,1,-9,-9,-9,-9,3,...,1,1,1,1,1,1,1,-9,-9,name
4,0,6,0,41,0,-9,-9,-9,-9,2,...,1,1,1,1,1,1,1,-9,-9,name
5,0,7,0,56,1,-9,-9,-9,-9,2,...,1,1,1,1,1,1,1,-9,-9,name
6,0,8,0,62,0,-9,-9,-9,-9,4,...,2,1,1,1,1,7,1,-9,-9,name
7,0,9,0,57,0,-9,-9,-9,-9,4,...,1,1,1,1,1,1,1,-9,-9,name
8,0,10,0,63,1,-9,-9,-9,-9,4,...,2,1,1,1,6,7,2,-9,-9,name
9,0,11,0,53,1,-9,-9,-9,-9,4,...,2,1,1,1,1,1,1,-9,-9,name


In [36]:
# delete names
del frame['name']

In [37]:
# convert to numbers
frame = frame.applymap(float)
# responses
Y = frame['num'].map(lambda x: 1 if x != 0 else 0)

# delete "garbage columns" that only have one value for the healthy
# as they probably just describe the response

#for k in frame.keys():
#    if frame[k][Y == 0].nunique() == 1:
#        print('frame ' + k)
#        del frame[k]

In [38]:
# break down categorical variables
flatten = lambda x: 1 if x else 0
factors = ['cp', 'restecg', 'proto', 'slope', 'restwm', 'thal', 'dataset']

for factor in factors:
    if factor in frame:
        for value in frame[factor].unique():
            frame[factor + str(int(value))] = (frame[factor] == value).map(flatten)
        del frame[factor]
    

In [39]:
problematic_cols = [k for k in frame.keys() if any(frame[k] == -9)]
print('Formerly problematic:', problematic_cols)

# fill in invalid data with the mean
for k in frame.keys():
    if frame[k].isin([-9])[0]:
        frame[k + '_invalid'] = frame[k].map(lambda x: 1 if x == -9 else 0)
    
problematic_cols = [k for k in frame.keys() if any(frame[k] == -9)]
print('Now problematic:', problematic_cols)

Formerly problematic: ['painloc', 'painexer', 'relrest', 'pncaden', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years', 'fbs', 'dm', 'famhist', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thaltime', 'met', 'thalach', 'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang', 'xhypo', 'oldpeak', 'rldv5', 'rldv5e', 'ca', 'restckm', 'exerckm', 'restef', 'exeref', 'exerwm', 'thalsev', 'thalpul', 'earlobe', 'cmo', 'cday', 'cyr', 'lmt', 'ladprox', 'laddist', 'diag', 'cxmain', 'ramus', 'om1', 'om2', 'rcaprox', 'rcadist', 'lvx1', 'lvx2', 'lvx3', 'lvx4', 'lvf', 'cathef', 'junk']
Now problematic: ['painloc', 'painexer', 'relrest', 'pncaden', 'trestbps', 'htn', 'chol', 'smoke', 'cigs', 'years', 'fbs', 'dm', 'famhist', 'ekgmo', 'ekgday', 'ekgyr', 'dig', 'prop', 'nitr', 'pro', 'diuretic', 'thaldur', 'thaltime', 'met', 'thalach', 'thalrest', 'tpeakbps', 'tpeakbpd', 'dummy', 'trestbpd', 'exang', 'xhypo', 'oldpeak', 'rldv5', 'rldv5e', 'ca', 'restckm', 'exe

In [40]:
frame.to_csv('./229_processed_cleveland_full.data')

In [41]:
Y.to_csv('./229_processed_cleveland_Y_full.data')

In [42]:
Y

0      0
1      1
2      1
3      0
4      0
5      0
6      1
7      0
8      1
9      1
10     0
11     0
12     1
13     0
14     0
15     0
16     1
17     0
18     0
19     0
20     0
21     0
22     1
23     1
24     1
25     0
26     0
27     0
28     0
29     1
      ..
869    1
870    1
871    1
872    1
873    1
874    1
875    1
876    1
877    1
878    1
879    1
880    1
881    1
882    1
883    1
884    1
885    1
886    1
887    1
888    1
889    1
890    1
891    1
892    1
893    1
894    1
895    1
896    1
897    1
898    1
Name: num, Length: 899, dtype: int64

'cleveland.data'