In [2]:
#[Load and Process the Dataset: NHANES-10000]
import numpy as np
import gzip
with gzip.GzipFile('NHANES.csv.gz','rb') as f:
    #gp from raw data into filtered and clean data--------------------------------------------------
    raw = [row.decode('utf-8').replace('\n','').replace('\r','').split(',') for row in f.readlines()]
    header,data,D = raw[0],raw[1:],[]
print(header)
print(len(data))

['SurveyYr', 'ID', 'Gender', 'Age', 'AgeDecade', 'AgeMonths', 'Race1', 'Race3', 'Education', 'MaritalStatus', 'HHIncome', 'HHIncomeMid', 'Poverty', 'HomeRooms', 'HomeOwn', 'Work', 'Weight', 'Length', 'HeadCirc', 'Height', 'BMI', 'BMICatUnder20yrs', 'BMI_WHO', 'Pulse', 'BPSysAve', 'BPDiaAve', 'BPSys1', 'BPDia1', 'BPSys2', 'BPDia2', 'BPSys3', 'BPDia3', 'Testosterone', 'DirectChol', 'TotChol', 'UrineVol1', 'UrineFlow1', 'UrineVol2', 'UrineFlow2', 'Diabetes', 'DiabetesAge', 'HealthGen', 'DaysPhysHlthBad', 'DaysMentHlthBad', 'LittleInterest', 'Depressed', 'nPregnancies', 'nBabies', 'Age1stBaby', 'SleepHrsNight', 'SleepTrouble', 'PhysActive', 'PhysActiveDays', 'TVHrsDay', 'CompHrsDay', 'TVHrsDayChild', 'CompHrsDayChild', 'Alcohol12PlusYr', 'AlcoholDay', 'AlcoholYear', 'SmokeNow', 'Smoke100', 'Smoke100n', 'SmokeAge', 'Marijuana', 'AgeFirstMarij', 'RegularMarij', 'AgeRegMarij', 'HardDrugs', 'SexEver', 'SexAge', 'SexNumPartnLife', 'SexNumPartYear', 'SameSex', 'SexOrientation']
10000


In [3]:
import numpy as np
def float_and_nan(x):
    if x=='NA': return np.nan
    else:       return float(x)

In [4]:
idx = {header[i]:i for i in range(len(header))}

ed  = sorted(set([row[idx['Education']] for row in data]))
edu = {ed[i]:i for i in range(len(ed))} #{}
edu['NA'] = np.nan

hd  = sorted(set([row[idx['HardDrugs']] for row in data]))
hdu = {hd[i]:i for i in range(len(hd))}
hdu['NA'] = np.nan

print(edu)
print(idx['Poverty'])
hdu[data[0][idx['HardDrugs']]]

{'8thGrade': 0, '9_11thGrade': 1, 'CollegeGrad': 2, 'HighSchool': 3, 'NA': nan, 'SomeCollege': 5}
12


2

In [5]:
D = []
for row in data: #only work with data rows that have all the data: no missing values
    D += [[ edu[row[idx['Education']]],float_and_nan(row[idx['Poverty']]),hdu[row[idx['HardDrugs']]]]]
D[0:20]
D = np.asarray(D,dtype=float)
D

array([[2.  , 5.  , 2.  ],
       [ nan, 4.17,  nan],
       [0.  , 1.58,  nan],
       ...,
       [5.  , 4.97, 1.  ],
       [5.  , 1.67, 2.  ],
       [2.  , 4.59,  nan]])

In [6]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [7]:
imp = IterativeImputer(max_iter=10,random_state=0) #this MICE
imp.fit(D)
X = imp.transform(D)
X[:,0] = np.round(X[:,0],0)
X[:,2] = np.round(X[:,2],0)
X[:10]

array([[2.  , 5.  , 2.  ],
       [3.  , 4.17, 1.  ],
       [0.  , 1.58, 1.  ],
       [3.  , 1.85, 2.  ],
       [3.  , 1.45, 1.  ],
       [0.  , 0.9 , 1.  ],
       [3.  , 3.49, 1.  ],
       [5.  , 0.76, 1.  ],
       [2.  , 5.  , 1.  ],
       [0.  , 1.74, 1.  ]])

In [8]:
# now map back values
udh = {hdu[h]:h for h in hdu}
ude = {edu[e]:e for e in edu}

M = [[ude[X[i][0]],(1 if X[i][0]!=D[i][0] else 0), X[i][1], (1 if X[i][1]!=D[i][1] else 0), udh[X[i][2]], (1 if X[i][2]!=D[i][2] else 0)] for i in range(len(X))]
M[0:10]

[['CollegeGrad', 0, 5.0, 0, 'Yes', 0],
 ['HighSchool', 1, 4.17, 0, 'No', 1],
 ['8thGrade', 0, 1.58, 0, 'No', 1],
 ['HighSchool', 0, 1.85, 0, 'Yes', 0],
 ['HighSchool', 1, 1.45, 0, 'No', 1],
 ['8thGrade', 0, 0.9, 0, 'No', 0],
 ['HighSchool', 1, 3.49, 0, 'No', 0],
 ['SomeCollege', 0, 0.76, 0, 'No', 0],
 ['CollegeGrad', 0, 5.0, 0, 'No', 1],
 ['8thGrade', 0, 1.74, 0, 'No', 0]]

In [13]:
#pack it up as a new
s = ','.join(['Education','Education_imp','Poverty','Poverty_imp','HardDrugs','HardDrugs_imp'])+'\n'
s += '\n'.join([','.join([str(x) for x in row]) for row in M])+'\n'
with open('nhanes.imp.csv','w') as f: f.write(s)