# Pandas Tools

#### Count up nans in a column
* df['Column_Name'].isnull().sum(axis = 0)

#### Fill all NaN values in a column with a particular value
* Demographics_df['DMDCITZN'] = Demographics_df['DMDCITZN'].fillna(2.0)

##### Combining Dataframes

* Full_Days_Nutrition_df = pd.merge(Day_1_Nutrition_df, Day_2_Nutrition_df, on="SEQN")
* Full_Days_Nutrition_df.head()

##### Find rows that are all NaN

* nan = df[df.isnull().all(axis=1)].index
* nan

##### Change all '99' in a column to 1
Demographics_df.loc[Demographics_df['DMDBORN4'] == 99, 'DMDBORN4'] = 1

##### Check for Nans

* df.isnull().values.any()

##### Getting Dummies
* Finished_Demographics_df = pd.get_dummies(Demographics_df, columns = need_dummies, drop_first = True)

##### Rename Columns
* df = df.rename(columns={'oldName1': 'newName1', 'oldName2': 'newName2'})

##### Add 'Age', then drop all 0-17 year olds
* Blood_Pressure_df = pd.merge(Blood_Pressure_df, Age_df, on="SEQN")
* Blood_Pressure_df = Blood_Pressure_df.drop(Blood_Pressure_df[Blood_Pressure_df.Age < 18].index)
* Blood_Pressure_df.head()

##### Fill NaNs based on the median of the column

df['Col'] = df['Col'].fillna(df['Col'].median())

##### Change all values in a df that meet a condition
* in this case, change them all to NaN
* df[df > 9000] = np.nan

##### For a single column
* df.loc[df['Col'] > 9000, 'Col'] = np.nan

***

# Import Libraries

In [38]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


***

In [4]:
# Age lookup table. Every questionnaire table below is merged onto age_df so
# that everyone younger than 18 is dropped automatically.
# 'Unnamed: 0' is discarded right after loading (presumably the index column
# written by an earlier to_csv call -- TODO confirm).
age_df = pd.read_csv('Cleaned_Ages.csv').drop(columns='Unnamed: 0')

***

<div class="alert alert-block alert-info">
    
# Questionnaire Data - Done

### Next Steps:
   * Fill in missing values
   * Get TARGET column sorted out
   
</div>

### Load the data

In [6]:
# Read every raw questionnaire table (SAS transport / .XPT files) into its own
# DataFrame. The file names are reproduced exactly as they appear here,
# including the 'Pyhsical' spelling (presumably matching the file on disk --
# TODO confirm).
Preprocess_Q_Alcohol_Use_df = pd.read_sas('NHANES_Questionnaire_Data/Questionnaire_Alcohol_Use.XPT')
Preprocess_Q_Blood_Pressure_Cholesterol_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Blood_Pressure_Cholesterol.XPT')
Preprocess_Q_Where_Get_Food_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Where_Do_You_Get_Food.XPT')
Preprocess_Q_General_Health_df = pd.read_sas('NHANES_Questionnaire_Data/Q_General_Health.XPT')
Preprocess_Q_Diet_Behavior_and_Nutrition_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Diet_Behavior_and_Nutrition.XPT')
Preprocess_Q_Disabilities_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Disabilities.XPT')
Preprocess_Q_Health_Insurance_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Health_Insurance.XPT')
Preprocess_Q_Access_to_Care_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Access_to_Care.XPT')
Preprocess_Q_Housing_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Housing.XPT')
Preprocess_Q_Income_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Income.XPT')
Preprocess_Q_Medical_Conditions_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Medical_Conditions.XPT')
Preprocess_Q_Mental_Health_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Mental_Health.XPT')
Preprocess_Q_Physical_Activity_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Pyhsical_Activity.XPT')
Preprocess_Q_Sleep_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Sleep.XPT')
Preprocess_Q_Smoking_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Smoking.XPT')
Preprocess_Q_Household_Smoking_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Household_Smoking.XPT')
Preprocess_Q_Weight_History_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Weight_History.XPT')

# Not Using (kept disabled):
# Preprocess_Q_Drug_Use_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Drug_Use.XPT')
# Preprocess_Q_Last_Dental_Visit_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Last_Dental_Visit.XPT')
# Preprocess_Q_Weight_History_Youth_df = pd.read_sas('NHANES_Questionnaire_Data/Q_Weight_History_Youth.XPT')


In [7]:
# Print the shape of every raw questionnaire table, one "label: shape" line
# per table, in loading order.
_raw_tables = [
    ('Alcohol Use', Preprocess_Q_Alcohol_Use_df),
    ('Blood Pressure and Cholesterol', Preprocess_Q_Blood_Pressure_Cholesterol_df),
    ('Where You Get Food', Preprocess_Q_Where_Get_Food_df),
    ('General Health', Preprocess_Q_General_Health_df),
    ('Diet Behavior and Nutrition', Preprocess_Q_Diet_Behavior_and_Nutrition_df),
    ('Disabilities', Preprocess_Q_Disabilities_df),
    ('Health Insurance', Preprocess_Q_Health_Insurance_df),
    ('IMPORTANT!!  Access to Care', Preprocess_Q_Access_to_Care_df),
    ('Housing', Preprocess_Q_Housing_df),
    ('Income', Preprocess_Q_Income_df),
    ('Medical Conditions', Preprocess_Q_Medical_Conditions_df),
    ('Mental Health', Preprocess_Q_Mental_Health_df),
    ('Physical Activity', Preprocess_Q_Physical_Activity_df),
    ('Sleep', Preprocess_Q_Sleep_df),
    ('Smoking', Preprocess_Q_Smoking_df),
    ('Household Smoking', Preprocess_Q_Household_Smoking_df),
    ('Weight History', Preprocess_Q_Weight_History_df),
]
for _label, _table in _raw_tables:
    print(_label + ': ' + str(_table.shape))

# Not Using (tables are not loaded, so their shapes are not printed):
#   Drug Use, Last Dental Visit, Weight History Youth



Alcohol Use: (5735, 10)
Blood Pressure and Cholesterol: (6327, 11)
Where You Get Food: (9971, 6)
General Health: (9165, 9)
Diet Behavior and Nutrition: (9971, 51)
Disabilities: (9575, 13)
Health Insurance: (9971, 17)
IMPORTANT!!  Access to Care: (9971, 10)
Housing: (9971, 3)
Income: (9971, 16)
Medical Conditions: (9575, 90)
Mental Health: (5735, 11)
Physical Activity: (9255, 94)
Sleep: (6327, 8)
Smoking: (7001, 42)
Household Smoking: (9971, 4)
Weight History: (6327, 37)


<div class="alert alert-block alert-danger">
    
# 1. Alcohol Use - Won't be using, too many missing values
   * Ages 18+
   * 5735 Values
   * Missing Values will probably be zero
   * ALQ120Q - How often drink alcohol over past 12 mos
       * Days per year
       * 1511 missing
       * 777 and 999 -> 0
   * ALQ130 - Avg # alcoholic drinks/day - past 12 mos
       * 2356 missing
       * 777 and 999 -> 0
   * ALQ141Q - # days have 4/5 drinks - past 12 mos (enjoys Alcohol)
       * 2358 missing
       * 777 and 999 -> 0
   * ALQ151 - Ever have 4/5 or more drinks every day? (Alcoholic)
       * 1513 missing
       * 1 = Yes
       * 2 = No
       * 7 and 9 -> 2

In [9]:
# Keep only the SEQN merge key and the four alcohol items listed in the notes above.
alcohol_use_columns = ['SEQN', 'ALQ120Q', 'ALQ130', 'ALQ141Q', 'ALQ151']
Alcohol_Use_df = Preprocess_Q_Alcohol_Use_df.loc[:, alcohol_use_columns]
Alcohol_Use_df.head()

Unnamed: 0,SEQN,ALQ120Q,ALQ130,ALQ141Q,ALQ151
0,83732.0,1.0,1.0,5.397605e-79,2.0
1,83733.0,7.0,6.0,7.0,1.0
2,83734.0,5.397605e-79,,,1.0
3,83735.0,3.0,1.0,5.397605e-79,2.0
4,83736.0,1.0,1.0,5.397605e-79,2.0


In [10]:
Alcohol_Use_df.shape

(5735, 5)

In [13]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Alcohol_Use_df = Alcohol_Use_df.merge(age_df, on="SEQN")
Alcohol_Use_df = Alcohol_Use_df.drop(index=Alcohol_Use_df.index[Alcohol_Use_df['Age'] < 18])
Alcohol_Use_df.head()

Unnamed: 0,SEQN,ALQ120Q,ALQ130,ALQ141Q,ALQ151,Age
0,83732.0,1.0,1.0,5.397605e-79,2.0,62.0
1,83733.0,7.0,6.0,7.0,1.0,53.0
2,83734.0,5.397605e-79,,,1.0,78.0
3,83735.0,3.0,1.0,5.397605e-79,2.0,56.0
4,83736.0,1.0,1.0,5.397605e-79,2.0,42.0


In [15]:
# Number of missing values in each alcohol column, one count per line.
for column in ['ALQ120Q', 'ALQ130', 'ALQ141Q', 'ALQ151']:
    print(Alcohol_Use_df[column].isna().sum())

1511
2356
2358
1513


<div class="alert alert-block alert-success">
    
# 2. Blood Pressure and Cholesterol
   * Ages 16+
   * 6327 Values
   * Game Plan
        * BPQ020 - Ever told you had high blood pressure
            * 1 = Yes
            * 2 = No
            * 9 -> 0
            * 2 -> 0
        * BPQ030 - Told had high blood pressure - 2+ times
            * 1 = Yes
            * 2 = No
            * 7, 9, Nan -> 0
            * 2 -> 0
        * BPQ080 - Doctor told you - high cholesterol level
            * 1 = Yes
            * 2 = No
            * 9 -> 0
            * 2 -> 0

In [73]:
# Keep only the SEQN merge key and the three yes/no items from the game plan.
bp_cholesterol_columns = ['SEQN', 'BPQ020', 'BPQ030', 'BPQ080']
Blood_Pressure_Cholesterol_df = Preprocess_Q_Blood_Pressure_Cholesterol_df.loc[:, bp_cholesterol_columns]
print('Blood Pressure and Cholesterol Shape: {}'.format(Blood_Pressure_Cholesterol_df.shape))
Blood_Pressure_Cholesterol_df.head()

Blood Pressure and Cholesterol Shape: (6327, 4)


Unnamed: 0,SEQN,BPQ020,BPQ030,BPQ080
0,83732.0,2.0,,2.0
1,83733.0,2.0,,2.0
2,83734.0,1.0,1.0,1.0
3,83735.0,2.0,,2.0
4,83736.0,2.0,,2.0


In [74]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Blood_Pressure_Cholesterol_df = Blood_Pressure_Cholesterol_df.merge(age_df, on="SEQN")
Blood_Pressure_Cholesterol_df = Blood_Pressure_Cholesterol_df.drop(
    index=Blood_Pressure_Cholesterol_df.index[Blood_Pressure_Cholesterol_df['Age'] < 18]
)
Blood_Pressure_Cholesterol_df.head()

Unnamed: 0,SEQN,BPQ020,BPQ030,BPQ080,Age
0,83732.0,2.0,,2.0,62.0
1,83733.0,2.0,,2.0,53.0
2,83734.0,1.0,1.0,1.0,78.0
3,83735.0,2.0,,2.0,56.0
4,83736.0,2.0,,2.0,42.0


In [75]:
# Missing-value count per blood-pressure/cholesterol column, before cleaning.
for column in ['BPQ020', 'BPQ030', 'BPQ080']:
    print(Blood_Pressure_Cholesterol_df[column].isna().sum())

0
3911
0


In [80]:
# Clean the columns based on the game plan (1 = Yes is left untouched):
#   BPQ020: 2, 9 -> 0
#   BPQ030: 2, 7, 9, NaN -> 0
#   BPQ080: 2, 9 -> 0
# Each recoded column is assigned back to the DataFrame as a whole. The former
# `df[col].fillna(0, inplace=True)` acted on an intermediate Series: pandas 2.x
# warns about that chained in-place call, and with Copy-on-Write enabled it no
# longer changes the DataFrame at all.
bp_recodes = {
    'BPQ020': {2: 0, 9: 0},
    'BPQ030': {2: 0, 7: 0, 9: 0},
    'BPQ080': {2: 0, 9: 0},
}
for column, recode in bp_recodes.items():
    Blood_Pressure_Cholesterol_df[column] = Blood_Pressure_Cholesterol_df[column].replace(recode)

# BPQ030 is the only column with missing answers (see the null counts above).
Blood_Pressure_Cholesterol_df['BPQ030'] = Blood_Pressure_Cholesterol_df['BPQ030'].fillna(0)

In [81]:
# Missing-value count per column after cleaning.
for column in ['BPQ020', 'BPQ030', 'BPQ080']:
    print(Blood_Pressure_Cholesterol_df[column].isna().sum())

0
0
0


In [84]:
Blood_Pressure_Cholesterol_df['BPQ080'].value_counts()

0.0    4043
1.0    1949
Name: BPQ080, dtype: int64

In [85]:
Final_BLPR_CHOL_df = Blood_Pressure_Cholesterol_df.drop('Age', axis = 1)

In [86]:
# One-hot encode the three 0/1 columns. drop_first=True omits the indicator for
# each column's first category, leaving a single `<column>_1.0` indicator per
# column (see the head() output below).
need_dummies = ['BPQ020', 'BPQ030', 'BPQ080']
Final_BLPR_CHOL_df = pd.get_dummies(Final_BLPR_CHOL_df, columns = need_dummies, drop_first = True)

In [106]:
Final_BLPR_CHOL_df.head()

Unnamed: 0,SEQN,BPQ020_1.0,BPQ030_1.0,BPQ080_1.0
0,83732.0,0,0,0
1,83733.0,0,0,0
2,83734.0,1,1,1
3,83735.0,0,0,0
4,83736.0,0,0,0


In [90]:
Final_BLPR_CHOL_df.to_csv('Cleaned_BLPR_CHOL.csv')

<div class="alert alert-block alert-success">
    
# 3. Where do you get your food?
   * Ages 0+
   * CBD071 - Money spent at supermarket/grocery store
       * 777777, 999999, Nan -> Median
   * CBD111 - Money spent on food at other stores
       * 777777, 999999, Nan -> Median
   * CBD121 - Money spent on eating out
       * 777777, 999999, Nan -> Median
   * CBD131 - Money spent on carryout/delivered foods
       * 777777, 999999, Nan -> Median

In [53]:
# Keep only the SEQN merge key and the four money-spent columns from the notes above.
where_food_columns = ['SEQN', 'CBD071', 'CBD111', 'CBD121', 'CBD131']
Where_You_Get_Food_df = Preprocess_Q_Where_Get_Food_df.loc[:, where_food_columns]
print('Where do you get food Shape: {}'.format(Where_You_Get_Food_df.shape))
Where_You_Get_Food_df.head()

Where do you get food Shape: (9971, 5)


Unnamed: 0,SEQN,CBD071,CBD111,CBD121,CBD131
0,83732.0,300.0,5.397605e-79,125.0,5.397605e-79
1,83733.0,200.0,50.0,40.0,5.397605e-79
2,83734.0,400.0,5.397605e-79,5.397605e-79,5.397605e-79
3,83735.0,300.0,250.0,50.0,5.397605e-79
4,83736.0,300.0,5.397605e-79,200.0,5.397605e-79


In [54]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Where_You_Get_Food_df = Where_You_Get_Food_df.merge(age_df, on="SEQN")
Where_You_Get_Food_df = Where_You_Get_Food_df.drop(
    index=Where_You_Get_Food_df.index[Where_You_Get_Food_df['Age'] < 18]
)
print('Where do you get food Shape: {}'.format(Where_You_Get_Food_df.shape))
Where_You_Get_Food_df.head()

Where do you get food Shape: (5992, 6)


Unnamed: 0,SEQN,CBD071,CBD111,CBD121,CBD131,Age
0,83732.0,300.0,5.397605e-79,125.0,5.397605e-79,62.0
1,83733.0,200.0,50.0,40.0,5.397605e-79,53.0
2,83734.0,400.0,5.397605e-79,5.397605e-79,5.397605e-79,78.0
3,83735.0,300.0,250.0,50.0,5.397605e-79,56.0
4,83736.0,300.0,5.397605e-79,200.0,5.397605e-79,42.0


In [65]:
# Step 1: turn the 777777 / 999999 codes (anything above 9000) into NaN.
#         If this is skipped, those codes throw off the median used to fill
#         missing values in the next cell.
# Step 2: set every value below 1 to exactly 0 (the raw data stores zero as
#         5.397605e-79, as the head() output above shows).
#
# The assignments go through .loc on the DataFrame itself. Writing into
# `df[col].values[...]` only works while .values happens to be a writable view
# of the DataFrame's storage, which pandas does not guarantee (the array is
# read-only under Copy-on-Write). `np.nan` replaces `np.NaN`, an alias that was
# removed in NumPy 2.0.
for column in ['CBD071', 'CBD111', 'CBD121', 'CBD131']:
    Where_You_Get_Food_df.loc[Where_You_Get_Food_df[column] > 9000, column] = np.nan
    Where_You_Get_Food_df.loc[Where_You_Get_Food_df[column] < 1, column] = 0


In [66]:
# Replace every NaN with the median of its own column.
for column in ['CBD071', 'CBD111', 'CBD121', 'CBD131']:
    column_median = Where_You_Get_Food_df[column].median()
    Where_You_Get_Food_df[column] = Where_You_Get_Food_df[column].fillna(column_median)


In [67]:
# Missing-value count per money-spent column after the median fill.
for column in ['CBD071', 'CBD111', 'CBD121', 'CBD131']:
    print(Where_You_Get_Food_df[column].isna().sum())

0
0
0
0


In [68]:
Where_You_Get_Food_df.describe()

Unnamed: 0,SEQN,CBD071,CBD111,CBD121,CBD131,Age
count,5992.0,5992.0,5992.0,5992.0,5992.0,5992.0
mean,88676.950935,439.751836,126.201101,146.437417,28.139019,48.121996
std,2877.819691,480.392712,184.978989,233.654952,64.026335,18.521353
min,83732.0,0.0,0.0,0.0,0.0,18.0
25%,86175.75,200.0,0.0,34.0,0.0,32.0
50%,88659.5,380.0,60.0,100.0,0.0,48.0
75%,91174.25,514.0,180.0,200.0,35.0,63.0
max,93702.0,8400.0,2142.0,8400.0,1500.0,80.0


In [69]:
Final_Where_Food_Comes = Where_You_Get_Food_df.drop('Age', axis = 1)

In [70]:
Final_Where_Food_Comes.head()

Unnamed: 0,SEQN,CBD071,CBD111,CBD121,CBD131
0,83732.0,300.0,0.0,125.0,0.0
1,83733.0,200.0,50.0,40.0,0.0
2,83734.0,400.0,0.0,0.0,0.0
3,83735.0,300.0,250.0,50.0,0.0
4,83736.0,300.0,0.0,200.0,0.0


In [71]:
Final_Where_Food_Comes.to_csv('Where_You_Get_Your_Food.csv')

<div class="alert alert-block alert-danger">

# 4. General Health - Won't be using, too many missing values
   * Ages 1+
   * HSD010 - General health condition
       * Ages 12+
       * 2999 missing
       * Categorical 1-5
   * HSQ500 - SP have head cold or chest cold
       * Ages 0+
       * Missing 1001
       * 1 = Yes
       * 2 = No
       * 7, 9, NaN -> No
   * HSQ510 - SP have stomach or intestinal illness?
       * Ages 0+
       * Missing 1001
       * 1 = Yes
       * 2 = No
       * 7, 9, NaN -> No
   * HSQ520 - SP have flu, pneumonia, ear infection?
       * Ages 0+
       * Missing 1001
       * 1 = Yes
       * 2 = No
       * 7, 9, NaN -> No

In [92]:
# Keep only the SEQN merge key and the four general-health items from the notes above.
general_health_columns = ['SEQN', 'HSD010', 'HSQ500', 'HSQ510', 'HSQ520']
General_Health_df = Preprocess_Q_General_Health_df.loc[:, general_health_columns]
print('General Health Shape: {}'.format(General_Health_df.shape))
General_Health_df.head()

General Health Shape: (9165, 5)


Unnamed: 0,SEQN,HSD010,HSQ500,HSQ510,HSQ520
0,83732.0,3.0,2.0,2.0,2.0
1,83733.0,2.0,2.0,2.0,2.0
2,83734.0,4.0,2.0,1.0,2.0
3,83735.0,3.0,2.0,2.0,2.0
4,83736.0,4.0,2.0,2.0,2.0


In [107]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
General_Health_df = General_Health_df.merge(age_df, on="SEQN")
General_Health_df = General_Health_df.drop(
    index=General_Health_df.index[General_Health_df['Age'] < 18]
)
print('General Health Shape: {}'.format(General_Health_df.shape))
General_Health_df.head()

General Health Shape: (5735, 6)


Unnamed: 0,SEQN,HSD010,HSQ500,HSQ510,HSQ520,Age
0,83732.0,3.0,2.0,2.0,2.0,62.0
1,83733.0,2.0,2.0,2.0,2.0,53.0
2,83734.0,4.0,2.0,1.0,2.0,78.0
3,83735.0,3.0,2.0,2.0,2.0,56.0
4,83736.0,4.0,2.0,2.0,2.0,42.0


In [109]:
# Missing-value count per general-health column.
for column in ['HSD010', 'HSQ500', 'HSQ510', 'HSQ520']:
    print(General_Health_df[column].isna().sum())

521
521
521
521


##### Too many missing data points

<div class="alert alert-block alert-success">

# 5. Diet Behavior and Nutrition

* Game Plan
   * DBQ700 - How healthy is the diet
       * Categorical 1-5
       * 9 -> 3 (mode)
   * DBQ197 - Past 30 day milk product consumption
       * Categorical 0-4
       * < 1  --> 0
   * DBD895 - # of meals not home prepared in the past week
       * Range 0-21
       * 5555 = More than 21
       * 9999 = Don't know
       * less than 1  -> 0
       * 5555 -> 22
       * 9999 -> 0 (mode)
   * DBD900 - # of meals from fast food or pizza place
       * Range 0-21
       * 5555 = More than 21
       * 9999 = Don't know
       * Missing 1387
       * __DROPPING__
   * DBD905 - # of ready-to-eat foods in past 30 days
       * Range 0-90
       * 6666 = More than 90 (recoded to 95)
       * 7777, 9999 = Don't know
       * NaN -> 0
       * 7777 -> 0
       * 9999 -> 1
       * less than 1 -> 0
   * DBD910 - # of frozen meals/pizza in past 30 days
       * Range 0-90
       * 6666 = More than 90 (recoded to 95)
       * 7777, 9999 = Don't know
       * NaN -> 0
       * 7777 -> 0
       * 9999 -> 1
       * less than 1 -> 0

In [93]:
# Keep only the SEQN merge key and the six diet items from the game plan.
diet_behavior_nutrition_columns = ['SEQN', 'DBQ700', 'DBQ197', 'DBD895', 'DBD900', 'DBD905', 'DBD910']
Diet_Behavior_and_Nutrition_df = Preprocess_Q_Diet_Behavior_and_Nutrition_df.loc[:, diet_behavior_nutrition_columns]
print('Diet Behavior and Nutrition Shape: {}'.format(Diet_Behavior_and_Nutrition_df.shape))
Diet_Behavior_and_Nutrition_df.head()

Diet Behavior and Nutrition Shape: (9971, 7)


Unnamed: 0,SEQN,DBQ700,DBQ197,DBD895,DBD900,DBD905,DBD910
0,83732.0,3.0,3.0,1.0,5.397605e-79,2.0,5.397605e-79
1,83733.0,1.0,5.397605e-79,5.397605e-79,,5.397605e-79,5.397605e-79
2,83734.0,4.0,2.0,4.0,4.0,5.397605e-79,5.397605e-79
3,83735.0,4.0,5.397605e-79,3.0,1.0,11.0,15.0
4,83736.0,5.0,5.397605e-79,3.0,1.0,3.0,1.0


In [110]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Diet_Behavior_and_Nutrition_df = Diet_Behavior_and_Nutrition_df.merge(age_df, on="SEQN")
Diet_Behavior_and_Nutrition_df = Diet_Behavior_and_Nutrition_df.drop(
    index=Diet_Behavior_and_Nutrition_df.index[Diet_Behavior_and_Nutrition_df['Age'] < 18]
)
print('Diet, Behavior, and Nutrition Shape: {}'.format(Diet_Behavior_and_Nutrition_df.shape))
Diet_Behavior_and_Nutrition_df.head()


Diet, Behavior, and Nutrition Shape: (5992, 8)


Unnamed: 0,SEQN,DBQ700,DBQ197,DBD895,DBD900,DBD905,DBD910,Age
0,83732.0,3.0,3.0,1.0,5.397605e-79,2.0,5.397605e-79,62.0
1,83733.0,1.0,5.397605e-79,5.397605e-79,,5.397605e-79,5.397605e-79,53.0
2,83734.0,4.0,2.0,4.0,4.0,5.397605e-79,5.397605e-79,78.0
3,83735.0,4.0,5.397605e-79,3.0,1.0,11.0,15.0,56.0
4,83736.0,5.0,5.397605e-79,3.0,1.0,3.0,1.0,42.0


In [130]:
# Missing-value count for DBD910. The same check for the other diet columns
# is kept disabled:
# print (Diet_Behavior_and_Nutrition_df['DBQ700'].isnull().sum(axis = 0))
# print (Diet_Behavior_and_Nutrition_df['DBQ197'].isnull().sum(axis = 0))
# print (Diet_Behavior_and_Nutrition_df['DBD895'].isnull().sum(axis = 0))
# print (Diet_Behavior_and_Nutrition_df['DBD900'].isnull().sum(axis = 0))
# print (Diet_Behavior_and_Nutrition_df['DBD905'].isnull().sum(axis = 0))
print(Diet_Behavior_and_Nutrition_df['DBD910'].isna().sum())



8


In [134]:
# Recode the diet columns according to the game plan.
#
# All writes go through the DataFrame itself (.loc / whole-column assignment).
# The previous `df[col].values[...] = x` and `df[col].fillna(0, inplace=True)`
# forms wrote into an intermediate array/Series, which pandas does not
# guarantee to be a view of the DataFrame (and which stops working under
# Copy-on-Write).

# Values below 1 -> 0 (the raw data stores zero as 5.397605e-79).
for column in ['DBQ197', 'DBD895', 'DBD905', 'DBD910']:
    Diet_Behavior_and_Nutrition_df.loc[Diet_Behavior_and_Nutrition_df[column] < 1, column] = 0

# Special codes:
#   DBQ700: 9 -> 3 (mode)
#   DBD895: 5555 (more than 21) -> 22, 9999 (don't know) -> 0
#   DBD905 / DBD910: 6666 (more than 90) -> 95, 7777 -> 0, 9999 -> 1
diet_recodes = {
    'DBQ700': {9: 3},
    'DBD895': {5555: 22, 9999: 0},
    'DBD905': {6666: 95, 7777: 0, 9999: 1},
    'DBD910': {6666: 95, 7777: 0, 9999: 1},
}
for column, recode in diet_recodes.items():
    Diet_Behavior_and_Nutrition_df[column] = Diet_Behavior_and_Nutrition_df[column].replace(recode)

# DBD905 / DBD910: NaN -> 0
for column in ['DBD905', 'DBD910']:
    Diet_Behavior_and_Nutrition_df[column] = Diet_Behavior_and_Nutrition_df[column].fillna(0)

# DBD900 is dropped per the game plan. The drop used to be commented out after
# its first run, so a fresh top-to-bottom run kept the column even though the
# recorded outputs below no longer show it. errors='ignore' makes the drop safe
# to re-run once the column is already gone.
Diet_Behavior_and_Nutrition_df = Diet_Behavior_and_Nutrition_df.drop(columns='DBD900', errors='ignore')


In [136]:
# Diet_Behavior_and_Nutrition_df['DBQ700'].value_counts()
# Diet_Behavior_and_Nutrition_df['DBQ197'].value_counts()
# Diet_Behavior_and_Nutrition_df['DBD895'].value_counts()
# Diet_Behavior_and_Nutrition_df['DBD900'].value_counts()
# Diet_Behavior_and_Nutrition_df['DBD905'].value_counts()
# Diet_Behavior_and_Nutrition_df['DBD910'].value_counts()

In [138]:
Diet_Behavior_and_Nutrition_df.head()

Unnamed: 0,SEQN,DBQ700,DBQ197,DBD895,DBD905,DBD910,Age
0,83732.0,3.0,3.0,1.0,2.0,0.0,62.0
1,83733.0,1.0,0.0,0.0,0.0,0.0,53.0
2,83734.0,4.0,2.0,4.0,0.0,0.0,78.0
3,83735.0,4.0,0.0,3.0,11.0,15.0,56.0
4,83736.0,5.0,0.0,3.0,3.0,1.0,42.0


In [139]:
# Dummies
# One-hot encode the two categorical columns. drop_first=True omits the
# indicator for each column's first category (the head() output below shows
# DBQ700_2.0..5.0 and DBQ197_1.0..4.0).
need_dummies = ['DBQ700', 'DBQ197']
Diet_Behavior_and_Nutrition_df = pd.get_dummies(Diet_Behavior_and_Nutrition_df, columns = need_dummies, drop_first = True)


In [141]:
Diet_Behavior_and_Nutrition_df.head()

Unnamed: 0,SEQN,DBD895,DBD905,DBD910,Age,DBQ700_2.0,DBQ700_3.0,DBQ700_4.0,DBQ700_5.0,DBQ197_1.0,DBQ197_2.0,DBQ197_3.0,DBQ197_4.0
0,83732.0,1.0,2.0,0.0,62.0,0,1,0,0,0,0,1,0
1,83733.0,0.0,0.0,0.0,53.0,0,0,0,0,0,0,0,0
2,83734.0,4.0,0.0,0.0,78.0,0,0,1,0,0,1,0,0
3,83735.0,3.0,11.0,15.0,56.0,0,0,1,0,0,0,0,0
4,83736.0,3.0,3.0,1.0,42.0,0,0,0,1,0,0,0,0


In [142]:
Final_Diet_Behavior_and_Nutrition_df = Diet_Behavior_and_Nutrition_df.drop('Age', axis = 1)

In [144]:
Final_Diet_Behavior_and_Nutrition_df.to_csv('Cleaned_Diet_Behavior.csv')

<div class="alert alert-block alert-success">

# 6. Disabilities
   * Ages 1+
   * DLQ050 - Have serious difficulty walking?
       * 1 = Yes
       * 2 = No
       * 9 = Don't know
       * Game Plan:
           * 9 -> 0
           * 2 -> 0
           * Get dummies
   * Most of the data is depression related with 3600+ missing values 

In [145]:
# Keep only the SEQN merge key and DLQ050 (serious difficulty walking).
disabilities_columns = ['SEQN', 'DLQ050']
Disabilities_df = Preprocess_Q_Disabilities_df.loc[:, disabilities_columns]
print('Disabilities Shape: {}'.format(Disabilities_df.shape))
Disabilities_df.head()

Disabilities Shape: (9575, 2)


Unnamed: 0,SEQN,DLQ050
0,83732.0,2.0
1,83733.0,2.0
2,83734.0,2.0
3,83735.0,2.0
4,83736.0,1.0


In [146]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Disabilities_df = Disabilities_df.merge(age_df, on="SEQN")
Disabilities_df = Disabilities_df.drop(index=Disabilities_df.index[Disabilities_df['Age'] < 18])
print('Disabilities Shape: {}'.format(Disabilities_df.shape))
Disabilities_df.head()


Disabilities Shape: (5992, 3)


Unnamed: 0,SEQN,DLQ050,Age
0,83732.0,2.0,62.0
1,83733.0,2.0,53.0
2,83734.0,2.0,78.0
3,83735.0,2.0,56.0
4,83736.0,1.0,42.0


In [147]:
print (Disabilities_df['DLQ050'].isnull().sum(axis = 0))

0


In [150]:
Disabilities_df['DLQ050'].value_counts()

0.0    5110
1.0     882
Name: DLQ050, dtype: int64

In [149]:
# Game plan for DLQ050: 2 (No) and 9 (Don't know) both become 0; 1 (Yes) stays.
Disabilities_df.loc[Disabilities_df['DLQ050'].isin([2, 9]), 'DLQ050'] = 0


In [151]:
# Dummies
# One-hot encode DLQ050. drop_first=True omits the indicator for the first
# category, leaving the single DLQ050_1.0 column shown in the head() output below.
need_dummies = ['DLQ050']
Disabilities_df = pd.get_dummies(Disabilities_df, columns = need_dummies, drop_first = True)


In [152]:
Disabilities_df.head()

Unnamed: 0,SEQN,Age,DLQ050_1.0
0,83732.0,62.0,0
1,83733.0,53.0,0
2,83734.0,78.0,0
3,83735.0,56.0,0
4,83736.0,42.0,1


In [153]:
Final_Disabilities_df = Disabilities_df.drop('Age', axis = 1)

In [155]:
Final_Disabilities_df.to_csv('Cleaned_Disabilities.csv')

<div class="alert alert-block alert-success">

# 7. Health Insurance
   * HIQ011 - Covered by health insurance
       * 1 = Yes
       * 2 = No
       * 7 = Refuse
       * 9 = Don't know
       * Game Plan:
           * 2, 7, 9 -> 0
           * Dummies

In [95]:
# Keep only the SEQN merge key and HIQ011 (covered by health insurance).
health_insurance_columns = ['SEQN', 'HIQ011']
Health_Insurance_df = Preprocess_Q_Health_Insurance_df.loc[:, health_insurance_columns]
print('Health Insurance Shape: {}'.format(Health_Insurance_df.shape))
Health_Insurance_df.head()

Health Insurance Shape: (9971, 2)


Unnamed: 0,SEQN,HIQ011
0,83732.0,1.0
1,83733.0,2.0
2,83734.0,1.0
3,83735.0,1.0
4,83736.0,1.0


In [156]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Health_Insurance_df = Health_Insurance_df.merge(age_df, on="SEQN")
Health_Insurance_df = Health_Insurance_df.drop(
    index=Health_Insurance_df.index[Health_Insurance_df['Age'] < 18]
)
print('Health Insurcance Shape: {}'.format(Health_Insurance_df.shape))
Health_Insurance_df.head()

Health Insurcance Shape: (5992, 3)


Unnamed: 0,SEQN,HIQ011,Age
0,83732.0,1.0,62.0
1,83733.0,2.0,53.0
2,83734.0,1.0,78.0
3,83735.0,1.0,56.0
4,83736.0,1.0,42.0


In [157]:
print (Health_Insurance_df['HIQ011'].isnull().sum(axis = 0))

0


In [160]:
Health_Insurance_df['HIQ011'].value_counts()

1.0    4935
0.0    1057
Name: HIQ011, dtype: int64

In [159]:
# Game plan for HIQ011: 2 (No), 7 (Refuse) and 9 (Don't know) all become 0; 1 (Yes) stays.
Health_Insurance_df.loc[Health_Insurance_df['HIQ011'].isin([2, 7, 9]), 'HIQ011'] = 0


In [161]:
# Dummies
# One-hot encode HIQ011; drop_first=True omits the indicator for its first category.

need_dummies = ['HIQ011']
Health_Insurance_df = pd.get_dummies(Health_Insurance_df, columns = need_dummies, drop_first = True)

In [163]:
Final_Health_Insurance_df = Health_Insurance_df.drop('Age', axis = 1)

In [165]:
Final_Health_Insurance_df.to_csv('Cleaned_Health_Insurance.csv')

<div class="alert alert-block alert-success">

# 8. Access to Care
   * HUQ020 - Health now compared with 1 year ago
       * Categorical
       * 1 = Better
       * 2 = Worse
       * 3 = About the same
       * 9 = Don't know
       * Game plan:
           * 9 -> 3 (mode, didn't change)
           * Get Dummies
   * HUQ051 - #times receive healthcare over past year
       * Categorical: 0-8
       * Game Plan:
           * less than 1 -> 0
           * 77, 99 -> 2 (mode)
       

In [96]:
# Keep only the SEQN merge key and the two access-to-care items from the game plan.
access_to_care_columns = ['SEQN', 'HUQ020', 'HUQ051']
Access_to_Care_df = Preprocess_Q_Access_to_Care_df.loc[:, access_to_care_columns]
print('Access to Health Care Shape: {}'.format(Access_to_Care_df.shape))
Access_to_Care_df.head()

Access to Health Care Shape: (9971, 3)


Unnamed: 0,SEQN,HUQ020,HUQ051
0,83732.0,2.0,5.0
1,83733.0,3.0,5.397605e-79
2,83734.0,3.0,2.0
3,83735.0,3.0,4.0
4,83736.0,2.0,2.0


In [166]:
# Attach Age via SEQN (pd.merge defaults to an inner join), then remove every
# row whose Age is below 18.
Access_to_Care_df = Access_to_Care_df.merge(age_df, on="SEQN")
Access_to_Care_df = Access_to_Care_df.drop(
    index=Access_to_Care_df.index[Access_to_Care_df['Age'] < 18]
)
print('Access to Health Care Shape: {}'.format(Access_to_Care_df.shape))
Access_to_Care_df.head()

Access to Health Care Shape: (5992, 4)


Unnamed: 0,SEQN,HUQ020,HUQ051,Age
0,83732.0,2.0,5.0,62.0
1,83733.0,3.0,5.397605e-79,53.0
2,83734.0,3.0,2.0,78.0
3,83735.0,3.0,4.0,56.0
4,83736.0,2.0,2.0,42.0


In [167]:
# Missing-value count per access-to-care column.
for column in ['HUQ020', 'HUQ051']:
    print(Access_to_Care_df[column].isna().sum())

0
0


In [182]:
# Access_to_Care_df['HUQ020'].value_counts()
Access_to_Care_df['HUQ051'].value_counts()

2.0    1757
0.0    1058
1.0    1045
3.0     883
4.0     407
6.0     303
8.0     256
5.0     182
7.0     101
Name: HUQ051, dtype: int64

In [180]:
# Game Plan

# HUQ020: 9 (Don't know) -> 3 (About the same, the mode)
Access_to_Care_df.loc[Access_to_Care_df['HUQ020'] == 9, 'HUQ020'] = 3

# HUQ051: 77, 99 -> 2 (mode); values below 1 -> 0 (the raw data stores zero as
# 5.397605e-79).
# The "< 1" recode now goes through .loc on the DataFrame. Writing into
# `df[col].values[...]` only works while .values happens to be a writable view
# of the DataFrame's storage, which pandas does not guarantee (the array is
# read-only under Copy-on-Write).
Access_to_Care_df.loc[Access_to_Care_df['HUQ051'].isin([77, 99]), 'HUQ051'] = 2
Access_to_Care_df.loc[Access_to_Care_df['HUQ051'] < 1, 'HUQ051'] = 0

In [181]:
Access_to_Care_df.head()

Unnamed: 0,SEQN,HUQ020,HUQ051,Age
0,83732.0,2.0,5.0,62.0
1,83733.0,3.0,0.0,53.0
2,83734.0,3.0,2.0,78.0
3,83735.0,3.0,4.0,56.0
4,83736.0,2.0,2.0,42.0


In [183]:
Access_to_Care_df = Access_to_Care_df.drop('Age', axis = 1)

In [184]:
# Dummies
# One-hot encode both categorical columns; drop_first=True omits the indicator
# for each column's first category.

need_dummies = ['HUQ020', 'HUQ051']
Access_to_Care_df = pd.get_dummies(Access_to_Care_df, columns = need_dummies, drop_first = True)

In [186]:
Final_Access_to_Care_df = Access_to_Care_df

In [187]:
Final_Access_to_Care_df.to_csv('Cleaned_Access_to_Care.csv')

<div class="alert alert-block alert-danger">

# 9. Housing Characteristics

   * HOQ065 - Home owned, bought, rented, other
       * Missing 329
       * 1 = Own
       * 2 = Rent
       * 3 = Other
       * 7, 9 = Refused / Don't know

   * HOD050 - Number of rooms in home
       * Missing 329
       * Categorical: 1-13
       * 777 = refused (34, 5 rooms is the largest category)
       
   * Not going to use
       * Doesn't seem like an important factor

In [97]:
# Keep only the SEQN merge key and the two housing items described above.
housing_char_columns = ['SEQN', 'HOQ065', 'HOD050']
Housing_df = Preprocess_Q_Housing_df.loc[:, housing_char_columns]
print('Housing Shape: {}'.format(Housing_df.shape))
Housing_df.head()

Housing Shape: (9971, 3)


Unnamed: 0,SEQN,HOQ065,HOD050
0,83732.0,1.0,7.0
1,83733.0,1.0,6.0
2,83734.0,1.0,5.0
3,83735.0,1.0,4.0
4,83736.0,2.0,5.0


<div class="alert alert-block alert-success">

# 10. Income

   * INQ132 - Income from state/county cash assistance
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game Plan:
           * 2, 9, Nan -> 0
           * 7 -> 1

   * INDFMMPI - Family monthly poverty level index
       * __DROP__
           * too many missing values
   
   * INDFMMPC - Family monthly poverty level category
       * __DROP__
           * too many missing values
       
   * INQ320 - How do you get to the grocery store?
       * Categorical 1-9
       * Game Plan:
           * Nan -> 1 (mode)
           * 66, 77 -> 9 (other)

In [217]:
# Keep only the SEQN merge key and the two income items that are not dropped in the game plan.
income_columns = ['SEQN', 'INQ132', 'INQ320']
Income_df = Preprocess_Q_Income_df.loc[:, income_columns]
print('Income Shape: {}'.format(Income_df.shape))
Income_df.head()

Income Shape: (9971, 3)


Unnamed: 0,SEQN,INQ132,INQ320
0,83732.0,2.0,1.0
1,83733.0,2.0,1.0
2,83734.0,2.0,1.0
3,83735.0,2.0,1.0
4,83736.0,2.0,1.0


In [218]:
# Attach Age via SEQN (pd.merge defaults to an inner join), remove every row
# whose Age is below 18, report the shape, then discard the helper Age column.
Income_df = Income_df.merge(age_df, on="SEQN")
Income_df = Income_df.drop(index=Income_df.index[Income_df['Age'] < 18])
print('Income Shape: {}'.format(Income_df.shape))
Income_df = Income_df.drop(columns='Age')
Income_df.head()

Income Shape: (5992, 4)


Unnamed: 0,SEQN,INQ132,INQ320
0,83732.0,2.0,1.0
1,83733.0,2.0,1.0
2,83734.0,2.0,1.0
3,83735.0,2.0,1.0
4,83736.0,2.0,1.0


In [224]:
# Missing-value count per income column.
for column in ['INQ132', 'INQ320']:
    print(Income_df[column].isna().sum())


0
0


In [223]:
# Income_df['INQ132'].value_counts()
Income_df['INQ320'].value_counts()

1.0    4977
2.0     284
3.0     268
4.0     244
6.0      96
7.0      39
9.0      35
8.0      27
5.0      22
Name: INQ320, dtype: int64

In [221]:
# Game Plan

# INQ132: recode to a 0/1 flag -- 2 and 9 become 0, 7 becomes 1, NaN becomes 0.
# The result is assigned back to the column: a chained
# `Income_df[col].fillna(..., inplace=True)` works on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
Income_df['INQ132'] = Income_df['INQ132'].replace({2: 0, 7: 1, 9: 0}).fillna(0)

# INQ320: fold 66 and 77 into 9 ("other"); NaN becomes 1 (the mode).
Income_df['INQ320'] = Income_df['INQ320'].replace({66: 9, 77: 9}).fillna(1)


In [226]:
# One-hot encode the categorical income columns; drop_first removes the first
# level of each column.
need_dummies = ['INQ132', 'INQ320']
Income_df = pd.get_dummies(data=Income_df, columns=need_dummies, drop_first=True)

In [228]:
# Final name for the cleaned income frame (an alias of Income_df, not a copy).
Final_Income_df = Income_df

In [229]:
# Save the cleaned income features. to_csv also writes the index by default,
# which is read back later as an extra 'Unnamed: 0' column.
Final_Income_df.to_csv('Cleaned_Income.csv')

<div class="alert alert-block alert-success">

# 11. Medical Conditions

   * MCQ010 - Ever been told you have asthma
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game Plan:
           * 2,9 -> 0

   * MCQ080 - Doctor ever said you were overweight
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game plan:
           * 2,9 -> 0
   
   * MCQ220 - Ever told you had cancer or malignancy
       * 1 = Yes
       * 2 = No
       * Game plan:
           * 2,9, Nan -> 0


In [99]:
# Medical conditions: keep the SEQN merge key plus the three questions listed above.
medical_conditions_columns = ['SEQN', 'MCQ010', 'MCQ080', 'MCQ220']
Medical_Conditions_df = Preprocess_Q_Medical_Conditions_df.loc[:, medical_conditions_columns]
print(f'Medical Conditions Shape: {Medical_Conditions_df.shape}')
Medical_Conditions_df.head()

Medical Conditions Shape: (9575, 4)


Unnamed: 0,SEQN,MCQ010,MCQ080,MCQ220
0,83732.0,2.0,1.0,1.0
1,83733.0,2.0,2.0,2.0
2,83734.0,1.0,1.0,1.0
3,83735.0,2.0,1.0,2.0
4,83736.0,1.0,2.0,2.0


In [230]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
Medical_Conditions_df = Medical_Conditions_df.merge(age_df, on='SEQN')
under_18 = Medical_Conditions_df.index[Medical_Conditions_df['Age'] < 18]
Medical_Conditions_df = Medical_Conditions_df.drop(index=under_18)
print(f'Medical Conditions Shape: {Medical_Conditions_df.shape}')
Medical_Conditions_df = Medical_Conditions_df.drop(columns='Age')
Medical_Conditions_df.head()

Medical Conditions Shape: (5992, 5)


Unnamed: 0,SEQN,MCQ010,MCQ080,MCQ220
0,83732.0,2.0,1.0,1.0
1,83733.0,2.0,2.0,2.0
2,83734.0,1.0,1.0,1.0
3,83735.0,2.0,1.0,2.0
4,83736.0,1.0,2.0,2.0


In [237]:
# Number of missing values in each medical-conditions column.
for checked_col in ('MCQ010', 'MCQ080', 'MCQ220'):
    print(Medical_Conditions_df[checked_col].isna().sum())

0
0
0


In [240]:
# Medical_Conditions_df['MCQ010'].value_counts()
# Medical_Conditions_df['MCQ080'].value_counts()
# Medical_Conditions_df['MCQ220'].value_counts()

0.0    5081
1.0     911
Name: MCQ010, dtype: int64

In [235]:
# Game Plan

# MCQ010, MCQ080, MCQ220: 1 = Yes is kept; 2 (No) and 9 are recoded to 0,
# leaving 0/1 flags.
for mc_col in ('MCQ010', 'MCQ080', 'MCQ220'):
    Medical_Conditions_df[mc_col] = Medical_Conditions_df[mc_col].replace({2: 0, 9: 0})

# MCQ220 additionally maps NaN to 0. The result is assigned back to the column:
# a chained `df[col].fillna(..., inplace=True)` works on a temporary Series and
# does not update the frame under pandas Copy-on-Write.
Medical_Conditions_df['MCQ220'] = Medical_Conditions_df['MCQ220'].fillna(0)


In [241]:
# One-hot encode the recoded yes/no columns; drop_first removes the first
# level of each column.
need_dummies = ['MCQ010', 'MCQ080', 'MCQ220']
Medical_Conditions_df = pd.get_dummies(
    data=Medical_Conditions_df,
    columns=need_dummies,
    drop_first=True,
)

In [243]:
# Final name for the cleaned frame (an alias of Medical_Conditions_df, not a copy).
Final_Medical_Conditions_df = Medical_Conditions_df

In [244]:
# Save the cleaned medical-conditions features (the index is written as well by default).
Final_Medical_Conditions_df.to_csv('Cleaned_Medical_Conditions.csv')

<div class="alert alert-block alert-success">

# 12. Mental Health
   * Ages 18+
   * DPQ030 - Trouble sleeping or sleeping too much
       * Categorical 0-3
       * Game Plan:
           * Nan -> 0
           * 7 -> 2
           * 9 -> 1
           * less than 1 -> 0

   * DPQ040 - Feeling tired or having little energy
       * Categorical 0-3
       * Game Plan:
           * Nan -> 0
           * 7   -> 2
           * less than 1 -> 0
           
   * DPQ050 - Poor appetite or overeating
       * Categorical 0-3
       * Game Plan:
           * Nan -> 0
           * 9   -> 1
           * less than 1 -> 0


In [246]:
# Mental health: keep the SEQN merge key plus the three DPQ questions listed
# above, and continue under the short working name `df`.
mental_health_columns = ['SEQN', 'DPQ030', 'DPQ040', 'DPQ050']
Mental_Health_df = Preprocess_Q_Mental_Health_df.loc[:, mental_health_columns]
print(f'Mental Health Shape: {Mental_Health_df.shape}')
df = Mental_Health_df
df.head()

Mental Health Shape: (5735, 4)


Unnamed: 0,SEQN,DPQ030,DPQ040,DPQ050
0,83732.0,5.397605e-79,1.0,5.397605e-79
1,83733.0,5.397605e-79,5.397605e-79,1.0
2,83734.0,5.397605e-79,1.0,5.397605e-79
3,83735.0,2.0,2.0,1.0
4,83736.0,1.0,1.0,3.0


In [247]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Mental Health Shape: {df.shape}')
df = df.drop(columns='Age')
df.head()

Mental Health Shape: (5735, 5)


Unnamed: 0,SEQN,DPQ030,DPQ040,DPQ050
0,83732.0,5.397605e-79,1.0,5.397605e-79
1,83733.0,5.397605e-79,5.397605e-79,1.0
2,83734.0,5.397605e-79,1.0,5.397605e-79
3,83735.0,2.0,2.0,1.0
4,83736.0,1.0,1.0,3.0


In [253]:
# Number of missing values in each mental-health column.
for checked_col in ('DPQ030', 'DPQ040', 'DPQ050'):
    print(df[checked_col].isna().sum())


0
0
0


In [256]:
# Frequency table for one column at a time; swap which line is uncommented to
# inspect the other columns.
df['DPQ030'].value_counts()
#df['DPQ040'].value_counts()
#df['DPQ050'].value_counts()

0.0    3763
1.0    1197
3.0     419
2.0     356
Name: DPQ030, dtype: int64

In [252]:
# Game plan for the DPQ columns, applied in the same order for each column:
#   1. values below 1 (zeros arrive as a tiny float such as 5.397605e-79) -> 0
#   2. column-specific special codes are recoded
#   3. NaN -> 0
# Every step goes through df.loc or assigns the result back to the column.
# Writing into `df[col].values` or calling fillna(inplace=True) on `df[col]`
# is not guaranteed to reach the frame (under pandas Copy-on-Write the array
# is read-only and the inplace fill is a no-op).
dpq_recodes = {
    'DPQ030': {7: 2, 9: 1},
    'DPQ040': {7: 2},
    'DPQ050': {9: 1},
}
for col, special_codes in dpq_recodes.items():
    df.loc[df[col] < 1, col] = 0
    df[col] = df[col].replace(special_codes).fillna(0)

In [257]:
# One-hot encode the three DPQ columns; drop_first removes the first level of
# each column.
need_dummies = ['DPQ030', 'DPQ040', 'DPQ050']
df = pd.get_dummies(data=df, columns=need_dummies, drop_first=True)

In [258]:
# Preview the encoded frame.
df.head()

Unnamed: 0,SEQN,DPQ030_1.0,DPQ030_2.0,DPQ030_3.0,DPQ040_1.0,DPQ040_2.0,DPQ040_3.0,DPQ050_1.0,DPQ050_2.0,DPQ050_3.0
0,83732.0,0,0,0,1,0,0,0,0,0
1,83733.0,0,0,0,0,0,0,1,0,0
2,83734.0,0,0,0,1,0,0,0,0,0
3,83735.0,0,1,0,0,1,0,1,0,0
4,83736.0,1,0,0,1,0,0,0,0,1


In [259]:
# Final name for the cleaned mental-health frame (an alias of df, not a copy).
Final_Mental_Health_df = df

In [261]:
# Save the cleaned mental-health features (the index is written as well by default).
Final_Mental_Health_df.to_csv('Cleaned_Mental_Health.csv')

<div class="alert alert-block alert-success">

# 13. Physical Activity

   * PAQ635 - Walk or bicycle to get places, at least 10 mins
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game Plan:
           * 2, 9, NaN -> 0

   * PAQ640 - Number of days walk or bicycle
       * Range 1-7
       * Game Plan:
           * NaN -> 0
           * 99 -> 1
   
   * PAD645 - Minutes walk/bicycle for transportation
       * Range 10-1200
       * Game Plan:
           * 9999 -> Median
           * NaN -> 0
            
   * PAQ650 - Vigorous recreational activities
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game Plan:
           * 2, NaN -> 0
           * 9 -> 1
   
   * PAQ655 - Days vigorous recreational activities
       * Range 1-7
       * Game Plan:
           * NaN -> 0
   
   * PAD660 - Minutes vigorous recreational activities
       * Range 10-480
       * Game Plan:
           * 9999, NaN -> 0
   
   * PAQ665 - Moderate recreational activities
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game Plan:
           * 2, 9, NaN -> 0
   
   * PAQ670 - Days moderate recreational activities
       * Range 1-7
       * Game Plan:
           * 99, NaN -> 0
   
   * PAD675 - Minutes moderate recreational activities
       * Range 10-600
       * Game Plan:
           * 9999, NaN -> 0
       
   * PAD680 - Minutes sedentary activity
       * Range
       * Game Plan:
           * 7777, 9999 -> Nan
           * NaN -> Median
   
   * PAQ710 - Hours watch TV or videos past 30 days
       * Range 1-7
       * Game Plan:
           * 77 -> 5 (why would you refuse)
           * 99, NaN -> 2 (mode)
   
   * PAQ715 - Hours use computer past 30 days
       * Range 0-8
       * Game Plan:
           * NaN -> 0
           * 99 -> 1

In [262]:
# Physical activity: keep the SEQN merge key plus every column covered by the
# game plan above, and continue under the short working name `df`.
physical_avtivity_columns = [
    'SEQN',
    'PAQ635', 'PAQ640', 'PAD645',
    'PAQ650', 'PAQ655', 'PAD660',
    'PAQ665', 'PAQ670', 'PAD675',
    'PAD680', 'PAQ710', 'PAQ715',
]
Physical_Activity_df = Preprocess_Q_Physical_Activity_df.loc[:, physical_avtivity_columns]
print(f'Physical Shape: {Physical_Activity_df.shape}')
df = Physical_Activity_df
df.head()

Physical Shape: (9255, 13)


Unnamed: 0,SEQN,PAQ635,PAQ640,PAD645,PAQ650,PAQ655,PAD660,PAQ665,PAQ670,PAD675,PAD680,PAQ710,PAQ715
0,83732.0,2.0,,,2.0,,,1.0,6.0,30.0,480.0,5.0,5.397605e-79
1,83733.0,2.0,,,2.0,,,2.0,,,300.0,5.0,5.397605e-79
2,83734.0,2.0,,,2.0,,,2.0,,,480.0,5.0,8.0
3,83735.0,2.0,,,2.0,,,2.0,,,480.0,3.0,2.0
4,83736.0,2.0,,,2.0,,,2.0,,,540.0,4.0,5.0


In [263]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Physical Activity Shape: {df.shape}')
df = df.drop(columns='Age')
df.head()

Physical Activity Shape: (5992, 14)


Unnamed: 0,SEQN,PAQ635,PAQ640,PAD645,PAQ650,PAQ655,PAD660,PAQ665,PAQ670,PAD675,PAD680,PAQ710,PAQ715
0,83732.0,2.0,,,2.0,,,1.0,6.0,30.0,480.0,5.0,5.397605e-79
1,83733.0,2.0,,,2.0,,,2.0,,,300.0,5.0,5.397605e-79
2,83734.0,2.0,,,2.0,,,2.0,,,480.0,5.0,8.0
3,83735.0,2.0,,,2.0,,,2.0,,,480.0,3.0,2.0
4,83736.0,2.0,,,2.0,,,2.0,,,540.0,4.0,5.0


In [285]:
# Missing-value count, one column at a time: uncomment a name to include it.
for checked_col in (
    # 'PAQ635',
    # 'PAQ640',
    # 'PAD645',
    # 'PAQ650',
    # 'PAQ655',
    # 'PAD660',
    # 'PAQ665',
    # 'PAQ670',
    # 'PAD675',
    # 'PAD680',
    # 'PAQ710',
    'PAQ715',
):
    print(df[checked_col].isna().sum())

0


In [286]:
# Frequency table used to spot special codes (e.g. 99) in PAQ715.
df['PAQ715'].value_counts()

8.0     2076
0.0     1226
1.0      910
2.0      791
5.0      398
3.0      357
4.0      231
99.0       3
Name: PAQ715, dtype: int64

In [287]:
# Game Plan
#
# Recode special answer codes and fill missing values for every physical
# activity column, following the plan listed in the section header. Each
# result is assigned back to the column: a chained
# `df[col].fillna(..., inplace=True)` works on a temporary Series and does not
# update the frame under pandas Copy-on-Write.

col = 'PAQ635'  # 2, 9, NaN -> 0 (2 was previously recoded to 95 by mistake)
df[col] = df[col].replace({2: 0, 9: 0}).fillna(0)


col = 'PAQ640'  # 99 -> 1, NaN -> 0
df[col] = df[col].replace({99: 1}).fillna(0)


col = 'PAD645'  # 9999 -> 30, NaN -> 0
# NOTE(review): the plan says "9999 -> Median"; 30 is presumably the median
# observed when the notebook was written -- confirm.
df[col] = df[col].replace({9999: 30}).fillna(0)


col = 'PAQ650'  # 2, NaN -> 0; 9 -> 1
df[col] = df[col].replace({2: 0, 9: 1}).fillna(0)


col = 'PAQ655'  # NaN -> 0
df[col] = df[col].fillna(0)


col = 'PAD660'  # 9999, NaN -> 0
df[col] = df[col].replace({9999: 0}).fillna(0)


col = 'PAQ665'  # 2, 9, NaN -> 0
df[col] = df[col].replace({2: 0, 9: 0}).fillna(0)


col = 'PAQ670'  # 99, NaN -> 0
df[col] = df[col].replace({99: 0}).fillna(0)


col = 'PAD675'  # 9999, NaN -> 0
df[col] = df[col].replace({9999: 0}).fillna(0)


col = 'PAD680'  # 7777, 9999 -> NaN, then NaN -> median of the remaining values
df[col] = df[col].replace({7777: np.nan, 9999: np.nan})
df[col] = df[col].fillna(df[col].median())


col = 'PAQ710'  # 77 -> 5; 99, NaN -> 2 (mode)
df[col] = df[col].replace({77: 5, 99: 2}).fillna(2)


col = 'PAQ715'  # 99 -> 1, NaN -> 0
# Zeros arrive as a tiny float (5.397605e-79 in the preview above); normalise
# them so they form the same category as 0.
df.loc[df[col] < 1, col] = 0
df[col] = df[col].replace({99: 1}).fillna(0)


In [291]:
# One-hot encode the categorical activity columns; drop_first removes the
# first level of each column. The day/minute columns stay numeric.
need_dummies = ['PAQ635', 'PAQ650', 'PAQ665', 'PAQ710', 'PAQ715']
df = pd.get_dummies(data=df, columns=need_dummies, drop_first=True)

In [294]:
# Final name for the cleaned physical-activity frame (an alias of df, not a copy).
Final_Physical_Activities = df

In [295]:
# Save the cleaned physical-activity features (the index is written as well by default).
Final_Physical_Activities.to_csv('Cleaned_Physical_Activities.csv')

<div class="alert alert-block alert-success">

# 14. Sleep

   * SLQ310 - Usual wake time on weekdays or workdays
       * Range
       * Game Plan:
           * 99999 -> Nan -> mode
   
   * SLD012 - Sleep hours
       * Range 2 - 14.5
       * Game Plan:
           * Nan -> median
       
   * SLQ030 - How often do you snore?
       * Categorical 0-3
       * Game Plan:
           * Less than 1 -> 0
           * 7 -> 3
           * 9 stays as its own category
       
   * SLQ120 - How often feel overly sleepy during day?
       * Categorical 0-4
       * Game Plan:
           * Less than 1 -> 0
           * 9 -> 2 (mode)


In [370]:
# Sleep: keep the SEQN merge key plus the four columns listed above, and
# continue under the short working name `df`.
sleep_columns = ['SEQN', 'SLQ310', 'SLD012', 'SLQ030', 'SLQ120']
Sleep_df = Preprocess_Q_Sleep_df.loc[:, sleep_columns]
print(f'Sleep Shape: {Sleep_df.shape}')
df = Sleep_df
df.head()

Sleep Shape: (6327, 5)


Unnamed: 0,SEQN,SLQ310,SLD012,SLQ030,SLQ120
0,83732.0,b'05:00',5.5,2.0,3.0
1,83733.0,b'07:00',8.0,1.0,5.397605e-79
2,83734.0,b'05:30',7.0,5.397605e-79,3.0
3,83735.0,b'06:00',6.5,9.0,4.0
4,83736.0,b'06:00',,9.0,1.0


In [371]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Sleep Shape: {df.shape}')
df = df.drop(columns='Age')
df.head()

Sleep Shape: (5992, 6)


Unnamed: 0,SEQN,SLQ310,SLD012,SLQ030,SLQ120
0,83732.0,b'05:00',5.5,2.0,3.0
1,83733.0,b'07:00',8.0,1.0,5.397605e-79
2,83734.0,b'05:30',7.0,5.397605e-79,3.0
3,83735.0,b'06:00',6.5,9.0,4.0
4,83736.0,b'06:00',,9.0,1.0


In [380]:
# Number of missing values in each sleep column.
for checked_col in ('SLQ310', 'SLD012', 'SLQ030', 'SLQ120'):
    print(df[checked_col].isna().sum())



0
33
0
0


In [379]:
# df['SLQ310'].value_counts()
# df['SLD012'].value_counts()
# df['SLQ030'].value_counts()
# df['SLQ120'].value_counts()

In [374]:
col = 'SLQ310'
# Replace the 99999 sentinel with 06:00, then turn every bytes value into its
# string form (e.g. "b'06:00'"), which is the layout mins_after_midnight parses.
# The assignment goes through df.loc because writing into `df[col].values` is
# not guaranteed to reach the frame (the array is read-only under pandas
# Copy-on-Write).
df.loc[df[col] == b'99999', col] = b'06:00'
df['SLQ310'] = df['SLQ310'].apply(str)

In [375]:
def mins_after_midnight(time):
    """Convert a wake-time value to minutes after midnight.

    Parameters
    ----------
    time : str, bytes or number
        Normally the string form of a bytes clock time, e.g. "b'05:30'"
        (hours at characters 2-3, minutes at characters 5-6).  Raw bytes such
        as b'05:30' are accepted as well.  A value that is already numeric is
        treated as already converted and returned unchanged, so applying the
        function twice to the same column is harmless.

    Returns
    -------
    int
        hours * 60 + minutes, or 0 when the text has fewer than 5 characters
        (for example an empty time).
    """
    import numbers

    # Already converted (e.g. the conversion cell was run a second time).
    if isinstance(time, numbers.Real):
        return time

    # Normalise raw bytes to the "b'HH:MM'" layout parsed below.
    if isinstance(time, bytes):
        time = repr(time)

    split = list(time)
    if len(split) < 5:
        return 0

    hours = int(''.join(split[2:4]))
    mins = int(''.join(split[5:7]))
    return hours * 60 + mins

# Convert every wake time in SLQ310 to minutes after midnight.
df['SLQ310'] = df['SLQ310'].apply(mins_after_midnight)

In [376]:
# This cell repeats the conversion from the previous cell. The conversion is
# only meaningful while SLQ310 still holds text, so skip it once the column
# has already been turned into numeric minutes.
if df['SLQ310'].dtype == object:
    df['SLQ310'] = df['SLQ310'].apply(lambda x: mins_after_midnight(x))

In [381]:
# Game Plan
# Assignments go through df.loc (or replace the whole column): writing into
# `df[col].values` is not guaranteed to reach the frame, because the array is
# read-only under pandas Copy-on-Write.

col = 'SLD012'  # NaN -> median
df[col] = df[col].fillna(df[col].median())

col = 'SLQ030'  # less than 1 -> 0, 7 -> 3 (9 stays as its own category)
df.loc[df[col] < 1, col] = 0
df.loc[df[col] == 7, col] = 3

col = 'SLQ120'  # less than 1 -> 0, 9 -> 2 (mode)
df.loc[df[col] < 1, col] = 0
df.loc[df[col] == 9, col] = 2


In [384]:
# Summary statistics to sanity-check the cleaned ranges.
df.describe()

Unnamed: 0,SEQN,SLQ310,SLD012,SLQ030,SLQ120
count,5992.0,5992.0,5992.0,5992.0,5992.0
mean,88676.950935,403.55741,7.742824,1.988818,1.739653
std,2877.819691,131.484205,1.569494,2.27658,1.188456
min,83732.0,0.0,2.0,0.0,0.0
25%,86175.75,330.0,7.0,0.0,1.0
50%,88659.5,390.0,8.0,1.0,2.0
75%,91174.25,450.0,8.5,3.0,3.0
max,93702.0,1380.0,14.5,9.0,4.0


In [387]:
# One-hot encode the two categorical sleep columns; drop_first removes the
# first level of each column.
need_dummies = ['SLQ030', 'SLQ120']
df = pd.get_dummies(data=df, columns=need_dummies, drop_first=True)

In [388]:
# Preview the encoded frame.
df.head()

Unnamed: 0,SEQN,SLQ310,SLD012,SLQ030_1.0,SLQ030_2.0,SLQ030_3.0,SLQ030_9.0,SLQ120_1.0,SLQ120_2.0,SLQ120_3.0,SLQ120_4.0
0,83732.0,300,5.5,0,1,0,0,0,0,1,0
1,83733.0,420,8.0,1,0,0,0,0,0,0,0
2,83734.0,330,7.0,0,0,0,0,0,0,1,0
3,83735.0,360,6.5,0,0,0,1,0,0,0,1
4,83736.0,360,8.0,0,0,0,1,1,0,0,0


In [389]:
# Final name for the cleaned sleep frame (an alias of df, not a copy).
Final_Sleep_df = df

In [390]:
# Save the cleaned sleep features (the index is written as well by default).
Final_Sleep_df.to_csv('Cleaned_Sleep.csv')

<div class="alert alert-block alert-success">

# 15. Smoking

   * SMQ040 - Do you now smoke cigarettes?
       * Categorical
       * 1 = Everyday
       * 2 = Somedays
       * 3 = Not at all
       * Game Plan:
           * NaN -> 3

   * SMQ020 - Smoked at least 100 cigarettes in life
       * Categorical
       * 1 = Yes
       * 2 = No
       * Game Plan:
           * 7, 9 -> 1
           * 2 -> 0


In [391]:
# Smoking: keep the SEQN merge key plus SMQ040 and SMQ020, and continue under
# the short working name `df`.
smoking_columns = ['SEQN', 'SMQ040', 'SMQ020']
Smoking_df = Preprocess_Q_Smoking_df.loc[:, smoking_columns]
print(f'Smoking Shape: {Smoking_df.shape}')
df = Smoking_df
df.head()

Smoking Shape: (7001, 3)


Unnamed: 0,SEQN,SMQ040,SMQ020
0,83732.0,3.0,1.0
1,83733.0,1.0,1.0
2,83734.0,3.0,1.0
3,83735.0,,2.0
4,83736.0,,2.0


In [392]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Smoking Shape: {df.shape}')
df = df.drop(columns='Age')
df.head()

Smoking Shape: (5992, 4)


Unnamed: 0,SEQN,SMQ040,SMQ020
0,83732.0,3.0,1.0
1,83733.0,1.0,1.0
2,83734.0,3.0,1.0
3,83735.0,,2.0
4,83736.0,,2.0


In [393]:
# Number of missing values in each smoking column.
for checked_col in ('SMQ040', 'SMQ020'):
    print(df[checked_col].isna().sum())


3570
0


In [399]:
# Frequency table for one column at a time; swap which line is uncommented to
# inspect the other column.
df['SMQ040'].value_counts()
# df['SMQ020'].value_counts()

3.0    4892
1.0     832
2.0     268
Name: SMQ040, dtype: int64

In [396]:
# Game Plan

col = 'SMQ040'  # NaN -> 3 ("Not at all")
# Assign the filled column back: a chained `df[col].fillna(..., inplace=True)`
# works on a temporary Series and does not update the frame under pandas
# Copy-on-Write.
df[col] = df[col].fillna(3)


col = 'SMQ020'  # 7, 9 -> 1; 2 -> 0
df[col] = df[col].replace({7: 1, 9: 1, 2: 0})

In [400]:
# One-hot encode both smoking columns; drop_first removes the first level of
# each column.
need_dummies = ['SMQ040', 'SMQ020']
df = pd.get_dummies(data=df, columns=need_dummies, drop_first=True)

In [402]:
# Final name for the cleaned smoking frame (an alias of df, not a copy).
Final_Smoking_df = df

In [403]:
# Save the cleaned smoking features (the index is written as well by default).
Final_Smoking_df.to_csv('Cleaned_Smoking.csv')

<div class="alert alert-block alert-success">

# 16. Household Smoking

   * SMD460 - # of people who live here smoke tobacco?
       * Categorical 0-3
       * Game Plan:
           * NaN -> 0
           * 777 -> 3
           * less than 1 -> 0

   * SMD470 - # of people who smoke inside this home?
       * Categorical 0-3
       * Game Plan:
           * NaN -> 0
           * Less than 1 -> 0

In [404]:
# Household smoking: keep the SEQN merge key plus SMD460 and SMD470, and
# continue under the short working name `df`.
household_smoking_columns = ['SEQN', 'SMD460', 'SMD470']
Household_Smoking_df = Preprocess_Q_Household_Smoking_df.loc[:, household_smoking_columns]
print(f'Household Smoking Shape: {Household_Smoking_df.shape}')
df = Household_Smoking_df
df.head()

Household Smoking Shape: (9971, 3)


Unnamed: 0,SEQN,SMD460,SMD470
0,83732.0,5.397605e-79,
1,83733.0,1.0,1.0
2,83734.0,1.0,1.0
3,83735.0,5.397605e-79,
4,83736.0,3.0,3.0


In [405]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Household Smoking Shape: {df.shape}')
df = df.drop(columns='Age')
df.head()


Household Smoking Shape: (5992, 4)


Unnamed: 0,SEQN,SMD460,SMD470
0,83732.0,5.397605e-79,
1,83733.0,1.0,1.0
2,83734.0,1.0,1.0
3,83735.0,5.397605e-79,
4,83736.0,3.0,3.0


In [406]:
# Number of missing values in each household-smoking column.
for checked_col in ('SMD460', 'SMD470'):
    print(df[checked_col].isna().sum())


214
4320


In [410]:
# Frequency table for one column at a time; swap which line is uncommented to
# inspect the other column.
df['SMD460'].value_counts()
#df['SMD470'].value_counts()

5.397605e-79    4103
1.000000e+00    1100
2.000000e+00     430
3.000000e+00     142
7.770000e+02       3
Name: SMD460, dtype: int64

In [411]:
# Game Plan
# Every step goes through df.loc or assigns the result back to the column.
# Writing into `df[col].values` or calling fillna(inplace=True) on `df[col]`
# is not guaranteed to reach the frame (under pandas Copy-on-Write the array
# is read-only and the inplace fill is a no-op).

col = 'SMD460'  # less than 1 -> 0, 777 -> 3, NaN -> 0
df.loc[df[col] < 1, col] = 0
df[col] = df[col].replace({777: 3}).fillna(0)


col = 'SMD470'  # less than 1 -> 0, NaN -> 0
df.loc[df[col] < 1, col] = 0
df[col] = df[col].fillna(0)

In [412]:
# One-hot encode both household-smoking columns; drop_first removes the first
# level of each column.
need_dummies = ['SMD460', 'SMD470']
df = pd.get_dummies(data=df, columns=need_dummies, drop_first=True)

In [414]:
# Final name for the cleaned household-smoking frame (an alias of df, not a copy).
Final_Household_Smoking_df = df

In [415]:
# Save the cleaned household-smoking features (the index is written as well by default).
Final_Household_Smoking_df.to_csv('Cleaned_Household_Smoking.csv')

In [466]:
# Check the (rows, columns) of the saved frame.
Final_Household_Smoking_df.shape

(5992, 7)

<div class="alert alert-block alert-warning">

# 17. Weight History

   * WHD050 - Self-reported weight - 1 yr ago (pounds)
       * Range
       * Game Plan:
           * 7777, 9999 -> Current Weight
  
   * WHD140 - Self-reported greatest weight (pounds)
       * Range
       * Game Plan:
           * 7777 -> Current Weight
           * 9999 -> Current Weight
  
   * WHQ150 - Age when heaviest weight
       * Range
       * Game Plan:
           * Nan -> Current Age
           * 99999 -> Current Age
  
   * WHQ225 - Times lost 10 lbs or more to lose weight
       * Categorical 1-5
       * Game Plan:
           * 7, 9 -> 5 ('Never', mode)
   
   * WHQ030 - How do you consider your weight
       * Categorical 1-3
       * Game Plan:
           * 7 -> 1
           * 9 -> 3

In [464]:
# Weight history: keep the SEQN merge key plus the five columns covered by the
# game plan above, directly under the working name `df`.
weight_history_columns = ['SEQN', 'WHD050', 'WHD140', 'WHQ150', 'WHQ225', 'WHQ030']
df = Preprocess_Q_Weight_History_df.loc[:, weight_history_columns]
print(f'Weight History Shape: {df.shape}')
df.head()

Weight History Shape: (6327, 6)


Unnamed: 0,SEQN,WHD050,WHD140,WHQ150,WHQ225,WHQ030
0,83732.0,225.0,260.0,20.0,2.0,3.0
1,83733.0,191.0,205.0,51.0,5.0,1.0
2,83734.0,212.0,240.0,46.0,1.0,3.0
3,83735.0,220.0,240.0,55.0,2.0,1.0
4,83736.0,135.0,135.0,33.0,5.0,2.0


In [468]:
# Load the cleaned body measures; their 'Weight (kg)' column provides the
# current weight used by the weight-history game plan.
weight_df = pd.read_csv(filepath_or_buffer='Cleaned_Body_Measures.csv')
print(f'Body Measures: {weight_df.shape}')
weight_df.head()

Body Measures: (5735, 7)


Unnamed: 0.1,Unnamed: 0,SEQN,Weight (kg),Standing Height (cm),BMI,Waist Circumference (cm),Age
0,0,83732.0,94.8,184.5,27.8,101.1,62.0
1,1,83733.0,90.4,171.4,30.8,107.9,53.0
2,2,83734.0,83.4,170.1,28.8,116.5,78.0
3,3,83735.0,109.8,160.9,42.4,110.1,56.0
4,4,83736.0,55.2,164.9,20.3,80.4,42.0


In [467]:
# Attach Age via SEQN and remove rows with Age below 18. Unlike the other
# sections, Age is kept here because the game plan uses it as a fill value.
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Weight History Shape: {df.shape}')
# df = df.drop('Age', axis = 1)
df.head()

Weight History Shape: (5992, 7)


Unnamed: 0,SEQN,WHD050,WHD140,WHQ150,WHQ225,WHQ030,Age
0,83732.0,225.0,260.0,20.0,2.0,3.0,62.0
1,83733.0,191.0,205.0,51.0,5.0,1.0,53.0
2,83734.0,212.0,240.0,46.0,1.0,3.0,78.0
3,83735.0,220.0,240.0,55.0,2.0,1.0,56.0
4,83736.0,135.0,135.0,33.0,5.0,2.0,42.0


In [470]:
# Bring in the current weight (kg) for each SEQN; the default inner merge
# keeps only rows present in both frames.
current_weight = weight_df.loc[:, ['SEQN', 'Weight (kg)']]
df = df.merge(current_weight, on='SEQN')
print(f'Weight History Shape: {df.shape}')
df.head()

Weight History Shape: (5735, 8)


Unnamed: 0,SEQN,WHD050,WHD140,WHQ150,WHQ225,WHQ030,Age,Weight (kg)
0,83732.0,225.0,260.0,20.0,2.0,3.0,62.0,94.8
1,83733.0,191.0,205.0,51.0,5.0,1.0,53.0,90.4
2,83734.0,212.0,240.0,46.0,1.0,3.0,78.0,83.4
3,83735.0,220.0,240.0,55.0,2.0,1.0,56.0,109.8
4,83736.0,135.0,135.0,33.0,5.0,2.0,42.0,55.2


In [473]:
# Number of missing values in each weight-history column.
for checked_col in ('WHD050', 'WHD140', 'WHQ150', 'WHQ225', 'WHQ030'):
    print(df[checked_col].isna().sum())


0
0
0
0
0


In [478]:
# Frequency tables for each column. Only the last expression of a cell is
# displayed, so just the WHQ030 table is shown; the other results are discarded.
df['WHD050'].value_counts()
df['WHD140'].value_counts()
df['WHQ150'].value_counts()
df['WHQ225'].value_counts()
df['WHQ030'].value_counts()


1.0    2858
3.0    2532
2.0     345
Name: WHQ030, dtype: int64

In [472]:
# Game Plan

# WHD050 and WHD140 are self-reported weights in pounds, while the current
# weight merged in above is in kilograms, so convert it before using it as the
# fallback. (Previously WHD050 was filled with Age and WHD140 with the
# unconverted kilogram value.)
# NOTE(review): rows whose 'Weight (kg)' is itself missing stay NaN -- confirm
# whether that can occur in Cleaned_Body_Measures.csv.
LB_PER_KG = 2.20462
current_weight_lb = df['Weight (kg)'] * LB_PER_KG

# Each result is assigned back to the column: a chained
# `df[col].fillna(..., inplace=True)` works on a temporary Series and does not
# update the frame under pandas Copy-on-Write.

col = 'WHD050'  # 7777, 9999 -> current weight
df[col] = df[col].replace({7777: np.nan, 9999: np.nan}).fillna(current_weight_lb)

col = 'WHD140'  # 7777, 9999 -> current weight
df[col] = df[col].replace({7777: np.nan, 9999: np.nan}).fillna(current_weight_lb)

col = 'WHQ150'  # NaN, 99999 -> current age
df[col] = df[col].replace({99999: np.nan}).fillna(df['Age'])

col = 'WHQ225'  # 7, 9 -> 5 ('Never', mode)
df[col] = df[col].replace({7: 5, 9: 5})

col = 'WHQ030'  # 7 -> 1, 9 -> 3
df[col] = df[col].replace({7: 1, 9: 3})


In [479]:
# One-hot encode the two categorical weight-history columns; drop_first
# removes the first level of each column.
need_dummies = ['WHQ225', 'WHQ030']
df = pd.get_dummies(data=df, columns=need_dummies, drop_first=True)

In [480]:
# Preview the encoded frame (Age and 'Weight (kg)' are still present here).
df.head()

Unnamed: 0,SEQN,WHD050,WHD140,WHQ150,Age,Weight (kg),WHQ225_2.0,WHQ225_3.0,WHQ225_4.0,WHQ225_5.0,WHQ030_2.0,WHQ030_3.0
0,83732.0,225.0,260.0,20.0,62.0,94.8,1,0,0,0,0,1
1,83733.0,191.0,205.0,51.0,53.0,90.4,0,0,0,1,0,0
2,83734.0,212.0,240.0,46.0,78.0,83.4,0,0,0,0,0,1
3,83735.0,220.0,240.0,55.0,56.0,109.8,1,0,0,0,0,0
4,83736.0,135.0,135.0,33.0,42.0,55.2,0,0,0,1,1,0


In [481]:
# The helper columns used only for filling values are left out of the final frame.
helper_columns = ['Age', 'Weight (kg)']
Final_Weight_History_df = df.drop(columns=helper_columns)
Final_Weight_History_df.head()

Unnamed: 0,SEQN,WHD050,WHD140,WHQ150,WHQ225_2.0,WHQ225_3.0,WHQ225_4.0,WHQ225_5.0,WHQ030_2.0,WHQ030_3.0
0,83732.0,225.0,260.0,20.0,1,0,0,0,0,1
1,83733.0,191.0,205.0,51.0,0,0,0,1,0,0
2,83734.0,212.0,240.0,46.0,0,0,0,0,0,1
3,83735.0,220.0,240.0,55.0,1,0,0,0,0,0
4,83736.0,135.0,135.0,33.0,0,0,0,1,1,0


In [482]:
# Save the cleaned weight-history features (the index is written as well by default).
Final_Weight_History_df.to_csv('Cleaned_Weight_History.csv')

<div class="alert alert-block alert-warning">

# 18. Next Dataframe

# Hopefully Not!!


In [416]:
# Scratch selection of additional weight-history columns, continued under the
# working name `df`.
weight_history_columns = ['SEQN', 'WHD010', 'WHD020', 'WHD050', 'WHD080A', 'WHD080B', 'WHD080C']

Weight_History_df = Preprocess_Q_Weight_History_df.loc[:, weight_history_columns]
print(f'Weight History Shape: {Weight_History_df.shape}')
df = Weight_History_df
df.head()

Weight History Shape: (6327, 26)


Unnamed: 0,SEQN,WHD010,WHD020,WHD050,WHD080A,WHD080B,WHD080C,WHD080D,WHD080E,WHD080F,...,WHD080N,WHD080O,WHD080P,WHD080Q,WHD080R,WHD080S,WHD080T,WHD080U,WHQ225,WHD080S.1
0,83732.0,74.0,212.0,225.0,10.0,,,13.0,,,...,,41.0,,43.0,,,46.0,,2.0,
1,83733.0,68.0,193.0,191.0,,,,,,,...,,,,,,,,,5.0,
2,83734.0,69.0,182.0,212.0,10.0,,,,,,...,,,,43.0,,,,,1.0,
3,83735.0,64.0,220.0,220.0,10.0,11.0,12.0,,14.0,15.0,...,,,,43.0,,,46.0,,2.0,
4,83736.0,64.0,125.0,135.0,,,,,,,...,,,,,,,,,5.0,


In [417]:
# Attach Age via SEQN, remove rows with Age below 18, then discard the helper
# Age column (the shape is printed while Age is still present).
df = df.merge(age_df, on='SEQN')
under_18 = df.index[df['Age'] < 18]
df = df.drop(index=under_18)
print(f'Weight History Shape: {df.shape}')
df = df.drop(columns='Age')
df.head()

Weight History Shape: (5992, 27)


Unnamed: 0,SEQN,WHD010,WHD020,WHD050,WHD080A,WHD080B,WHD080C,WHD080D,WHD080E,WHD080F,...,WHD080N,WHD080O,WHD080P,WHD080Q,WHD080R,WHD080S,WHD080T,WHD080U,WHQ225,WHD080S.1
0,83732.0,74.0,212.0,225.0,10.0,,,13.0,,,...,,41.0,,43.0,,,46.0,,2.0,
1,83733.0,68.0,193.0,191.0,,,,,,,...,,,,,,,,,5.0,
2,83734.0,69.0,182.0,212.0,10.0,,,,,,...,,,,43.0,,,,,1.0,
3,83735.0,64.0,220.0,220.0,10.0,11.0,12.0,,14.0,15.0,...,,,,43.0,,,46.0,,2.0,
4,83736.0,64.0,125.0,135.0,,,,,,,...,,,,,,,,,5.0,


In [None]:
# NOTE(review): unexecuted template cell. 'INQ320' is an income column and is
# not in the weight_history_columns list selected above -- replace the column
# name before running.
print (df['INQ320'].isnull().sum(axis = 0))

In [None]:
# NOTE(review): unexecuted template cell; 'INQ320' looks like a placeholder
# carried over from the Income section -- replace before running.
df['INQ320'].value_counts()

In [None]:
# Unexecuted scratch template for cleaning a further dataframe: age filter,
# null/value-count checks, recoding of special codes, then dummy encoding.
# NOTE(review): the printed label, the 'INQ132'/'INQ320' column names, the
# empty `col` and the special codes below look like placeholders carried over
# from earlier sections -- replace them before running.
df = pd.merge(df, age_df, on="SEQN")
df = df.drop(df[df.Age < 18].index)
print ('Medical Conditions Shape: ' + str(df.shape))
df = df.drop('Age', axis = 1)
df.head()



print (df['INQ320'].isnull().sum(axis = 0))

df['INQ320'].value_counts()




# `col` is an empty placeholder; set it to the column being cleaned.
col = ''
df[col].values[df[col].values < 1] = 0
# NOTE(review): 95 looks like a placeholder replacement value -- confirm the
# intended target before reusing this line.
df.loc[df[col] == 6666, col] = 95
df.loc[df[col] == 7777, col] = 0
df.loc[df[col] == 9999, col] = 1
# Two alternative NaN fills (constant 0 vs. column median); keep only the one
# that matches the plan for the column.
df[col].fillna(0, inplace=True)
df[col] = df[col].fillna(df[col].median())


# Dummies

need_dummies = ['INQ132', 'INQ320']
df = pd.get_dummies(df, columns = need_dummies, drop_first = True)