# Read SAS Files
The following data can be found here:
https://wwwn.cdc.gov/nchs/nhanes/

In [1]:
# Read SAS file
import pandas as pd

In [2]:
# NHANES 2001-2002 vaccination Immunization
nhanes0102_Vaccination = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/IMQ_B.XPT", format = "xport")

In [3]:
# NHANES 2001-2002 core antibody surface antigen
nhanes0102_BG = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L02_B.XPT", format = "xport" )

In [4]:
# NHANES 2001-2002 surface antibody
nhanes0102_Antibody = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L02HBS_B.XPT", format = "xport")

In [5]:
nhanes0102_Vaccination = nhanes0102_Vaccination[['SEQN','IMQ020']]

In [6]:
nhanes0102_BG = nhanes0102_BG[['SEQN','LBXHBC','LBDHBG']]

In [7]:
nhanes0102_Antibody

Unnamed: 0,SEQN,LBXHBS
0,9966.0,2.0
1,9967.0,2.0
2,9968.0,2.0
3,9969.0,2.0
4,9970.0,
...,...,...
9593,20999.0,2.0
9594,21000.0,2.0
9595,21002.0,2.0
9596,21003.0,1.0


In [8]:
nhanes0102_No_vaccination = nhanes0102_Vaccination[(nhanes0102_Vaccination['IMQ020']==3.0) | (nhanes0102_Vaccination['IMQ020']==7.0) | (nhanes0102_Vaccination['IMQ020']==9.0)]

In [9]:
nhanes0102_No_vaccination

Unnamed: 0,SEQN,IMQ020
0,9966.0,3.0
1,9967.0,3.0
2,9968.0,3.0
3,9969.0,3.0
6,9972.0,3.0
...,...,...
11030,20996.0,3.0
11033,20999.0,3.0
11036,21002.0,9.0
11037,21003.0,9.0


In [10]:
nhanes0102_club_1 = pd.merge(nhanes0102_No_vaccination, nhanes0102_Antibody, on="SEQN", how = "left")

In [11]:
nhanes0102_club_2 = pd.merge(nhanes0102_club_1, nhanes0102_BG, on="SEQN", how = "left")

In [12]:
import pandas as pd
import pandas.io.sql as sqlio

In [13]:
pip install pandasql

Note: you may need to restart the kernel to use updated packages.


In [14]:
from pandasql import sqldf

In [15]:
query = '''
SELECT SEQN, (CASE WHEN LBXHBC = 1.0 THEN 'Positive' WHEN LBXHBC = 2.0 THEN 'Negative' WHEN LBXHBC IS NULL THEN 'Missing' END) AS Core_Antibody,
(CASE WHEN LBDHBG = 1.0 THEN 'Positive' WHEN LBDHBG = 2.0 THEN 'Negative' WHEN LBDHBG IS NULL THEN 'Missing' END) AS Surface_Antigen,
(CASE WHEN LBXHBS = 1.0 THEN 'Positive' WHEN LBXHBS = 2.0 THEN 'Negative' WHEN LBXHBS IS NULL THEN 'Missing' END) AS Surface_Antibody,
(CASE WHEN IMQ020 = 3.0 THEN 'No Doses' WHEN IMQ020 = 7.0 THEN 'Refused' WHEN IMQ020 = 9.0 THEN "Don't know" END) AS Vaccination
FROM nhanes0102_club_2
'''

In [16]:
nhanes0102_final = sqldf(query)

In [17]:
nhanes0102_final[nhanes0102_final['Surface_Antigen'] == 'Positive']

Unnamed: 0,SEQN,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination
426,10775.0,Positive,Positive,Negative,No Doses
624,11168.0,Positive,Positive,Negative,No Doses
1928,13655.0,Positive,Positive,Negative,Don't know
2455,14649.0,Positive,Positive,Positive,Don't know
2761,15193.0,Positive,Positive,Negative,No Doses
3561,16599.0,Positive,Positive,Negative,No Doses
3974,17391.0,Positive,Positive,Negative,No Doses
4550,18412.0,Positive,Positive,Negative,No Doses
4753,18811.0,Positive,Positive,Positive,No Doses
5598,20369.0,Positive,Positive,Negative,No Doses


In [18]:
query2 = '''
SELECT SEQN, Core_Antibody, Surface_Antigen, Surface_Antibody, Vaccination, 
(CASE WHEN Surface_Antigen = 'Positive' THEN 'HBV Infection' WHEN (Surface_Antibody = 'Positive') THEN 'Vaccine Immunity' 
WHEN (Surface_Antigen = 'Negative' and Core_Antibody ='Negative' and Surface_Antibody ='Negative') THEN 'No Infection'
WHEN (Core_Antibody = 'Positive' and Surface_Antibody = 'Negative') THEN 'HBV Exposure' END) AS resp_2 
FROM nhanes0102_final
'''

In [19]:
nhanes0102_final_resp = sqldf(query2)

In [20]:
nhanes0102_final_resp[nhanes0102_final_resp['resp_2'].isna()]

Unnamed: 0,SEQN,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2
6,9975.0,Missing,Missing,Missing,No Doses,
11,9981.0,Missing,Missing,Missing,No Doses,
18,10003.0,Missing,Missing,Missing,No Doses,
22,10011.0,Missing,Missing,Missing,Don't know,
27,10019.0,Missing,Missing,Missing,No Doses,
...,...,...,...,...,...,...
5911,20948.0,Missing,Missing,Missing,No Doses,
5913,20952.0,Missing,Missing,Missing,Don't know,
5916,20957.0,Missing,Missing,Missing,Don't know,
5925,20976.0,Missing,Missing,Missing,No Doses,


In [21]:
# NHANES 2017-2018 Hepatitis ever told
#nhanes9920_evertold = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/HEQ_H.XPT", format = "xport")

In [22]:
#nhanes1314_evertold = nhanes1314_evertold[['SEQN','HEQ010']]

In [23]:
# left outer join prior data with evertold
#nhanes1314_final_resp_comb = pd.merge(nhanes1314_final_resp, nhanes1314_evertold, on="SEQN", how = "left")

In [24]:
#nhanes1314_final_resp_comb

In [25]:
#query3 = '''
#SELECT SEQN, Core_Antibody, Surface_Antigen, Surface_Antibody, Vaccination, resp, HEQ010,
#(CASE WHEN resp IS NULL and HEQ010 = 1.0 THEN 'HBV Infection' WHEN resp IS NULL and HEQ010 = 2.0 THEN 'No Infection' ELSE resp END) AS resp_2
#FROM nhanes1314_final_resp_comb
#'''

In [26]:
#nhanes9920_final_resp_data = sqldf(query3)

In [27]:
nhanes0102_HBV = nhanes0102_final_resp[(nhanes0102_final_resp['resp_2']=='HBV Infection') | (nhanes0102_final_resp['resp_2']=='No Infection')]

In [28]:
nhanes0102_HBV

Unnamed: 0,SEQN,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2
0,9966.0,Negative,Negative,Negative,No Doses,No Infection
1,9967.0,Negative,Negative,Negative,No Doses,No Infection
2,9968.0,Negative,Negative,Negative,No Doses,No Infection
3,9969.0,Negative,Negative,Negative,No Doses,No Infection
4,9972.0,Negative,Negative,Negative,No Doses,No Infection
...,...,...,...,...,...,...
5935,20995.0,Negative,Negative,Negative,No Doses,No Infection
5936,20996.0,Negative,Negative,Negative,No Doses,No Infection
5937,20999.0,Negative,Negative,Negative,No Doses,No Infection
5938,21002.0,Negative,Negative,Negative,Don't know,No Infection


In [29]:
# NHANES 2017-2018 Demographic Variable
nhanes0102_Demographic = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/DEMO_B.XPT", format = "xport")

In [30]:
#changed columns 'RIDRETH1','DMQMILIT','DMDBORN', deleted 'DMQADFC' 
Demographic_Variables = ['SEQN','SDDSRVYR','RIAGENDR','RIDAGEYR','RIDRETH1','DMQMILIT','DMDBORN','DMDCITZN',
                        'DMDYRSUS','DMDEDUC2','DMDMARTL','WTINT2YR','WTMEC2YR','SDMVPSU','SDMVSTRA','INDFMPIR']

In [31]:
nhanes0102_Demographic = nhanes0102_Demographic[Demographic_Variables]

In [32]:
nhanes0102_HBV_Demo = pd.merge(nhanes0102_HBV, nhanes0102_Demographic, on="SEQN", how = "left")

In [33]:
nhanes0102_HBV_Demo

Unnamed: 0,SEQN,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH1,...,DMDBORN,DMDCITZN,DMDYRSUS,DMDEDUC2,DMDMARTL,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDFMPIR
0,9966.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,39.0,3.0,...,1.0,1.0,,4.0,3.0,85045.160060,91352.991726,2.0,22.0,2.93
1,9967.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,23.0,4.0,...,3.0,1.0,4.0,4.0,5.0,29465.456810,29456.680208,1.0,24.0,
2,9968.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,84.0,3.0,...,1.0,1.0,,2.0,2.0,20658.109377,27508.137821,2.0,20.0,0.68
3,9969.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,51.0,3.0,...,1.0,1.0,,5.0,1.0,75077.431586,78536.315892,2.0,18.0,5.00
4,9972.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,44.0,3.0,...,1.0,1.0,,3.0,1.0,93545.001858,93558.934760,1.0,26.0,4.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4096,20995.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,81.0,3.0,...,1.0,1.0,,4.0,1.0,14760.619349,18042.559276,1.0,17.0,2.76
4097,20996.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,77.0,3.0,...,1.0,1.0,,2.0,1.0,11870.275360,14213.310985,2.0,14.0,1.36
4098,20999.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,85.0,3.0,...,1.0,1.0,,3.0,1.0,9490.963134,10127.416649,2.0,22.0,0.80
4099,21002.0,Negative,Negative,Negative,Don't know,No Infection,2.0,1.0,10.0,5.0,...,1.0,1.0,,,,35628.840542,37077.391657,2.0,23.0,0.65


In [34]:
nhanes0102_Insurance = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/HIQ_B.XPT", format = "xport")

In [35]:
Insurance_Variables = ['SEQN','HID010','HID030A','HID030B','HID030C','HID030D','HID030E','HID040','HIQ210','HIQ220']

In [36]:
nhanes0102_Insurance = nhanes0102_Insurance[Insurance_Variables]

In [37]:
nhanes0102_HBV_Demo_Insurance = pd.merge(nhanes0102_HBV_Demo, nhanes0102_Insurance, on="SEQN", how = "left")

In [38]:
query4 = '''
SELECT *, 
(CASE WHEN HID010 = 2.0 THEN 1 ELSE 0 END) AS No_Insurance,
(CASE WHEN HID010 = 1.0 and HID030A = 1.0 THEN 1 ELSE 0 END) AS Private_Insurance,
(CASE WHEN HID010 = 1.0 and (HID030A IS NOT NULL OR HID030B IS NOT NULL OR HID030C IS NOT NULL OR HID030D IS NOT NULL OR 
HID030E IS NOT NULL) THEN 1 ELSE 0 END) Governement_Insurance
FROM nhanes0102_HBV_Demo_Insurance
'''

In [39]:
nhanes0102_HBV_Demo_Insurance = sqldf(query4)

In [40]:
nhanes0102_HBV_Demo_Insurance = nhanes0102_HBV_Demo_Insurance.drop(columns=['HID010','HID030A','HID030B','HID030C','HID030D','HID030E','HID040','HIQ210','HIQ220'])

In [41]:
nhanes0102_HBV_Demo_Insurance

Unnamed: 0,SEQN,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2,SDDSRVYR,RIAGENDR,RIDAGEYR,RIDRETH1,...,DMDEDUC2,DMDMARTL,WTINT2YR,WTMEC2YR,SDMVPSU,SDMVSTRA,INDFMPIR,No_Insurance,Private_Insurance,Governement_Insurance
0,9966.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,39.0,3.0,...,4.0,3.0,85045.160060,91352.991726,2.0,22.0,2.93,0,1,1
1,9967.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,23.0,4.0,...,4.0,5.0,29465.456810,29456.680208,1.0,24.0,,0,1,1
2,9968.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,84.0,3.0,...,2.0,2.0,20658.109377,27508.137821,2.0,20.0,0.68,0,0,1
3,9969.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,51.0,3.0,...,5.0,1.0,75077.431586,78536.315892,2.0,18.0,5.00,0,1,1
4,9972.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,44.0,3.0,...,3.0,1.0,93545.001858,93558.934760,1.0,26.0,4.74,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4096,20995.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,81.0,3.0,...,4.0,1.0,14760.619349,18042.559276,1.0,17.0,2.76,0,0,1
4097,20996.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,77.0,3.0,...,2.0,1.0,11870.275360,14213.310985,2.0,14.0,1.36,0,1,1
4098,20999.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,85.0,3.0,...,3.0,1.0,9490.963134,10127.416649,2.0,22.0,0.80,0,0,1
4099,21002.0,Negative,Negative,Negative,Don't know,No Infection,2.0,1.0,10.0,5.0,...,,,35628.840542,37077.391657,2.0,23.0,0.65,0,0,1


In [42]:
#Alcohol use
nhanes0102_Alcohol = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/ALQ_B.XPT", format = "xport")

In [43]:
nhanes0102_Alcohol = nhanes0102_Alcohol[['SEQN','ALQ130']]

In [44]:
nhanes0102 = pd.merge(nhanes0102_HBV_Demo_Insurance, nhanes0102_Alcohol, on="SEQN", how = "left")

In [45]:
#Healthcare Visits
nhanes0102_Healthcare = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/HUQ_B.XPT", format = "xport")

In [46]:
nhanes0102_Healthcare = nhanes0102_Healthcare [['SEQN','HUQ050']]

In [47]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Healthcare, on="SEQN", how = "left")

In [48]:
#Dialysis
nhanes0102_Dialysis = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/KIQ_U_B.XPT", format = "xport")

In [49]:
nhanes0102_Dialysis = nhanes0102_Dialysis[['SEQN','KIQ025']]

In [50]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Dialysis, on="SEQN", how = "left")

In [51]:
#Liver Condition
nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")

  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov

  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov

  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Liver_Condition = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")


In [52]:
nhanes0102_Liver_Condition = nhanes0102_Liver_Condition[['SEQN','MCQ160L']]

In [53]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Liver_Condition, on="SEQN", how = "left")

In [54]:
#Blood Transfusion
nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")

  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("ht

  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("ht

  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")
  nhanes0102_Blood_Transfusion = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/MCQ_B.XPT", format = "xport")


In [55]:
nhanes0102_Blood_Transfusion = nhanes0102_Blood_Transfusion[['SEQN','MCQ092']]

In [56]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Blood_Transfusion, on="SEQN", how = "left")

In [57]:
#Healthcare Visits time
nhanes0102_Healthcare_time = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/HUQ_B.XPT", format = "xport")

In [58]:
nhanes0102_Healthcare_time = nhanes0102_Healthcare_time[['SEQN','HUQ060']]

In [59]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Healthcare_time, on="SEQN", how = "left")

In [60]:
#Hepatitis C
nhanes0102_Hepatitis_C = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/HCQ_B.XPT", format = "xport")

In [61]:
nhanes0102_Hepatitis_C = nhanes0102_Hepatitis_C[['SEQN']]

In [62]:
import numpy as np
nhanes0102_Hepatitis_C['Hepatitis_C'] = "Yes"

In [63]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Hepatitis_C, on="SEQN", how = "left")

In [64]:
Biochemistry_Variables = ['SEQN','LBXSATSI','LBDSALSI','LBXSASSI']

In [65]:
#Biochemistry_Variables
nhanes0102_Biochemistry_Variables = pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L40_B.XPT", format = "xport")

In [66]:
nhanes0102_Biochemistry_Variables = nhanes0102_Biochemistry_Variables[Biochemistry_Variables]

In [67]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Biochemistry_Variables, on="SEQN", how = "left")

In [68]:
#Platelet count
nhanes0102_Platelet_count= pd.read_sas("https://wwwn.cdc.gov/Nchs/Nhanes/2001-2002/L25_B.XPT", format = "xport")

In [69]:
nhanes0102_Platelet_count = nhanes0102_Platelet_count[['SEQN','LBXPLTSI']]

In [70]:
nhanes0102 = pd.merge(nhanes0102, nhanes0102_Platelet_count, on="SEQN", how = "left")

In [71]:
nhanes0102 = nhanes0102[nhanes0102['RIDAGEYR']>=18]

In [87]:
pd.set_option('display.max_columns', None)
nhanes0102.head()

Unnamed: 0,Respondent_sequence_number,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2,Data_release_cycle,Gender,Age,Race_Ethnicity,Veteran_Status,Country_of_birth,Citizenship_status,Length_of_time_in_US,Education_level,Marital_status,Two_year_interview_weight,Two_year_MEC_weight,Masked_variance_PSU,Masked_variance_stratum,Ratio_income_poverty,No_Insurance,Private_Insurance,Governement_Insurance,Alcohol_1_year,Health_care_1_year,Dialysis_1_year,liver_condition,Blood_Transfusion,Last_Healthcare_Visit,Hepatitis_C,ALT,Albumin,AST,Platelet_Count
0,9966.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,39.0,3.0,2.0,1.0,1.0,,4.0,3.0,85045.16006,91352.991726,2.0,22.0,2.93,0,1,1,2.0,1.0,,2.0,2.0,,,20.0,41.0,24.0,368.0
1,9967.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,23.0,4.0,2.0,3.0,1.0,4.0,4.0,5.0,29465.45681,29456.680208,1.0,24.0,,0,1,1,1.0,2.0,,2.0,2.0,,,54.0,45.0,36.0,247.0
2,9968.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,84.0,3.0,2.0,1.0,1.0,,2.0,2.0,20658.109377,27508.137821,2.0,20.0,0.68,0,0,1,1.0,2.0,,2.0,2.0,,,12.0,38.0,19.0,305.0
3,9969.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,51.0,3.0,2.0,1.0,1.0,,5.0,1.0,75077.431586,78536.315892,2.0,18.0,5.0,0,1,1,2.0,3.0,,2.0,2.0,,,21.0,46.0,25.0,239.0
4,9972.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,44.0,3.0,2.0,1.0,1.0,,3.0,1.0,93545.001858,93558.93476,1.0,26.0,4.74,0,1,1,,1.0,,2.0,2.0,,,65.0,43.0,49.0,318.0


In [91]:
nhanes0102[nhanes0102['Health_care_1_year']<1.0]

Unnamed: 0,Respondent_sequence_number,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2,Data_release_cycle,Gender,Age,Race_Ethnicity,Veteran_Status,Country_of_birth,Citizenship_status,Length_of_time_in_US,Education_level,Marital_status,Two_year_interview_weight,Two_year_MEC_weight,Masked_variance_PSU,Masked_variance_stratum,Ratio_income_poverty,No_Insurance,Private_Insurance,Governement_Insurance,Alcohol_1_year,Health_care_1_year,Dialysis_1_year,liver_condition,Blood_Transfusion,Last_Healthcare_Visit,Hepatitis_C,ALT,Albumin,AST,Platelet_Count
6,9976.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,36.0,3.0,1.0,1.0,1.0,,4.0,6.0,36890.158715,38187.811515,2.0,24.0,5.397605e-79,1,0,0,3.0,5.397605e-79,,2.0,2.0,3.0,Yes,80.0,45.0,83.0,249.0
12,10001.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,28.0,1.0,2.0,2.0,2.0,3.0,2.0,1.0,25219.139753,25308.120576,2.0,16.0,1.800000e-01,1,0,0,5.0,5.397605e-79,,2.0,2.0,4.0,,30.0,48.0,25.0,244.0
22,10024.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,40.0,3.0,2.0,1.0,1.0,,4.0,5.0,91543.856806,92967.953387,2.0,18.0,2.510000e+00,0,1,1,2.0,5.397605e-79,,2.0,2.0,3.0,,15.0,38.0,16.0,264.0
31,10052.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,24.0,1.0,2.0,1.0,1.0,,3.0,5.0,17053.055646,17433.169930,2.0,28.0,2.730000e+00,0,1,1,4.0,5.397605e-79,,2.0,2.0,3.0,,28.0,44.0,18.0,220.0
41,10083.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,38.0,4.0,2.0,1.0,1.0,,3.0,1.0,36848.537473,37509.223546,2.0,22.0,3.310000e+00,0,1,1,6.0,5.397605e-79,,2.0,2.0,3.0,,21.0,43.0,22.0,247.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4092,20986.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,44.0,3.0,1.0,1.0,1.0,,4.0,3.0,72469.744912,77844.853282,2.0,17.0,5.000000e+00,0,1,1,4.0,5.397605e-79,,2.0,1.0,4.0,,22.0,48.0,22.0,224.0
4093,20988.0,Negative,Negative,Negative,No Doses,No Infection,2.0,1.0,40.0,4.0,1.0,1.0,1.0,,3.0,3.0,20476.869977,20973.448074,1.0,27.0,2.150000e+00,0,1,1,12.0,5.397605e-79,,2.0,2.0,3.0,,18.0,45.0,24.0,234.0
4094,20992.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,61.0,1.0,2.0,1.0,1.0,,4.0,1.0,3613.505353,3654.821486,1.0,17.0,3.770000e+00,0,0,1,1.0,5.397605e-79,,2.0,2.0,3.0,,23.0,46.0,19.0,315.0
4095,20994.0,Negative,Negative,Negative,No Doses,No Infection,2.0,2.0,67.0,4.0,2.0,1.0,1.0,,4.0,1.0,9859.996659,13039.051383,2.0,26.0,5.000000e+00,0,1,1,1.0,5.397605e-79,,2.0,2.0,2.0,,20.0,41.0,19.0,373.0


In [78]:
nhanes0102 = nhanes0102.rename(columns = {'SEQN':'Respondent_sequence_number',
                            'SDDSRVYR':'Data_release_cycle',
                            'RIAGENDR':'Gender',
                            'RIDAGEYR':'Age',
                            'RIDRETH1':'Race_Ethnicity',
                            'DMQMILIT':'Veteran_Status',
                            'DMDBORN':'Country_of_birth',
                            'DMDCITZN':'Citizenship_status',
                            'DMDYRSUS':'Length_of_time_in_US',
                            'DMDEDUC2':'Education_level',
                            'DMDMARTL':'Marital_status',
                            'WTINT2YR':'Two_year_interview_weight',
                            'WTMEC2YR':'Two_year_MEC_weight',
                            'SDMVPSU':'Masked_variance_PSU',
                            'SDMVSTRA':'Masked_variance_stratum',
                            'INDFMPIR':'Ratio_income_poverty',
                            'ALQ130':'Alcohol_1_year',
                            'HUQ050':'Health_care_1_year',
                            'KIQ025':'Dialysis_1_year',
                            'MCQ160L':'liver_condition',
                            'MCQ092':'Blood_Transfusion',
                            'HUQ060':'Last_Healthcare_Visit',
                            'Hepatitis_C':'Hepatitis_C',
                            'LBXSATSI':'ALT',
                            'LBDSALSI':'Albumin',
                            'LBXSASSI':'AST',
                            'LBXPLTSI':'Platelet_Count'})

In [81]:
nhanes0102[nhanes0102['resp_2']=='HBV Infection'].shape

(10, 35)

In [93]:
column_mapping = {
    'Gender':{1.0:"male", 2.0:"Female", np.nan:"missing"},
    'Race_Ethnicity':{1.0:"Mexican American",2.0:"Other Hispanic",3.0:"Non-Hispanic White",4.0:"Non-Hispanic Black",5.0:"Other Race - Including Multi-Racial"},
    'Veteran_Status':{1.0:"Yes",2.0:"No",7.0:"Refused",9.0:"Don't know", np.nan:"missing"},
    'Country_of_birth':{1.0:"US", 2.0:"Mexico", 3.0:"Elsewhere",7.0:"Refused",9.0:"Don't know",np.nan:"missing"},
    'Citizenship_status':{1.0:"Citizen",2.0:"Not a citizen",7.0:"Refused",9.0:"Don't know",np.nan:"Missing"},
    'Length_of_time_in_US':{1.0:"Less than 1 year",2.0:"1-5",3.0:"5-10",4.0:"10-15",5.0:"15-20",6.0:"20-30",7.0:"30-40",8.0:"40-50",9.0:"50 or more",77.0:"Refused",88.0:"Could not determine",99.0:"Don't know",np.nan:"Missing"},
    'Education_level':{1.0:"Less Than 9th Grade",2.0:"9-12th Grade",3.0:"High School Grad/GED or Equivalent",4.0:"Some College or AA degree",5.0:"College Graduate or above",7.0:"Refused",9.0:"Don't Know",np.nan:"Missing"},
    'Marital_status':{1.0:"Married",2.0:"Widowed",3.0:"Divorced",4.0:"Separated",5.0:"Never married",6.0:"Living with partner",77.0:"Refused",99.0:"Don't know",np.nan:"Missing"},
    'Health_care_1_year':{5.397605e-79:"None",1.0:"1",2.0:"2 to 3",3.0:"4 to 9",4.0:"10 to 12",5.0:"13 or more",77.0:"Refused",99.0:"Don't know",np.nan:"Missing"},
    'Dialysis_1_year':{1.0:"Yes",2.0:"No",7.0:"Refused",9.0:"Don't know",np.nan:"Missing"},
    'liver_condition':{1.0:"Yes",2.0:"No",7.0:"Refused",9.0:"Don't know",np.nan:"Missing"},
    'Blood_Transfusion':{1.0:"Yes",2.0:"No",7.0:"Refused",9.0:"Don't know",np.nan:"Missing"},
    'Last_Healthcare_Visit':{1.0:"6 months or less",2.0:"More than 6 months, but not more than 1 year ago",3.0:"More than 1 year, but not more than 3 years ago",4.0:"More than 3 years",5.0:"Never",7.0:"Refused",9.0:"Don't know",np.nan:"Missing"},
    'Hepatitis_C':{'Yes':"Yes",np.nan:"Missing"}
}

nhanes0102.replace(column_mapping,inplace=True)

In [94]:
nhanes0102

Unnamed: 0,Respondent_sequence_number,Core_Antibody,Surface_Antigen,Surface_Antibody,Vaccination,resp_2,Data_release_cycle,Gender,Age,Race_Ethnicity,Veteran_Status,Country_of_birth,Citizenship_status,Length_of_time_in_US,Education_level,Marital_status,Two_year_interview_weight,Two_year_MEC_weight,Masked_variance_PSU,Masked_variance_stratum,Ratio_income_poverty,No_Insurance,Private_Insurance,Governement_Insurance,Alcohol_1_year,Health_care_1_year,Dialysis_1_year,liver_condition,Blood_Transfusion,Last_Healthcare_Visit,Hepatitis_C,ALT,Albumin,AST,Platelet_Count
0,9966.0,Negative,Negative,Negative,No Doses,No Infection,2.0,male,39.0,Non-Hispanic White,No,US,Citizen,Missing,Some College or AA degree,Divorced,85045.160060,91352.991726,2.0,22.0,2.93,0,1,1,2.0,1,Missing,No,No,Missing,Missing,20.0,41.0,24.0,368.0
1,9967.0,Negative,Negative,Negative,No Doses,No Infection,2.0,male,23.0,Non-Hispanic Black,No,Elsewhere,Citizen,10-15,Some College or AA degree,Never married,29465.456810,29456.680208,1.0,24.0,,0,1,1,1.0,2 to 3,Missing,No,No,Missing,Missing,54.0,45.0,36.0,247.0
2,9968.0,Negative,Negative,Negative,No Doses,No Infection,2.0,Female,84.0,Non-Hispanic White,No,US,Citizen,Missing,9-12th Grade,Widowed,20658.109377,27508.137821,2.0,20.0,0.68,0,0,1,1.0,2 to 3,Missing,No,No,Missing,Missing,12.0,38.0,19.0,305.0
3,9969.0,Negative,Negative,Negative,No Doses,No Infection,2.0,Female,51.0,Non-Hispanic White,No,US,Citizen,Missing,College Graduate or above,Married,75077.431586,78536.315892,2.0,18.0,5.00,0,1,1,2.0,4 to 9,Missing,No,No,Missing,Missing,21.0,46.0,25.0,239.0
4,9972.0,Negative,Negative,Negative,No Doses,No Infection,2.0,male,44.0,Non-Hispanic White,No,US,Citizen,Missing,High School Grad/GED or Equivalent,Married,93545.001858,93558.934760,1.0,26.0,4.74,0,1,1,,1,Missing,No,No,Missing,Missing,65.0,43.0,49.0,318.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4095,20994.0,Negative,Negative,Negative,No Doses,No Infection,2.0,Female,67.0,Non-Hispanic Black,No,US,Citizen,Missing,Some College or AA degree,Married,9859.996659,13039.051383,2.0,26.0,5.00,0,1,1,1.0,0.0,Missing,No,No,"More than 6 months, but not more than 1 year ago",Missing,20.0,41.0,19.0,373.0
4096,20995.0,Negative,Negative,Negative,No Doses,No Infection,2.0,Female,81.0,Non-Hispanic White,No,US,Citizen,Missing,Some College or AA degree,Married,14760.619349,18042.559276,1.0,17.0,2.76,0,0,1,1.0,4 to 9,Missing,No,No,Missing,Missing,23.0,47.0,22.0,314.0
4097,20996.0,Negative,Negative,Negative,No Doses,No Infection,2.0,male,77.0,Non-Hispanic White,No,US,Citizen,Missing,9-12th Grade,Married,11870.275360,14213.310985,2.0,14.0,1.36,0,1,1,,4 to 9,Missing,No,Yes,Missing,Missing,13.0,42.0,18.0,273.0
4098,20999.0,Negative,Negative,Negative,No Doses,No Infection,2.0,male,85.0,Non-Hispanic White,Yes,US,Citizen,Missing,High School Grad/GED or Equivalent,Married,9490.963134,10127.416649,2.0,22.0,0.80,0,0,1,,0.0,Missing,No,No,"More than 1 year, but not more than 3 years ago",Missing,18.0,42.0,25.0,227.0
