In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager
from collections import namedtuple
from sqlalchemy import create_engine

In [2]:
# Start a headless Chrome session under Splinter. ChromeDriverManager().install()
# returns the path of the chromedriver executable that Splinter should launch.
executable_path = dict(executable_path=ChromeDriverManager().install())
browser = Browser('chrome', headless=True, **executable_path)



Current google-chrome version is 91.0.4472
Get LATEST driver version for 91.0.4472
Driver [C:\Users\elizk\.wdm\drivers\chromedriver\win32\91.0.4472.101\chromedriver.exe] found in cache


In [3]:
# Open the HTML edition of the 2019 BRFSS LLCP codebook on cdc.gov in the
# browser session created above; `url` is kept as a variable so the source
# of the scraped data is recorded in the notebook.
url = 'https://www.cdc.gov/brfss/annual_data/2019/pdf/codebook19_llcp-v2-508.HTML'
browser.visit(url)

In [4]:
# Take the HTML of the currently loaded page from the browser and build a
# BeautifulSoup tree from it with the 'html.parser' backend.
html = browser.html
html_soup = soup(markup=html, features='html.parser')

In [5]:
# Lightweight records used while scraping the codebook:
#   HeaderInfo - label, SAS variable name and question text of one table header
#   RowInfo    - one body row: the coded value and its description
#   TableInfo  - a HeaderInfo together with the list of RowInfo for that table
HeaderInfo = namedtuple('HeaderInfo', 'label name question')
RowInfo = namedtuple('RowInfo', 'value value_label')
TableInfo = namedtuple('TableInfo', 'header_info body_info')

def get_value(header_line):
    """Return the text that follows the first ':' in ``header_line``.

    Non-breaking spaces (``\\xa0``) are converted to ordinary spaces and the
    result is stripped of surrounding whitespace. When the line contains no
    colon, the whole line is cleaned and returned.
    """
    before, colon, after = header_line.partition(':')
    # Without a colon, partition() leaves the entire line in `before`.
    remainder = after if colon else before
    cleaned = remainder.replace('\xa0', ' ')
    return cleaned.strip()

def get_header_info(thead):
    """Build a HeaderInfo from a codebook table's ``<thead>`` element.

    The first ``<td>`` of the first ``<tr>`` is read, and three fixed
    positions of its ``contents`` list are passed through ``get_value``:
    position 0 for the label, 12 for the SAS variable name and 16 for the
    question text.
    NOTE(review): these positions are hard-coded for the layout of the page
    being scraped; presumably the entries in between are other header fields
    and line breaks -- confirm before reusing on another codebook.
    """
    label_pos, sas_name_pos, question_pos = 0, 12, 16
    cell_nodes = thead.find('tr').find('td').contents
    return HeaderInfo(
        label=get_value(cell_nodes[label_pos]),
        name=get_value(cell_nodes[sas_name_pos]),
        question=get_value(cell_nodes[question_pos]),
    )
    
# Scrape every codebook table: its header (label, SAS variable name, question
# text) and, for each body row, the coded value with its description.

tables_uncut = html_soup.find_all('table', class_='table')

table_list = []
# The first matching table is skipped.
# NOTE(review): presumably it is not a variable table -- confirm.
for codebook_table in tables_uncut[1:]:
    tbody = codebook_table.find('tbody')
    parsed_header = get_header_info(codebook_table.find('thead'))
    # Only the first two cells of each row are used: value, then value label.
    answer_rows = [
        RowInfo(cells[0].get_text(separator=' '), cells[1].get_text(separator=' '))
        for cells in (tr.find_all('td') for tr in tbody.find_all('tr'))
    ]
    table_list.append(TableInfo(parsed_header, answer_rows))
    
  

In [6]:
# Spot-check a single scraped entry (index 22) to eyeball the parsed structure.
table_list[22]

TableInfo(header_info=HeaderInfo(label='Correct Phone Number?', name='CTELNUM1', question='Is this     (phone number)     ?'), body_info=[RowInfo(value='1', value_label='Yes—Go to CP.03, CELLFON5'), RowInfo(value='BLANK', value_label='Not asked or Missing Notes: QSTVER < = 20')])

In [7]:

from getpass import getpass
from urllib.parse import quote_plus

# Ask for the password interactively so it is never stored in the notebook.
password = getpass('Enter database password')
# The password is URL-encoded before being placed in the connection URL:
# characters such as '@', '/', ':' or '#' would otherwise be interpreted as
# URL delimiters and produce a malformed or mis-parsed connection string.
db = create_engine(f'postgresql://postgres:{quote_plus(password)}@localhost:5432/BRFSSAnalysis')

Enter database password········


In [None]:
# Database schema: question_info / question_values hold the scraped codebook,
# user_answers has one NUMERIC column per SAS variable name
# (presumably one row per survey response -- confirm against the loader).
#
# Dropping and recreating the tables is destructive, so it is opt-in. With
# RESET_SCHEMA left at False this cell only defines SCHEMA_DDL and does not
# touch the database. Set it to True for the first run against an empty
# database, or to rebuild all three tables from scratch.
RESET_SCHEMA = False

SCHEMA_DDL = """
CREATE TABLE question_info (
    id SERIAL,
    var_name VARCHAR(8) NOT NULL,
    label TEXT NOT NULL,
    text TEXT NOT NULL,
    PRIMARY KEY (id),
    UNIQUE (var_name)
);

CREATE TABLE user_answers (
id SERIAL,
_STATE NUMERIC,
FMONTH NUMERIC,
IDATE NUMERIC,
IMONTH NUMERIC,
IDAY NUMERIC,
IYEAR NUMERIC,
DISPCODE NUMERIC,
SEQNO  NUMERIC,
_PSU  NUMERIC,
CTELENM1 NUMERIC,
PVTRESD1 NUMERIC,
COLGHOUS NUMERIC,
STATERE1 NUMERIC,
CELPHONE NUMERIC,
LADULT1 NUMERIC,
COLGSEX NUMERIC,
NUMADULT NUMERIC,
LANDSEX NUMERIC,
NUMMEN NUMERIC,
NUMWOMEN NUMERIC,
RESPSLCT NUMERIC,
SAFETIME NUMERIC,
CTELNUM1 NUMERIC,
CELLFON5 NUMERIC,
CADULT1 NUMERIC,
CELLSEX NUMERIC,
PVTRESD3 NUMERIC,
CCLGHOUS NUMERIC,
CSTATE1 NUMERIC,
LANDLINE NUMERIC,
HHADULT NUMERIC,
SEXVAR NUMERIC,
GENHLTH NUMERIC,
PHYSHLTH NUMERIC,
MENTHLTH NUMERIC,
POORHLTH NUMERIC,
HLTHPLN1 NUMERIC,
PERSDOC2 NUMERIC,
MEDCOST NUMERIC,
CHECKUP1 NUMERIC,
BPHIGH4 NUMERIC,
BPMEDS NUMERIC,
CHOLCHK2 NUMERIC,
TOLDHI2 NUMERIC,
CHOLMED2 NUMERIC,
CVDINFR4 NUMERIC,
CVDCRHD4 NUMERIC,
CVDSTRK3 NUMERIC,
ASTHMA3 NUMERIC,
ASTHNOW NUMERIC,
CHCSCNCR NUMERIC,
CHCOCNCR NUMERIC,
CHCCOPD2 NUMERIC,
ADDEPEV3 NUMERIC,
CHCKDNY2 NUMERIC,
DIABETE4 NUMERIC,
DIABAGE3 NUMERIC,
HAVARTH4 NUMERIC,
ARTHEXER NUMERIC,
ARTHEDU NUMERIC,
LMTJOIN3 NUMERIC,
ARTHDIS2 NUMERIC,
JOINPAI2 NUMERIC,
MARITAL NUMERIC,
EDUCA NUMERIC,
RENTHOM1 NUMERIC,
NUMHHOL3 NUMERIC,
NUMPHON3 NUMERIC,
CPDEMO1B NUMERIC,
VETERAN3 NUMERIC,
EMPLOY1 NUMERIC,
CHILDREN NUMERIC,
INCOME2 NUMERIC,
WEIGHT2 NUMERIC,
HEIGHT3 NUMERIC,
PREGNANT NUMERIC,
DEAF NUMERIC,
BLIND NUMERIC,
DECIDE NUMERIC,
DIFFWALK NUMERIC,
DIFFDRES NUMERIC,
DIFFALON NUMERIC,
SMOKE100 NUMERIC,
SMOKDAY2 NUMERIC,
STOPSMK2 NUMERIC,
LASTSMK2 NUMERIC,
USENOW3 NUMERIC,
ALCDAY5 NUMERIC,
AVEDRNK3 NUMERIC,
DRNK3GE5 NUMERIC,
MAXDRNKS NUMERIC,
EXERANY2 NUMERIC,
EXRACT11 NUMERIC,
EXEROFT1 NUMERIC,
EXERHMM1 NUMERIC,
EXRACT21 NUMERIC,
EXEROFT2 NUMERIC,
EXERHMM2 NUMERIC,
STRENGTH NUMERIC,
FRUIT2 NUMERIC,
FRUITJU2 NUMERIC,
FVGREEN1 NUMERIC,
FRENCHF1 NUMERIC,
POTATOE1 NUMERIC,
VEGETAB2 NUMERIC,
FLUSHOT7 NUMERIC,
FLSHTMY3 NUMERIC,
TETANUS1 NUMERIC,
PNEUVAC4 NUMERIC,
HIVTST7 NUMERIC,
HIVTSTD3 NUMERIC,
HIVRISK5 NUMERIC,
PDIABTST NUMERIC,
PREDIAB1 NUMERIC,
INSULIN1 NUMERIC,
BLDSUGAR NUMERIC,
FEETCHK3 NUMERIC,
DOCTDIAB NUMERIC,
CHKHEMO3 NUMERIC,
FEETCHK NUMERIC,
EYEEXAM1 NUMERIC,
DIABEYE NUMERIC,
DIABEDU NUMERIC,
TOLDCFS NUMERIC,
HAVECFS NUMERIC,
WORKCFS NUMERIC,
TOLDHEPC NUMERIC,
TRETHEPC NUMERIC,
PRIRHEPC NUMERIC,
HAVEHEPC NUMERIC,
HAVEHEPB NUMERIC,
MEDSHEPB NUMERIC,
HPVADVC3 NUMERIC,
HPVADSHT NUMERIC,
IMFVPLA1 NUMERIC,
SHINGLE2 NUMERIC,
LCSFIRST NUMERIC,
LCSLAST NUMERIC,
LCSNUMCG NUMERIC,
LCSCTSCN NUMERIC,
HADMAM NUMERIC,
HOWLONG NUMERIC,
HADPAP2 NUMERIC,
LASTPAP2 NUMERIC,
HPVTEST NUMERIC,
HPLSTTST NUMERIC,
HADHYST2 NUMERIC,
PCPSAAD3 NUMERIC,
PCPSADI1 NUMERIC,
PCPSARE1 NUMERIC,
PSATEST1 NUMERIC,
PSATIME NUMERIC,
PCPSARS1 NUMERIC,
PCPSADE1 NUMERIC,
PCDMDEC1 NUMERIC,
BLDSTOOL NUMERIC,
LSTBLDS3 NUMERIC,
HADSIGM3 NUMERIC,
HADSGCO1 NUMERIC,
LASTSIG3 NUMERIC,
CNCRDIFF NUMERIC,
CNCRAGE NUMERIC,
CNCRTYP1 NUMERIC,
CSRVTRT3 NUMERIC,
CSRVDOC1 NUMERIC,
CSRVSUM NUMERIC,
CSRVRTRN NUMERIC,
CSRVINST NUMERIC,
CSRVINSR NUMERIC,
CSRVDEIN NUMERIC,
CSRVCLIN NUMERIC,
CSRVPAIN NUMERIC,
CSRVCTL2 NUMERIC,
HLTHCVR1 NUMERIC,
ASPIRIN NUMERIC,
HOMBPCHK NUMERIC,
HOMRGCHK NUMERIC,
WHEREBP NUMERIC,
SHAREBP NUMERIC,
WTCHSALT NUMERIC,
DRADVISE NUMERIC,
INDORTAN NUMERIC,
NUMBURN3 NUMERIC,
SUNPRTCT NUMERIC,
WKDAYOUT NUMERIC,
WKENDOUT NUMERIC,
CIMEMLOS NUMERIC,
CDHOUSE NUMERIC,
CDASSIST NUMERIC,
CDHELP NUMERIC,
CDSOCIAL NUMERIC,
CDDISCUS NUMERIC,
CAREGIV1 NUMERIC,
CRGVREL3 NUMERIC,
CRGVLNG1 NUMERIC,
CRGVHRS1 NUMERIC,
CRGVPRB3 NUMERIC,
CRGVALZD NUMERIC,
CRGVPER1 NUMERIC,
CRGVHOU1 NUMERIC,
CRGVEXPT NUMERIC,
ACEDEPRS NUMERIC,
ACEDRINK NUMERIC,
ACEDRUGS NUMERIC,
ACEPRISN NUMERIC,
ACEDIVRC NUMERIC,
ACEPUNCH NUMERIC,
ACEHURT1 NUMERIC,
ACESWEAR NUMERIC,
ACETOUCH NUMERIC,
ACETTHEM NUMERIC,
ACEHVSEX NUMERIC,
PFPPRVN3 NUMERIC,
TYPCNTR8 NUMERIC,
NOBCUSE7 NUMERIC,
ASBIALCH NUMERIC,
ASBIDRNK NUMERIC,
ASBIBING NUMERIC,
ASBIADVC NUMERIC,
ASBIRDUC NUMERIC,
MARIJAN1 NUMERIC,
USEMRJN2 NUMERIC,
RSNMRJN1 NUMERIC,
FOODSTMP NUMERIC,
BIRTHSEX NUMERIC,
SOMALE NUMERIC,
SOFEMALE NUMERIC,
TRNSGNDR NUMERIC,
RCSGENDR NUMERIC,
RCSRLTN2 NUMERIC,
CASTHDX2 NUMERIC,
CASTHNO2 NUMERIC,
QSTVER NUMERIC,
QSTLANG NUMERIC,
_METSTAT NUMERIC,
_URBSTAT NUMERIC,
MSCODE NUMERIC,
_STSTR NUMERIC,
_STRWT  NUMERIC,
_RAWRAKE  NUMERIC,
_WT2RAKE  NUMERIC,
_IMPRACE NUMERIC,
_CHISPNC NUMERIC,
_CRACE1 NUMERIC,
_CPRACE NUMERIC,
_CLLCPWT  NUMERIC,
_DUALUSE NUMERIC,
_DUALCOR  NUMERIC,
_LLCPWT2  NUMERIC,
_LLCPWT  NUMERIC,
_RFHLTH NUMERIC,
_PHYS14D NUMERIC,
_MENT14D NUMERIC,
_HCVU651 NUMERIC,
_RFHYPE5 NUMERIC,
_CHOLCH2 NUMERIC,
_RFCHOL2 NUMERIC,
_MICHD NUMERIC,
_LTASTH1 NUMERIC,
_CASTHM1 NUMERIC,
_ASTHMS1 NUMERIC,
_DRDXAR2 NUMERIC,
_LMTACT2 NUMERIC,
_LMTWRK2 NUMERIC,
_PRACE1 NUMERIC,
_MRACE1   NUMERIC,
_HISPANC NUMERIC,
_RACE NUMERIC,
_RACEG21 NUMERIC,
_RACEGR3 NUMERIC,
_RACE_G1 NUMERIC,
_SEX NUMERIC,
_AGEG5YR NUMERIC,
_AGE65YR NUMERIC,
_AGE80 NUMERIC,
_AGE_G NUMERIC,
HTIN4 NUMERIC,
HTM4 NUMERIC,
WTKG3 NUMERIC,
_BMI5 NUMERIC,
_BMI5CAT NUMERIC,
_RFBMI5 NUMERIC,
_CHLDCNT NUMERIC,
_EDUCAG NUMERIC,
_INCOMG NUMERIC,
_SMOKER3 NUMERIC,
_RFSMOK3 NUMERIC,
DRNKANY5 NUMERIC,
DROCDY3_ NUMERIC,
_RFBING5 NUMERIC,
_DRNKWK1 NUMERIC,
_RFDRHV7 NUMERIC,
_TOTINDA NUMERIC,
METVL11_ NUMERIC,
METVL21_ NUMERIC,
MAXVO21_ NUMERIC,
FC601_ NUMERIC,
ACTIN12_ NUMERIC,
ACTIN22_ NUMERIC,
PADUR1_ NUMERIC,
PADUR2_ NUMERIC,
PAFREQ1_ NUMERIC,
PAFREQ2_ NUMERIC,
_MINAC11 NUMERIC,
_MINAC21 NUMERIC,
STRFREQ_ NUMERIC,
PAMISS2_ NUMERIC,
PAMIN12_ NUMERIC,
PAMIN22_ NUMERIC,
PA2MIN_ NUMERIC,
PAVIG12_ NUMERIC,
PAVIG22_ NUMERIC,
PA2VIGM_ NUMERIC,
_PACAT2 NUMERIC,
_PAINDX2 NUMERIC,
_PA150R3 NUMERIC,
_PA300R3 NUMERIC,
_PA30022 NUMERIC,
_PASTRNG NUMERIC,
_PAREC2 NUMERIC,
_PASTAE2 NUMERIC,
FTJUDA2_ NUMERIC,
FRUTDA2_ NUMERIC,
GRENDA1_ NUMERIC,
FRNCHDA_ NUMERIC,
POTADA1_ NUMERIC,
VEGEDA2_ NUMERIC,
_MISFRT1 NUMERIC,
_MISVEG1 NUMERIC,
_FRTRES1 NUMERIC,
_VEGRES1 NUMERIC,
_FRUTSU1 NUMERIC,
_VEGESU1 NUMERIC,
_FRTLT1A NUMERIC,
_VEGLT1A NUMERIC,
_FRT16A NUMERIC,
_VEG23A NUMERIC,
_FRUITE1 NUMERIC,
_VEGETE1 NUMERIC,
_FLSHOT7 NUMERIC,
_PNEUMO3 NUMERIC,
_AIDTST4  NUMERIC,
 PRIMARY KEY (id)
);

CREATE TABLE question_values (
    id SERIAL,
    question_id INT NOT NULL,
    label TEXT  NOT NULL,
    value NUMERIC,
    value_end NUMERIC, -- if NULL, not relevant
    FOREIGN KEY (question_id) REFERENCES question_info (id),
    PRIMARY KEY (id),
    UNIQUE (question_id, value)
);

"""

if RESET_SCHEMA:
    # Drop first so the CREATE statements never collide with existing tables.
    db.execute("DROP TABLE IF EXISTS user_answers, question_values, question_info")
    db.execute(SCHEMA_DDL)

In [8]:

# Placeholder strings that appear in the scraped value column instead of a
# numeric code; both are stored as NULL.
QUESTION_VALUE_HIDDEN = 'HIDDEN'
QUESTION_VALUE_BLANK = 'BLANK'


# One row destined for the question_values table.
QuestionValueEntry = namedtuple('QuestionValueEntry', ['question_id', 'label', 'value', 'value_end'])


def generate_question_values_for_insert(question_id, row_info):
    """Translate one scraped codebook row into a ``QuestionValueEntry``.

    Args:
        question_id: id of the parent ``question_info`` row.
        row_info: object exposing string attributes ``value`` and
            ``value_label`` (e.g. a ``RowInfo``).

    Returns:
        QuestionValueEntry whose ``label`` is ``row_info.value_label`` and:
          * for a range such as ``"1 - 30"``: ``value``/``value_end`` are the
            integer bounds;
          * for ``HIDDEN`` / ``BLANK``: both are ``None``;
          * otherwise: ``value`` is the whitespace-stripped text of the cell
            (left as a string, as before) and ``value_end`` is ``None``.

    Raises:
        ValueError: if a value containing '-' does not split into exactly two
            parts, or a range bound is not an integer.
    """
    # Normalise once so every branch sees the same text; scraped cell text may
    # carry stray surrounding whitespace.
    raw_value = row_info.value.strip()

    if '-' in raw_value:
        bounds = [part.strip() for part in raw_value.split('-')]
        if len(bounds) != 2:
            raise ValueError(
                f'Cannot interpret {row_info.value!r} as a "start - end" range '
                f'(question_id={question_id})')
        start, end = (int(bound) for bound in bounds)
        return QuestionValueEntry(question_id, row_info.value_label, start, end)

    if raw_value in (QUESTION_VALUE_HIDDEN, QUESTION_VALUE_BLANK):
        # There is no answer code to store; corresponds to NULL.
        return QuestionValueEntry(question_id, row_info.value_label, None, None)

    return QuestionValueEntry(question_id, row_info.value_label, raw_value, None)

# Reload the codebook tables from scratch.
# All statements run on one connection inside a single transaction: if any
# insert fails, the TRUNCATE is rolled back too, instead of leaving the tables
# emptied or half-populated.
with db.begin() as connection:
    connection.execute("""TRUNCATE user_answers, question_values, question_info""")

    for table in table_list:
        # Insert the question and read back its generated primary key.
        result = connection.execute(
            """INSERT INTO question_info(var_name, label, text)
            VALUES (%s, %s, %s) RETURNING id""",
            (table.header_info.name, table.header_info.label, table.header_info.question))
        question_id = result.fetchone()[0]

        # Insert the possible values of this question.
        for row_info in table.body_info:
            entry = generate_question_values_for_insert(question_id, row_info)
            connection.execute(
                """INSERT INTO question_values(question_id, label, value, value_end) VALUES (%s, %s, %s, %s)""",
                (entry.question_id, entry.label, entry.value, entry.value_end))