In [149]:
#import dependencies 
import psycopg2
import pandas as pd
import numpy as np

In [3]:
from getpass import getpass
password = getpass("Enter database password")

param_dic = {
    "host" : "localhost",
    "database" : "BRFSSAnalysis",
    "user" : "postgres",
    "password" : password
}

Enter database password········


In [4]:
def connect(params_dic):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

In [5]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    SELECT * from question_info
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturally we get a list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # We just need to turn it into a pandas dataframe
    df = pd.DataFrame(tupples, columns=column_names)
    return df

In [6]:
# Connect to the database
conn = connect(param_dic)
column_names = ["id", "var_name", "label", "text"]
# Execute the "SELECT *" query to save question_info as a datafram
question_info_df = postgresql_to_dataframe(conn, "select * from question_info", column_names)
question_info_df.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,id,var_name,label,text
0,1,_STATE,State FIPS Code,State FIPS Code
1,2,FMONTH,File Month,File Month
2,3,IDATE,Interview Date,Interview Date
3,4,IMONTH,Interview Month,Interview Month
4,5,IDAY,Interview Day,Interview Day


In [7]:
# Connect to the database and repeat process for question_values
conn = connect(param_dic)
column_names = ["id", "question_id", "label", "value", "value_end"]
# Execute the "SELECT *" query to save question_info as a datafram
answer_info_df = postgresql_to_dataframe(conn, "select * from question_values", column_names)
answer_info_df.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,id,question_id,label,value,value_end
0,1,1,Alabama,1,
1,2,1,Alaska,2,
2,3,1,Arizona,4,
3,4,1,Arkansas,5,
4,5,1,California,6,


In [8]:
#determing which columns of user answers (aka relevant answers) to bring over
a = question_info_df['label'].values.tolist()
b = question_info_df['var_name'].values.tolist()

In [9]:
zip_iterator = zip(b, a)
a_dictionary = dict(zip_iterator)

In [10]:
for key, value in a_dictionary.items():
    print(key, ' : ', value)

_STATE  :  State FIPS Code
FMONTH  :  File Month
IDATE  :  Interview Date
IMONTH  :  Interview Month
IDAY  :  Interview Day
IYEAR  :  Interview Year
DISPCODE  :  Final Disposition
SEQNO  :  Annual Sequence Number
_PSU  :  Primary Sampling Unit
CTELENM1  :  Correct telephone number?
PVTRESD1  :  Private Residence?
COLGHOUS  :  Do you live in college housing?
STATERE1  :  Resident of State
CELPHONE  :  Cellular Telephone
LADULT1  :  Are you 18 years of age or older?
COLGSEX  :  Are you male or female?
NUMADULT  :  Number of Adults in Household
LANDSEX  :  Are you male or female?
NUMMEN  :  Number of Adult men in Household
NUMWOMEN  :  Number of Adult women in Household
RESPSLCT  :  Respondent selection
SAFETIME  :  Safe time to talk
CTELNUM1  :  Correct Phone Number?
CELLFON5  :  Is this a cell phone?
CADULT1  :  Are you 18 years of age or older?
CELLSEX  :  Are you male or female?
PVTRESD3  :  Do you live in a private residence?
CCLGHOUS  :  Do you live in college housing?
CSTATE1  :  D

In [398]:
#select columns to include from list above for health behaviour features predicting general, mental and physical health
columns = ["id", "GENHLTH", "PHYSHLTH", "MENTHLTH", "POORHLTH", "EXRACT11", "PAFREQ1_", "_MINAC11", "ACTIN12_", "EXRACT21", "PAFREQ2_", "_MINAC21", "ACTIN22_",  "STRFREQ_", "PA2MIN_", "_METSTAT", "_URBSTAT", "HTIN4", "WTKG3", "_BMI5", "_SMOKER3", "_DRNKWK1", "FC601_", "FTJUDA2_",  "GRENDA1_", "VEGEDA2_", "POTADA1_", "FRNCHDA_", "_FRUTSU1", "_FRUTSU1", "_VEGESU1", "_PAINDX2", "_PASTRNG"]                 

In [399]:
# Connect to the database and repeat process for user_answers: limit of 500
conn = connect(param_dic)
column_names = columns
# Execute the "SELECT *" query to save question_info as a datafram
health_behaviour_df = postgresql_to_dataframe(conn, f"select id, GENHLTH, PHYSHLTH, MENTHLTH, POORHLTH, EXRACT11, PAFREQ1_, _MINAC11, ACTIN12_, EXRACT21, PAFREQ2_, _MINAC21, ACTIN22_,  STRFREQ_, PA2MIN_, _METSTAT, _URBSTAT, HTIN4, WTKG3, _BMI5, _SMOKER3, _DRNKWK1, FC601_, FTJUDA2_,  GRENDA1_, VEGEDA2_, POTADA1_, FRNCHDA_, _FRUTSU1, _FRUTSU1, _VEGESU1,  _PAINDX2, _PASTRNG from user_answers limit 500", column_names)


Connecting to the PostgreSQL database...
Connection successful


In [400]:
health_behaviour_df.head()

Unnamed: 0,id,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,EXRACT11,PAFREQ1_,_MINAC11,ACTIN12_,EXRACT21,...,FTJUDA2_,GRENDA1_,VEGEDA2_,POTADA1_,FRNCHDA_,_FRUTSU1,_FRUTSU1.1,_VEGESU1,_PAINDX2,_PASTRNG
0,137083,3.0,5.0,30.0,5.0,14.0,233.0,56.0,2.0,38.0,...,7.0,3.0,100.0,43.0,50.0,50.0,50.0,196.0,1.0,1.0
1,137084,3.0,30.0,88.0,30.0,,,,,,...,0.0,43.0,100.0,57.0,29.0,100.0,100.0,229.0,2.0,2.0
2,137085,1.0,77.0,88.0,88.0,,,,,,...,0.0,0.0,100.0,0.0,43.0,100.0,100.0,143.0,9.0,2.0
3,137086,4.0,30.0,30.0,30.0,,,,,,...,43.0,29.0,57.0,0.0,57.0,86.0,86.0,143.0,2.0,1.0
4,137087,1.0,88.0,88.0,,64.0,,,1.0,88.0,...,2.0,0.0,0.0,14.0,100.0,31.0,31.0,114.0,9.0,2.0


In [401]:
#filling blank spaces with NaN
health_behaviour_df = health_behaviour_df.fillna(value=np.nan)

In [402]:
print(health_behaviour_df.isnull().sum())

id            0
GENHLTH       0
PHYSHLTH      0
MENTHLTH      0
POORHLTH    194
EXRACT11    140
PAFREQ1_    280
_MINAC11    278
ACTIN12_    144
EXRACT21    142
PAFREQ2_    280
_MINAC21    157
ACTIN22_    152
STRFREQ_     21
PA2MIN_     160
_METSTAT      0
_URBSTAT      0
HTIN4         8
WTKG3        32
_BMI5        39
_SMOKER3      0
_DRNKWK1      0
FC601_        0
FTJUDA2_     24
GRENDA1_     25
VEGEDA2_     30
POTADA1_     28
FRNCHDA_     26
_FRUTSU1     38
_FRUTSU1     38
_VEGESU1     40
_PAINDX2      0
_PASTRNG      0
dtype: int64


## TODO: go through features and correct poorly coded values (e.g. 9999 = no answer)
clean up DF for null values
clean up poorly encoded variables

### GENHLTH 
#### question: Would you say that in general your health is:

Answers currently coded as : 
   - 1: Excellent
   - 2: Very Good
   - 3: Good
   - 4: Fair
   - 5: Poor
   - 7: Don't know/Not Sure
   - 9: Refused
   - BLANK: Not asked or Missing
   
Recode to: 
   - Nan: Don't know/Not Sure
   - Nan: Refused
   - BLANK: Not asked or Missing 
   - 1: Poor
   - 2: Fair
   - 3: Good
   - 4: Very Good
   - 5: Excellent
   

In [420]:
#print out original gen health values for reference and check data type
health_behaviour_df['GENHLTH'].head(20)

0     3.0
1     3.0
2     1.0
3     4.0
4     1.0
5     3.0
6     2.0
7     1.0
8     1.0
9     2.0
10    2.0
11    4.0
12    4.0
13    3.0
14    3.0
15    3.0
16    3.0
17    4.0
18    3.0
19    3.0
Name: GENHLTH, dtype: object

In [403]:
#Make a copy of health_behaviour_df to perform value recoding on
recoded_health_behaviour_df = health_behaviour_df.copy()

In [356]:
#convert column to int
recoded_health_behaviour_df.GENHLTH = pd.to_numeric(recoded_health_behaviour_df.GENHLTH).astype(int)
recoded_health_behaviour_df['GENHLTH'].dtypes

dtype('int32')

In [357]:
#recode GENHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 1., "GENHLTH"] = 5
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 2., "GENHLTH"] = 4
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 3., "GENHLTH"] = 3
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 4., "GENHLTH"] = 2
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 5., "GENHLTH"] = 1
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 7., "GENHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 9., "GENHLTH"] = np.NaN

In [421]:
recoded_health_behaviour_df.GENHLTH = pd.to_numeric(recoded_health_behaviour_df.GENHLTH).astype('Int32')

In [422]:
#print to confirm values were recoded correctly
recoded_health_behaviour_df['GENHLTH'].head(20)

0     3
1     3
2     1
3     4
4     1
5     3
6     2
7     1
8     1
9     2
10    2
11    4
12    4
13    3
14    3
15    3
16    3
17    4
18    3
19    3
Name: GENHLTH, dtype: Int32

In [397]:
print(recoded_health_behaviour_df.isnull().sum())

id             0
GENHLTH        0
PHYSHLTH       0
MENTHLTH       0
POORHLTH     204
EXRACT11     164
PAFREQ1_     339
_MINAC11     336
ACTIN12_     169
EXRACT21     171
PAFREQ2_     339
_MINAC21     185
ACTIN22_     180
STRFREQ_      28
PA2MIN_      189
_METSTAT       0
_URBSTAT       0
HTIN4          5
WTKG3         17
_BMI5         21
_SMOKER3       0
_DRNKWK1       0
FC601_         0
FTJUDA2_      33
GRENDA1_      32
VEGEDA2_      39
POTADA1_      45
FRNCHDA_      37
_FRUTSU1      37
_FRUTSU1      37
_VEGESU1      55
 _PAINDX2      0
_PASTRNG       0
dtype: int64


### PA2MIN_
#### Question:  Minutes of total Physical Activity per week
Originally coded as: 
- 0-99999: Minutes of Activity per week
- BLANK: not asked or missing
    

In [418]:
#checking column values
recoded_health_behaviour_df['PA2MIN_'].head(20)

0      224.0
1        NaN
2        NaN
3        NaN
4        0.0
5        NaN
6      182.0
7        0.0
8      120.0
9     1890.0
10       NaN
11       0.0
12     180.0
13       NaN
14     252.0
15       0.0
16       0.0
17     588.0
18       NaN
19      64.0
Name: PA2MIN_, dtype: object

In [368]:
recoded_health_behaviour_df.PA2MIN_ = pd.to_numeric(recoded_health_behaviour_df.PA2MIN_).astype('Int32')
recoded_health_behaviour_df['PA2MIN_'].dtypes

Int32Dtype()

In [417]:
recoded_health_behaviour_df['PA2MIN_'].head(20)

0      224.0
1        NaN
2        NaN
3        NaN
4        0.0
5        NaN
6      182.0
7        0.0
8      120.0
9     1890.0
10       NaN
11       0.0
12     180.0
13       NaN
14     252.0
15       0.0
16       0.0
17     588.0
18       NaN
19      64.0
Name: PA2MIN_, dtype: object

### PHYSHLTH
#### Question:  Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
Answers originally coded as: 
- 1-30: Number of Days, numeric
- 88: None
- 77: Don't know/Not sure
- 99: Refused
- Blank: Not asked or missing

Recode to:

- 1-30: Number of Days, int
- 0: None
- Nan: Don't know/Not sure
- Nan: Refused
- Blank: Not asked or missing


In [375]:
#check column data type
recoded_health_behaviour_df['PHYSHLTH'].dtypes

dtype('O')

In [376]:
recoded_health_behaviour_df['PHYSHLTH'].dtypes

dtype('O')

In [416]:
#print column values for reference
recoded_health_behaviour_df['PHYSHLTH'].head(20)

0      5.0
1     30.0
2     77.0
3     30.0
4     88.0
5     88.0
6     88.0
7     88.0
8     88.0
9      3.0
10    88.0
11    88.0
12    88.0
13    88.0
14    88.0
15    88.0
16    88.0
17    15.0
18     7.0
19    88.0
Name: PHYSHLTH, dtype: object

In [378]:
# recode PHYSHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 88, "PHYSHLTH"] = 0
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 77, "PHYSHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 99, "PHYSHLTH"] = np.NaN

In [379]:
#convert column from object to int
recoded_health_behaviour_df.PHYSHLTH = pd.to_numeric(recoded_health_behaviour_df.PHYSHLTH).astype('Int32')

In [415]:
#reprint values to confirm they were correctly recoded
recoded_health_behaviour_df['PHYSHLTH'].head(20)

0      5.0
1     30.0
2     77.0
3     30.0
4     88.0
5     88.0
6     88.0
7     88.0
8     88.0
9      3.0
10    88.0
11    88.0
12    88.0
13    88.0
14    88.0
15    88.0
16    88.0
17    15.0
18     7.0
19    88.0
Name: PHYSHLTH, dtype: object

### MENTHLTH
#### Question:  Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
Answers originally coded as: 
- 1-30: Number of Days, numeric
- 88: None
- 77: Don't know/Not sure
- 99: Refused
- Blank: Not asked or missing

Recode to:

- 1-30: Number of Days, int
- 0: None
- Nan: Don't know/Not sure
- Nan: Refused
- Blank: Not asked or missing


In [382]:
#check column data type
recoded_health_behaviour_df['MENTHLTH'].dtypes

dtype('O')

In [414]:
#print column values for reference
recoded_health_behaviour_df['MENTHLTH'].head(20)

0     30.0
1     88.0
2     88.0
3     30.0
4     88.0
5      1.0
6     88.0
7     88.0
8      6.0
9      5.0
10    88.0
11    88.0
12     1.0
13    88.0
14    88.0
15     5.0
16     5.0
17    88.0
18    88.0
19     1.0
Name: MENTHLTH, dtype: object

In [384]:
# recode PHYSHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 88, "MENTHLTH"] = 0
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 77, "MENTHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 99, "MENTHLTH"] = np.NaN

In [386]:
#convert column from object to int
recoded_health_behaviour_df.MENTHLTH = pd.to_numeric(recoded_health_behaviour_df.MENTHLTH).astype('Int32')

In [387]:
recoded_health_behaviour_df['MENTHLTH'].dtypes

Int32Dtype()

In [413]:
#reprint values to confirm they were correctly recoded
recoded_health_behaviour_df['MENTHLTH'].head(20)

0     30.0
1     88.0
2     88.0
3     30.0
4     88.0
5      1.0
6     88.0
7     88.0
8      6.0
9      5.0
10    88.0
11    88.0
12     1.0
13    88.0
14    88.0
15     5.0
16     5.0
17    88.0
18    88.0
19     1.0
Name: MENTHLTH, dtype: object

### _PAINDX2
#### Question:  Physical Activity Index
Values originally coded as:
- 1: Meet Aerobic Recommendations
- 2: Did Not Meet Aerobic Recommendations
- 9: Don’t know/Not Sure/Refused/Missing

Recode to:
- 1: Meet Aerobic Recommendations
- 2: Did Not Meet Aerobic Recommendations
- Nan: Don’t know/Not Sure/Refused/Missing
    

In [412]:
recoded_health_behaviour_df['_PAINDX2'].head(20)

0        1
1        2
2     <NA>
3        2
4     <NA>
5        2
6        1
7     <NA>
8        2
9        1
10    <NA>
11    <NA>
12       1
13       2
14       1
15    <NA>
16    <NA>
17       1
18       2
19       2
Name: _PAINDX2, dtype: Int32

In [408]:
recoded_health_behaviour_df.loc[health_behaviour_df._PAINDX2 == 9, "_PAINDX2"] = np.NaN
recoded_health_behaviour_df._PAINDX2 = pd.to_numeric(recoded_health_behaviour_df._PAINDX2).astype('Int32')

In [411]:
recoded_health_behaviour_df['_PAINDX2'].head(20)

0        1
1        2
2     <NA>
3        2
4     <NA>
5        2
6        1
7     <NA>
8        2
9        1
10    <NA>
11    <NA>
12       1
13       2
14       1
15    <NA>
16    <NA>
17       1
18       2
19       2
Name: _PAINDX2, dtype: Int32

In [None]:
#TODO: make a dataframe that contains the demographic information of the sample and their health outcomes 
#Draw up summary of sample

In [None]:
#map health outcomes accross states

In [None]:
#create a large df with demographic and health behaviour info and create model that can predict health outcomes