In [1]:
#import dependencies 
import psycopg2
import pandas as pd
import numpy as np

In [2]:
from getpass import getpass
password = getpass("Enter database password")

param_dic = {
    "host" : "localhost",
    "database" : "BRFSSAnalysis",
    "user" : "postgres",
    "password" : password
}

Enter database password········


In [3]:
def connect(params_dic):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

In [4]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    SELECT * from question_info
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturally we get a list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # We just need to turn it into a pandas dataframe
    df = pd.DataFrame(tupples, columns=column_names)
    return df

In [5]:
# Connect to the database
conn = connect(param_dic)
column_names = ["id", "var_name", "label", "text"]
# Execute the "SELECT *" query to save question_info as a datafram
question_info_df = postgresql_to_dataframe(conn, "select * from question_info", column_names)
question_info_df.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,id,var_name,label,text
0,1,_STATE,State FIPS Code,State FIPS Code
1,2,FMONTH,File Month,File Month
2,3,IDATE,Interview Date,Interview Date
3,4,IMONTH,Interview Month,Interview Month
4,5,IDAY,Interview Day,Interview Day


In [6]:
# Connect to the database and repeat process for question_values
conn = connect(param_dic)
column_names = ["id", "question_id", "label", "value", "value_end"]
# Execute the "SELECT *" query to save question_info as a datafram
answer_info_df = postgresql_to_dataframe(conn, "select * from question_values", column_names)
answer_info_df.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,id,question_id,label,value,value_end
0,1,1,Alabama,1,
1,2,1,Alaska,2,
2,3,1,Arizona,4,
3,4,1,Arkansas,5,
4,5,1,California,6,


In [7]:
#determing which columns of user answers (aka relevant answers) to bring over
a = question_info_df['label'].values.tolist()
b = question_info_df['var_name'].values.tolist()

In [8]:
zip_iterator = zip(b, a)
a_dictionary = dict(zip_iterator)

In [9]:
for key, value in a_dictionary.items():
    print(key, ' : ', value)

_STATE  :  State FIPS Code
FMONTH  :  File Month
IDATE  :  Interview Date
IMONTH  :  Interview Month
IDAY  :  Interview Day
IYEAR  :  Interview Year
DISPCODE  :  Final Disposition
SEQNO  :  Annual Sequence Number
_PSU  :  Primary Sampling Unit
CTELENM1  :  Correct telephone number?
PVTRESD1  :  Private Residence?
COLGHOUS  :  Do you live in college housing?
STATERE1  :  Resident of State
CELPHONE  :  Cellular Telephone
LADULT1  :  Are you 18 years of age or older?
COLGSEX  :  Are you male or female?
NUMADULT  :  Number of Adults in Household
LANDSEX  :  Are you male or female?
NUMMEN  :  Number of Adult men in Household
NUMWOMEN  :  Number of Adult women in Household
RESPSLCT  :  Respondent selection
SAFETIME  :  Safe time to talk
CTELNUM1  :  Correct Phone Number?
CELLFON5  :  Is this a cell phone?
CADULT1  :  Are you 18 years of age or older?
CELLSEX  :  Are you male or female?
PVTRESD3  :  Do you live in a private residence?
CCLGHOUS  :  Do you live in college housing?
CSTATE1  :  D

In [10]:
#select columns to include from list above for health behaviour features predicting general, mental and physical health
columns = ["id", "_STATE", "GENHLTH", "PHYSHLTH", "MENTHLTH", "POORHLTH", "EXRACT11", "PAFREQ1_", "_MINAC11", "ACTIN12_", "EXRACT21", "PAFREQ2_", "_MINAC21", "ACTIN22_",  "STRFREQ_", "PA2MIN_", "_METSTAT", "_URBSTAT", "HTIN4", "WTKG3", "_RFBMI5", "_RFSMOK3", "_RFDRHV7", "FC601_", "FTJUDA2_",  "GRENDA1_", "VEGEDA2_", "POTADA1_", "FRNCHDA_", "_FRUTSU1", "_VEGESU1", "_PAINDX2", "_PASTRNG", "_AGE80"]                 

In [11]:
# Connect to the database and repeat process for user_answers: limit of 500
conn = connect(param_dic)
column_names = columns
# Execute the "SELECT *" query to save question_info as a datafram
health_behaviour_df = postgresql_to_dataframe(conn, f"select id, _STATE, GENHLTH, PHYSHLTH, MENTHLTH, POORHLTH, EXRACT11, PAFREQ1_, _MINAC11, ACTIN12_, EXRACT21, PAFREQ2_, _MINAC21, ACTIN22_,  STRFREQ_, PA2MIN_, _METSTAT, _URBSTAT, HTIN4, WTKG3, _RFBMI5, _RFSMOK3, _RFDRHV7, FC601_, FTJUDA2_,  GRENDA1_, VEGEDA2_, POTADA1_, FRNCHDA_, _FRUTSU1, _VEGESU1,  _PAINDX2, _PASTRNG, _AGE80 from user_answers", column_names)


Connecting to the PostgreSQL database...
Connection successful


In [12]:
health_behaviour_df.head()

Unnamed: 0,id,_STATE,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,EXRACT11,PAFREQ1_,_MINAC11,ACTIN12_,...,FTJUDA2_,GRENDA1_,VEGEDA2_,POTADA1_,FRNCHDA_,_FRUTSU1,_VEGESU1,_PAINDX2,_PASTRNG,_AGE80
0,31580,6.0,3.0,88.0,88.0,,64.0,7000.0,1260.0,1.0,...,,,,,,,,1.0,2.0,46.0
1,31581,6.0,4.0,2.0,5.0,88.0,18.0,467.0,28.0,2.0,...,43.0,43.0,29.0,3.0,14.0,114.0,89.0,2.0,2.0,57.0
2,31582,6.0,2.0,3.0,10.0,10.0,64.0,6000.0,210.0,1.0,...,0.0,33.0,100.0,13.0,33.0,10.0,179.0,1.0,2.0,58.0
3,31583,6.0,3.0,88.0,5.0,88.0,64.0,5000.0,200.0,1.0,...,17.0,10.0,71.0,3.0,29.0,46.0,113.0,1.0,2.0,35.0
4,31584,6.0,3.0,88.0,30.0,15.0,64.0,4000.0,120.0,1.0,...,57.0,3.0,43.0,14.0,29.0,100.0,89.0,1.0,1.0,23.0


In [13]:
#filling blank spaces with NaN
health_behaviour_df = health_behaviour_df.fillna(value=np.nan)

In [14]:
#Make a copy of health_behaviour_df to perform value recoding on
recoded_health_behaviour_df = health_behaviour_df.copy()

In [15]:
print(health_behaviour_df.isnull().sum())

id               0
_STATE           0
GENHLTH         26
PHYSHLTH        32
MENTHLTH        19
POORHLTH    188931
EXRACT11    130359
PAFREQ1_    140516
_MINAC11    146930
ACTIN12_    135951
EXRACT21    134658
PAFREQ2_    237530
_MINAC21    147892
ACTIN22_    143575
STRFREQ_     37082
PA2MIN_     140384
_METSTAT      8458
_URBSTAT      8458
HTIN4        20355
WTKG3        29748
_RFBMI5          0
_RFSMOK3         0
_RFDRHV7         0
FC601_           0
FTJUDA2_     38440
GRENDA1_     38902
VEGEDA2_     43369
POTADA1_     42224
FRNCHDA_     38866
_FRUTSU1     44600
_VEGESU1     53430
_PAINDX2         0
_PASTRNG         0
_AGE80           0
dtype: int64


## TODO: go through features and correct poorly coded values (e.g. 9999 = no answer)
clean up DF for null values
clean up poorly encoded variables

### GENHLTH 
#### question: Would you say that in general your health is:

Answers currently coded as : 
   - 1: Excellent
   - 2: Very Good
   - 3: Good
   - 4: Fair
   - 5: Poor
   - 7: Don't know/Not Sure
   - 9: Refused
   - BLANK: Not asked or Missing
   
Recode to: 
   - Nan: Don't know/Not Sure
   - Nan: Refused
   - BLANK: Not asked or Missing 
   - 1: Poor
   - 2: Fair
   - 3: Good
   - 4: Very Good
   - 5: Excellent
   

In [16]:
#print out original gen health values for reference and check data type
health_behaviour_df['GENHLTH'].head(20)

0     3.0
1     4.0
2     2.0
3     3.0
4     3.0
5     3.0
6     1.0
7     3.0
8     4.0
9     2.0
10    3.0
11    2.0
12    3.0
13    3.0
14    3.0
15    3.0
16    1.0
17    1.0
18    4.0
19    5.0
Name: GENHLTH, dtype: object

In [17]:
#recode GENHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 1., "GENHLTH"] = 5
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 2., "GENHLTH"] = 4
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 3., "GENHLTH"] = 3
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 4., "GENHLTH"] = 2
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 5., "GENHLTH"] = 1
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 7., "GENHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 9., "GENHLTH"] = np.NaN

In [18]:
recoded_health_behaviour_df.GENHLTH = pd.to_numeric(recoded_health_behaviour_df.GENHLTH).astype('Int32')

In [19]:
#print to confirm values were recoded correctly
recoded_health_behaviour_df['GENHLTH'].head(20)

0     3
1     2
2     4
3     3
4     3
5     3
6     5
7     3
8     2
9     4
10    3
11    4
12    3
13    3
14    3
15    3
16    5
17    5
18    2
19    1
Name: GENHLTH, dtype: Int32

In [20]:
print(recoded_health_behaviour_df.isnull().sum())

id               0
_STATE           0
GENHLTH       1047
PHYSHLTH        32
MENTHLTH        19
POORHLTH    188931
EXRACT11    130359
PAFREQ1_    140516
_MINAC11    146930
ACTIN12_    135951
EXRACT21    134658
PAFREQ2_    237530
_MINAC21    147892
ACTIN22_    143575
STRFREQ_     37082
PA2MIN_     140384
_METSTAT      8458
_URBSTAT      8458
HTIN4        20355
WTKG3        29748
_RFBMI5          0
_RFSMOK3         0
_RFDRHV7         0
FC601_           0
FTJUDA2_     38440
GRENDA1_     38902
VEGEDA2_     43369
POTADA1_     42224
FRNCHDA_     38866
_FRUTSU1     44600
_VEGESU1     53430
_PAINDX2         0
_PASTRNG         0
_AGE80           0
dtype: int64


### PA2MIN_
#### Question:  Minutes of total Physical Activity per week
Originally coded as: 
- 0-99999: Minutes of Activity per week
- BLANK: not asked or missing
    

In [21]:
#checking column values and check data type
recoded_health_behaviour_df['PA2MIN_'].head(20)

0     1260.0
1       56.0
2      210.0
3      200.0
4      360.0
5      330.0
6       68.0
7        NaN
8      270.0
9       90.0
10      60.0
11       NaN
12     140.0
13     120.0
14     420.0
15     120.0
16     840.0
17       NaN
18      45.0
19       NaN
Name: PA2MIN_, dtype: object

In [22]:
recoded_health_behaviour_df.PA2MIN_ = pd.to_numeric(recoded_health_behaviour_df.PA2MIN_).astype('Int32')
recoded_health_behaviour_df['PA2MIN_'].dtypes

Int32Dtype()

In [23]:
recoded_health_behaviour_df['PA2MIN_'].head(20)

0     1260
1       56
2      210
3      200
4      360
5      330
6       68
7     <NA>
8      270
9       90
10      60
11    <NA>
12     140
13     120
14     420
15     120
16     840
17    <NA>
18      45
19    <NA>
Name: PA2MIN_, dtype: Int32

In [24]:
recoded_health_behaviour_df['PA2MIN_'].unique()

<IntegerArray>
[1260,   56,  210,  200,  360,  330,   68, <NA>,  270,   90,
 ...
 5172, 8715, 7740, 6363, 3112, 4790, 2645, 5096, 5952, 2779]
Length: 3460, dtype: Int32

### PHYSHLTH
#### Question:  Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
Answers originally coded as: 
- 1-30: Number of Days, numeric
- 88: None
- 77: Don't know/Not sure
- 99: Refused
- Blank: Not asked or missing

Recode to:

- 1-30: Number of Days, int
- 0: None
- Nan: Don't know/Not sure
- Nan: Refused
- Blank: Not asked or missing


In [25]:
#print column values for reference and check data type
recoded_health_behaviour_df['PHYSHLTH'].head(20)

0     88.0
1      2.0
2      3.0
3     88.0
4     88.0
5     88.0
6     88.0
7      2.0
8     20.0
9     88.0
10     1.0
11    88.0
12    88.0
13    88.0
14    88.0
15    88.0
16    88.0
17    88.0
18     7.0
19    30.0
Name: PHYSHLTH, dtype: object

In [26]:
# recode PHYSHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 88, "PHYSHLTH"] = 0
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 77, "PHYSHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 99, "PHYSHLTH"] = np.NaN

In [27]:
#convert column from object to int
recoded_health_behaviour_df.PHYSHLTH = pd.to_numeric(recoded_health_behaviour_df.PHYSHLTH).astype('Int32')

In [28]:
#reprint values to confirm they were correctly recoded
recoded_health_behaviour_df['PHYSHLTH'].head(20)

0      0
1      2
2      3
3      0
4      0
5      0
6      0
7      2
8     20
9      0
10     1
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     7
19    30
Name: PHYSHLTH, dtype: Int32

### MENTHLTH
#### Question:  Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
Answers originally coded as: 
- 1-30: Number of Days, numeric
- 88: None
- 77: Don't know/Not sure
- 99: Refused
- Blank: Not asked or missing

Recode to:

- 1-30: Number of Days, int
- 0: None
- Nan: Don't know/Not sure
- Nan: Refused
- Blank: Not asked or missing


In [29]:
#print column values for reference and check data type
recoded_health_behaviour_df['MENTHLTH'].head(20)

0     88.0
1      5.0
2     10.0
3      5.0
4     30.0
5     88.0
6     88.0
7     88.0
8      5.0
9     29.0
10     1.0
11    88.0
12    88.0
13    21.0
14    88.0
15    88.0
16    88.0
17     3.0
18    10.0
19     5.0
Name: MENTHLTH, dtype: object

In [30]:
# recode PHYSHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 88, "MENTHLTH"] = 0
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 77, "MENTHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 99, "MENTHLTH"] = np.NaN

In [31]:
#convert column from object to int
recoded_health_behaviour_df.MENTHLTH = pd.to_numeric(recoded_health_behaviour_df.MENTHLTH).astype('Int32')

In [32]:
recoded_health_behaviour_df['MENTHLTH'].dtypes

Int32Dtype()

In [33]:
#reprint values to confirm they were correctly recoded
recoded_health_behaviour_df['MENTHLTH'].head(20)

0      0
1      5
2     10
3      5
4     30
5      0
6      0
7      0
8      5
9     29
10     1
11     0
12     0
13    21
14     0
15     0
16     0
17     3
18    10
19     5
Name: MENTHLTH, dtype: Int32

In [34]:
recoded_health_behaviour_df['MENTHLTH'].unique()

<IntegerArray>
[   0,    5,   10,   30,   29,    1,   21,    3,    4,    2,    7,   15,   20,
   12,   14,    6,   16, <NA>,   25,   28,    8,    9,   22,   18,   11,   27,
   17,   24,   19,   26,   23,   13]
Length: 32, dtype: Int32

### _PAINDX2
#### Question:  Physical Activity Index
Values originally coded as:
- 1: Meet Aerobic Recommendations
- 2: Did Not Meet Aerobic Recommendations
- 9: Don’t know/Not Sure/Refused/Missing

Recode to:
- 1: Meet Aerobic Recommendations
- 2: Did Not Meet Aerobic Recommendations
- Nan: Don’t know/Not Sure/Refused/Missing
    

In [35]:
recoded_health_behaviour_df['_PAINDX2'].head(20)

0     1.0
1     2.0
2     1.0
3     1.0
4     1.0
5     1.0
6     2.0
7     9.0
8     1.0
9     2.0
10    2.0
11    2.0
12    2.0
13    2.0
14    1.0
15    2.0
16    1.0
17    9.0
18    2.0
19    2.0
Name: _PAINDX2, dtype: object

In [36]:
# recode values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df._PAINDX2 == 9, "_PAINDX2"] = np.NaN
recoded_health_behaviour_df._PAINDX2 = pd.to_numeric(recoded_health_behaviour_df._PAINDX2).astype('Int32')

In [37]:
recoded_health_behaviour_df['_PAINDX2'].unique()

<IntegerArray>
[1, 2, <NA>]
Length: 3, dtype: Int32

### _PASTRNG
#### Question:  Muscle Strengthening Recommendation
Originally coded as:
- 1: Meet muscle strengthening recommendations
- 2: Did not meet muscle strengthening recommendations
- 9: Don’t know/Not Sure/Refused/Missing

Recode to:
- 1: Meet muscle strengthening recommendations
- 2: Did not meet muscle strengthening recommendations
- Nan: Don’t know/Not Sure/Refused/Missing

In [38]:
recoded_health_behaviour_df['_PASTRNG'].unique()

array([Decimal('2.0'), Decimal('1.0'), Decimal('9.0')], dtype=object)

In [39]:
# recode values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df._PASTRNG == 9, "_PASTRNG"] = np.NaN
recoded_health_behaviour_df._PASTRNG = pd.to_numeric(recoded_health_behaviour_df._PASTRNG).astype('Int32')

In [40]:
recoded_health_behaviour_df['_PASTRNG'].unique()

<IntegerArray>
[2, 1, <NA>]
Length: 3, dtype: Int32

### _FRUTSU1
#### Question:  Total fruits consumed per day
Originally coded as: 
- 0-99998: Number of Fruits consumed per day (two implied decimal places)
- BLANK: Not asked or Missing

Recode to:
- 0.00-999.98: Number of Fruits consumed per day
- Nan: Not asked or Missing

In [41]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0       NaN
1     114.0
2      10.0
3      46.0
4     100.0
5     329.0
6     114.0
7       NaN
8     100.0
9     203.0
10     86.0
11      NaN
12     43.0
13      NaN
14    200.0
15     57.0
16    214.0
17      NaN
18     45.0
19     13.0
Name: _FRUTSU1, dtype: object

In [42]:
#TODO: convert to int to remove decimal place, convert to string then interate through and insert decimal before last two digits, convert to float

In [43]:
recoded_health_behaviour_df['_FRUTSU1'] = recoded_health_behaviour_df['_FRUTSU1'].fillna(100000).astype(float)

In [44]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0     100000.0
1        114.0
2         10.0
3         46.0
4        100.0
5        329.0
6        114.0
7     100000.0
8        100.0
9        203.0
10        86.0
11    100000.0
12        43.0
13    100000.0
14       200.0
15        57.0
16       214.0
17    100000.0
18        45.0
19        13.0
Name: _FRUTSU1, dtype: float64

In [45]:
recoded_health_behaviour_df['_FRUTSU1'] = recoded_health_behaviour_df['_FRUTSU1'].div(100).round(2)

In [46]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0     1000.00
1        1.14
2        0.10
3        0.46
4        1.00
5        3.29
6        1.14
7     1000.00
8        1.00
9        2.03
10       0.86
11    1000.00
12       0.43
13    1000.00
14       2.00
15       0.57
16       2.14
17    1000.00
18       0.45
19       0.13
Name: _FRUTSU1, dtype: float64

In [47]:
recoded_health_behaviour_df['_FRUTSU1'] = recoded_health_behaviour_df['_FRUTSU1'].mask(np.isclose(recoded_health_behaviour_df['_FRUTSU1'].values, 1000.00))

In [48]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0      NaN
1     1.14
2     0.10
3     0.46
4     1.00
5     3.29
6     1.14
7      NaN
8     1.00
9     2.03
10    0.86
11     NaN
12    0.43
13     NaN
14    2.00
15    0.57
16    2.14
17     NaN
18    0.45
19    0.13
Name: _FRUTSU1, dtype: float64

###  _VEGESU1
#### Question:  Total vegetables consumed per day
Originally coded as: 
- 0-99998: Number of Vegetables consumed per day (two implied decimal places)
- BLANK: Not asked or Missing

Recode to:
- 0.00-999.98: Number of Vegetables consumed per day
- Nan: Not asked or Missing

In [49]:
recoded_health_behaviour_df['_VEGESU1'].head(20)

0       NaN
1      89.0
2     179.0
3     113.0
4      89.0
5      82.0
6     235.0
7       NaN
8      75.0
9     135.0
10    100.0
11     61.0
12    129.0
13      NaN
14    200.0
15      NaN
16    310.0
17      NaN
18     46.0
19    414.0
Name: _VEGESU1, dtype: object

In [50]:
recoded_health_behaviour_df['_VEGESU1'] = recoded_health_behaviour_df['_VEGESU1'].fillna(100000).astype(float)
recoded_health_behaviour_df['_VEGESU1'] = recoded_health_behaviour_df['_VEGESU1'].div(100).round(2)
recoded_health_behaviour_df['_VEGESU1'] = recoded_health_behaviour_df['_VEGESU1'].mask(np.isclose(recoded_health_behaviour_df['_VEGESU1'].values, 1000.00))
recoded_health_behaviour_df['_VEGESU1'].head(20)

0      NaN
1     0.89
2     1.79
3     1.13
4     0.89
5     0.82
6     2.35
7      NaN
8     0.75
9     1.35
10    1.00
11    0.61
12    1.29
13     NaN
14    2.00
15     NaN
16    3.10
17     NaN
18    0.46
19    4.14
Name: _VEGESU1, dtype: float64

In [51]:
#recode states to two letter codes so they can work with go.choropleth

recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 1, "_STATE"] = 'AL'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 2, "_STATE"] = 'AK'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 4, "_STATE"] = 'AZ'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 5, "_STATE"] = 'AR'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 6, "_STATE"] = 'CA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 8, "_STATE"] = 'CO'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 9, "_STATE"] = 'CT'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 10, "_STATE"] = 'DE'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 11, "_STATE"] = 'DC'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 12, "_STATE"] = 'FL'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 13, "_STATE"] = 'GA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 15, "_STATE"] = 'HI'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 16, "_STATE"] = 'ID'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 17, "_STATE"] = 'IL'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 18, "_STATE"] = 'IN'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 19, "_STATE"] = 'IA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 20, "_STATE"] = 'KS'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 21, "_STATE"] = 'KY'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 22, "_STATE"] = 'LA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 23, "_STATE"] = 'ME'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 24, "_STATE"] = 'MD'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 25, "_STATE"] = 'MA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 26, "_STATE"] = 'MI'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 27, "_STATE"] = 'MN'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 28, "_STATE"] = 'MS'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 29, "_STATE"] = 'MO'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 30, "_STATE"] = 'MT'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 31, "_STATE"] = 'NE'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 32, "_STATE"] = 'NV'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 33, "_STATE"] = 'NH'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 35, "_STATE"] = 'NM'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 36, "_STATE"] = 'NY'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 37, "_STATE"] = 'NC'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 38, "_STATE"] = 'ND'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 39, "_STATE"] = 'OH'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 40, "_STATE"] = 'OK'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 41, "_STATE"] = 'OR'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 42, "_STATE"] = 'PA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 44, "_STATE"] = 'RI'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 45, "_STATE"] = 'SC'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 46, "_STATE"] = 'SD'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 47, "_STATE"] = 'TN'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 48, "_STATE"] = 'TX'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 49, "_STATE"] = 'UT'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 50, "_STATE"] = 'VT'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 51, "_STATE"] = 'VA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 53, "_STATE"] = 'WA'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 54, "_STATE"] = 'WV'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 55, "_STATE"] = 'WI'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 56, "_STATE"] = 'WY'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 66, "_STATE"] = 'GU'
recoded_health_behaviour_df.loc[recoded_health_behaviour_df._STATE == 72, "_STATE"] = 'PR'

In [52]:
recoded_health_behaviour_df

Unnamed: 0,id,_STATE,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,EXRACT11,PAFREQ1_,_MINAC11,ACTIN12_,...,FTJUDA2_,GRENDA1_,VEGEDA2_,POTADA1_,FRNCHDA_,_FRUTSU1,_VEGESU1,_PAINDX2,_PASTRNG,_AGE80
0,31580,CA,3,0,0,,64.0,7000.0,1260.0,1.0,...,,,,,,,,1,2,46.0
1,31581,CA,2,2,5,88.0,18.0,467.0,28.0,2.0,...,43.0,43.0,29.0,3.0,14.0,1.14,0.89,2,2,57.0
2,31582,CA,4,3,10,10.0,64.0,6000.0,210.0,1.0,...,0.0,33.0,100.0,13.0,33.0,0.10,1.79,1,2,58.0
3,31583,CA,3,0,5,88.0,64.0,5000.0,200.0,1.0,...,17.0,10.0,71.0,3.0,29.0,0.46,1.13,1,2,35.0
4,31584,CA,3,0,30,15.0,64.0,4000.0,120.0,1.0,...,57.0,3.0,43.0,14.0,29.0,1.00,0.89,1,1,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418263,31575,CA,5,0,0,,,,,,...,100.0,,,,,4.00,,2,2,27.0
418264,31576,CA,1,30,30,88.0,64.0,933.0,56.0,1.0,...,0.0,100.0,43.0,3.0,0.0,0.29,1.46,2,2,66.0
418265,31577,CA,1,30,0,88.0,,,,,...,29.0,43.0,43.0,3.0,3.0,1.29,0.92,2,2,54.0
418266,31578,CA,3,0,0,,,,,,...,,,,,,,,2,1,56.0


## TODO: get mean values per state for chloropleth maps

In [53]:
state_veg_mean = recoded_health_behaviour_df.groupby('_STATE')['_VEGESU1'].mean().reset_index()

### State Codes
original code: 
- 1: Alabama
- 2	Alaska	
- 4	Arizona	
- 5	Arkansas	
- 6	California	
- 8	Colorado	
- 9	Connecticut	
- 10	Delaware	
- 11	District of Columbia	
- 12	Florida	
- 13	Georgia	
- 15	Hawaii	
- 16	Idaho	
- 17	Illinois	
- 18	Indiana	
- 19	Iowa	
- 20	Kansas	
- 21	Kentucky	
- 22	Louisiana	
- 23	Maine	
- 24	Maryland	
- 25	Massachusetts	
- 26	Michigan	
- 27	Minnesota	
- 28	Mississippi	
- 29	Missouri	
- 30	Montana	
- 31	Nebraska	
- 32	Nevada	
- 33	New Hampshire	
- 35	New Mexico	
- 36	New York	
- 37	North Carolina	
- 38	North Dakota	
- 39	Ohio	
- 40	Oklahoma	
- 41	Oregon	
- 42	Pennsylvania	
- 44	Rhode Island	
- 45	South Carolina	
- 46	South Dakota	
- 47	Tennessee	
- 48	Texas	
- 49	Utah	
- 50	Vermont	
- 51	Virginia	
- 53	Washington	
- 54	West Virginia	
- 55	Wisconsin	
- 56	Wyoming
- 66	Guam	
- 72	Puerto Rico
    

In [54]:
state_veg_mean.head()

Unnamed: 0,_STATE,_VEGESU1
0,AK,2.332528
1,AL,1.74413
2,AR,2.422322
3,AZ,2.650538
4,CA,1.825288


In [55]:
#recode states to two letter codes so they can work with go.choropleth

state_veg_mean.loc[state_veg_mean._STATE == 1, "_STATE"] = 'AL'
state_veg_mean.loc[state_veg_mean._STATE == 2, "_STATE"] = 'AK'
state_veg_mean.loc[state_veg_mean._STATE == 4, "_STATE"] = 'AZ'
state_veg_mean.loc[state_veg_mean._STATE == 5, "_STATE"] = 'AR'
state_veg_mean.loc[state_veg_mean._STATE == 6, "_STATE"] = 'CA'
state_veg_mean.loc[state_veg_mean._STATE == 8, "_STATE"] = 'CO'
state_veg_mean.loc[state_veg_mean._STATE == 9, "_STATE"] = 'CT'
state_veg_mean.loc[state_veg_mean._STATE == 10, "_STATE"] = 'DE'
state_veg_mean.loc[state_veg_mean._STATE == 11, "_STATE"] = 'DC'
state_veg_mean.loc[state_veg_mean._STATE == 12, "_STATE"] = 'FL'
state_veg_mean.loc[state_veg_mean._STATE == 13, "_STATE"] = 'GA'
state_veg_mean.loc[state_veg_mean._STATE == 15, "_STATE"] = 'HI'
state_veg_mean.loc[state_veg_mean._STATE == 16, "_STATE"] = 'ID'
state_veg_mean.loc[state_veg_mean._STATE == 17, "_STATE"] = 'IL'
state_veg_mean.loc[state_veg_mean._STATE == 18, "_STATE"] = 'IN'
state_veg_mean.loc[state_veg_mean._STATE == 19, "_STATE"] = 'IA'
state_veg_mean.loc[state_veg_mean._STATE == 20, "_STATE"] = 'KS'
state_veg_mean.loc[state_veg_mean._STATE == 21, "_STATE"] = 'KY'
state_veg_mean.loc[state_veg_mean._STATE == 22, "_STATE"] = 'LA'
state_veg_mean.loc[state_veg_mean._STATE == 23, "_STATE"] = 'ME'
state_veg_mean.loc[state_veg_mean._STATE == 24, "_STATE"] = 'MD'
state_veg_mean.loc[state_veg_mean._STATE == 25, "_STATE"] = 'MA'
state_veg_mean.loc[state_veg_mean._STATE == 26, "_STATE"] = 'MI'
state_veg_mean.loc[state_veg_mean._STATE == 27, "_STATE"] = 'MN'
state_veg_mean.loc[state_veg_mean._STATE == 28, "_STATE"] = 'MS'
state_veg_mean.loc[state_veg_mean._STATE == 29, "_STATE"] = 'MO'
state_veg_mean.loc[state_veg_mean._STATE == 30, "_STATE"] = 'MT'
state_veg_mean.loc[state_veg_mean._STATE == 31, "_STATE"] = 'NE'
state_veg_mean.loc[state_veg_mean._STATE == 32, "_STATE"] = 'NV'
state_veg_mean.loc[state_veg_mean._STATE == 33, "_STATE"] = 'NH'
state_veg_mean.loc[state_veg_mean._STATE == 35, "_STATE"] = 'NM'
state_veg_mean.loc[state_veg_mean._STATE == 36, "_STATE"] = 'NY'
state_veg_mean.loc[state_veg_mean._STATE == 37, "_STATE"] = 'NC'
state_veg_mean.loc[state_veg_mean._STATE == 38, "_STATE"] = 'ND'
state_veg_mean.loc[state_veg_mean._STATE == 39, "_STATE"] = 'OH'
state_veg_mean.loc[state_veg_mean._STATE == 40, "_STATE"] = 'OK'
state_veg_mean.loc[state_veg_mean._STATE == 41, "_STATE"] = 'OR'
state_veg_mean.loc[state_veg_mean._STATE == 42, "_STATE"] = 'PA'
state_veg_mean.loc[state_veg_mean._STATE == 44, "_STATE"] = 'RI'
state_veg_mean.loc[state_veg_mean._STATE == 45, "_STATE"] = 'SC'
state_veg_mean.loc[state_veg_mean._STATE == 46, "_STATE"] = 'SD'
state_veg_mean.loc[state_veg_mean._STATE == 47, "_STATE"] = 'TN'
state_veg_mean.loc[state_veg_mean._STATE == 48, "_STATE"] = 'TX'
state_veg_mean.loc[state_veg_mean._STATE == 49, "_STATE"] = 'UT'
state_veg_mean.loc[state_veg_mean._STATE == 50, "_STATE"] = 'VT'
state_veg_mean.loc[state_veg_mean._STATE == 51, "_STATE"] = 'VA'
state_veg_mean.loc[state_veg_mean._STATE == 53, "_STATE"] = 'WA'
state_veg_mean.loc[state_veg_mean._STATE == 54, "_STATE"] = 'WV'
state_veg_mean.loc[state_veg_mean._STATE == 55, "_STATE"] = 'WI'
state_veg_mean.loc[state_veg_mean._STATE == 56, "_STATE"] = 'WY'
state_veg_mean.loc[state_veg_mean._STATE == 66, "_STATE"] = 'GU'
state_veg_mean.loc[state_veg_mean._STATE == 72, "_STATE"] = 'PR'

In [56]:
state_veg_mean

Unnamed: 0,_STATE,_VEGESU1
0,AK,2.332528
1,AL,1.74413
2,AR,2.422322
3,AZ,2.650538
4,CA,1.825288
5,CO,1.876607
6,CT,2.916833
7,DC,2.085319
8,DE,1.86482
9,FL,1.984152


In [57]:
##TODO: test on larger dataset

### Health Behaviours Chloropleth maps by state

In [111]:
import plotly.graph_objects as go

behaviour_fig = go.Figure(data=go.Choropleth(
    locations=state_veg_mean['_STATE'],
    z=state_veg_mean['_VEGESU1'].astype(float),
    locationmode='USA-states',
    colorscale='tempo',
    autocolorscale=False,
    text=state_veg_mean['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    colorbar_title="Vegetables Eaten per Day"
))

behaviour_fig.update_layout(
    title_text='Average Number of Vegetables Eaten a Day<br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

behaviour_fig.show()

In [59]:
### Fruits per day by state

In [60]:
state_fruit_mean = recoded_health_behaviour_df.groupby('_STATE')['_FRUTSU1'].mean().reset_index()

In [61]:
state_fruit_mean.head(20)


Unnamed: 0,_STATE,_FRUTSU1
0,AK,1.540369
1,AL,1.232993
2,AR,1.569661
3,AZ,1.748288
4,CA,1.435239
5,CO,1.396317
6,CT,1.946547
7,DC,1.532979
8,DE,1.426221
9,FL,1.394397


In [62]:
fig = go.Figure(data=go.Choropleth(
    locations=state_fruit_mean['_STATE'],
    z=state_fruit_mean['_FRUTSU1'].astype(float),
    locationmode='USA-states',
    colorscale='purp',
    autocolorscale=False,
    text=state_fruit_mean['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    colorbar_title="Fruits Eaten per Day"
))

fig.update_layout(
    title_text='Average Number of Fruits Eaten a Day<br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

fig.show()

In [63]:
## 

In [64]:
### % of state that meets physical activity reccomendations
#TODO: group by state, find % of '1': meets reccomendations
state_act_mean_df = recoded_health_behaviour_df[['_STATE', '_PAINDX2']].dropna()
state_act_mean_df.head(20)

Unnamed: 0,_STATE,_PAINDX2
0,CA,1
1,CA,2
2,CA,1
3,CA,1
4,CA,1
5,CA,1
6,CA,2
8,CA,1
9,CA,2
10,CA,2


In [65]:
#group by _PAINDX2 and calculate sum of '1' within each state

def calc_perc(x): 
    return 100 * x / float(x.sum())

state_act_mean = state_act_mean_df.groupby(['_STATE', '_PAINDX2' ], as_index=False).agg({'_PAINDX2': 'sum'})
act_perc_df = state_act_mean.groupby('_STATE').apply(calc_perc)
#add back in _STATE column fore reference
act_perc_df['_STATE'] = state_act_mean['_STATE']
state_act_mean.head()

Unnamed: 0,_STATE,_PAINDX2
0,AK,1642
1,AK,2078
2,AL,2885
3,AL,6862
4,AR,2277


In [66]:
act_perc_df.head(20)

Unnamed: 0,_PAINDX2,_STATE
0,44.139785,AK
1,55.860215,AK
2,29.598851,AL
3,70.401149,AL
4,31.196054,AR
5,68.803946,AR
6,40.685358,AZ
7,59.314642,AZ
8,38.18882,CA
9,61.81118,CA


In [67]:
#Groupy organized states by _PAINDX2 1 sum then 2 sum so remove even numbers dataframe rows to keep only value representing percentage of state that did meet reccomended physical activity
pass_state_phys_act = act_perc_df[np.arange(len(act_perc_df)) % 2 == 0]
pass_state_phys_act.head(20)


Unnamed: 0,_PAINDX2,_STATE
0,44.139785,AK
2,29.598851,AL
4,31.196054,AR
6,40.685358,AZ
8,38.18882,CA
10,43.527172,CO
12,40.344201,CT
14,40.514776,DC
16,34.872798,DE
18,36.988839,FL


In [68]:
fig = go.Figure(data=go.Choropleth(
    locations=pass_state_phys_act['_STATE'],
    z=pass_state_phys_act['_PAINDX2'].astype(float),
    locationmode='USA-states',
    colorscale='burg',
    autocolorscale=False,
    text=state_fruit_mean['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    colorbar_title="% Meeting Recommended Activity"
))

fig.update_layout(
    title_text='Percentage of State that Met Recommended Amount of Physical Activity in 2019<br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

fig.show()

## TODO: 
_RFDRHV7:
Question:  Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week)

Coded as:
- 1: No
- 2: Yes
- 9: Don't know

Recode to:
- 1: No
- 2: Yes
- Nan: Don't know


In [69]:
#recode to new scheme
recoded_health_behaviour_df.loc[health_behaviour_df._RFDRHV7 == 9, "_RFDRHV7"] = np.NaN
recoded_health_behaviour_df['_RFDRHV7'].unique()

array([Decimal('1.0'), Decimal('2.0'), nan], dtype=object)

In [70]:
### % of state that is heavy drinkers
#TODO: group by state, find % of '2': heavy drinker
state_drink_heavy_df = recoded_health_behaviour_df[['_STATE', '_RFDRHV7']].dropna()
state_drink_heavy_df._RFDRHV7 = pd.to_numeric(state_drink_heavy_df._RFDRHV7).astype('Int32')
state_drink_heavy_df.head(20)

Unnamed: 0,_STATE,_RFDRHV7
0,CA,1
1,CA,1
2,CA,1
3,CA,2
4,CA,1
5,CA,1
6,CA,1
8,CA,1
9,CA,1
10,CA,1


In [71]:
#group by _RFDRHV7 and calculate sum of '2' within each state

def calc_perc(x): 
    return 100 * x / float(x.sum())

state_drink = state_drink_heavy_df.groupby(['_STATE', '_RFDRHV7' ], as_index=False).agg({'_RFDRHV7': 'sum'})
drink_perc_df = state_drink.groupby('_STATE').apply(calc_perc)
#add back in _STATE column fore reference
drink_perc_df['_STATE'] = state_drink['_STATE']
drink_perc_df.head()

Unnamed: 0,_RFDRHV7,_STATE
0,83.685601,AK
1,16.314399,AK
2,90.583804,AL
3,9.416196,AL
4,91.579348,AR


In [72]:
#Groupy organized states by _RFDRHV7 1 sum then 2 sum so remove odd numbers dataframe rows to keep only value representing percentage of state that did meet reccomended physical activity
drink_perc_df = drink_perc_df[np.arange(len(drink_perc_df)) % 2 != 0]
drink_perc_df['_RFDRHV7'].head(20)

1     16.314399
3      9.416196
5      8.420652
7     11.080711
9     11.477863
11    11.503643
13    11.354294
15    13.768404
17    11.552535
19    13.031058
21    10.236109
23    12.940213
25    15.982884
27    11.350254
29    11.539875
31    12.788632
33     9.467593
35    10.241667
37     8.886108
39    13.744076
Name: _RFDRHV7, dtype: Float64

In [73]:
fig = go.Figure(data=go.Choropleth(
    locations=drink_perc_df['_STATE'],
    z=drink_perc_df['_RFDRHV7'].astype(float),
    locationmode='USA-states',
    colorscale='blues',
    autocolorscale=False,
    text=state_fruit_mean['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    colorbar_title="% of Heavy Drinkers"
))

fig.update_layout(
    title_text='Percentage of State Considered Heavy Drinkers<br>adult men having more than 14 drinks per week and adult women having more than 7 drinks per week<br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

fig.show()

## _RFBMI5
Question:  Adults who have a body mass index greater than 25.00 (Overweight or Obese)
Coded as:
- 1: No
- 2: Yes
- 9: Don't Know

Recoded as:
- 1: No
- 2: Yes
- Nan: Don't Know

In [74]:
#recode to new scheme
recoded_health_behaviour_df.loc[health_behaviour_df._RFBMI5 == 9, "_RFBMI5"] = np.NaN
recoded_health_behaviour_df['_RFBMI5'].unique()

array([Decimal('2.0'), Decimal('1.0'), nan], dtype=object)

In [75]:
### % of state that is overwheight or obese
#TODO: group by state, find % of '2': heavy drinker
state_heavy_df = recoded_health_behaviour_df[['_STATE', '_RFBMI5']].dropna()
state_heavy_df._RFBMI5 = pd.to_numeric(state_heavy_df._RFBMI5).astype('Int32')
state_heavy_df.head(20)

Unnamed: 0,_STATE,_RFBMI5
0,CA,2
1,CA,2
2,CA,2
3,CA,2
4,CA,2
5,CA,2
6,CA,1
8,CA,2
9,CA,1
10,CA,2


In [76]:
#group by _RFBMI5 and calculate sum of '2' within each state

def calc_perc(x): 
    return 100 * x / float(x.sum())

state_bmi = state_heavy_df.groupby(['_STATE', '_RFBMI5' ], as_index=False).agg({'_RFBMI5': 'sum'})
bmi_perc_df = state_bmi.groupby('_STATE').apply(calc_perc)
#add back in _STATE column for reference
bmi_perc_df['_STATE'] = state_bmi['_STATE']
bmi_perc_df.head(30)

Unnamed: 0,_RFBMI5,_STATE
0,19.824369,AK
1,80.175631,AK
2,16.926663,AL
3,83.073337,AL
4,17.99113,AR
5,82.00887,AR
6,20.568734,AZ
7,79.431266,AZ
8,21.855682,CA
9,78.144318,CA


In [77]:
bmi_perc_df = bmi_perc_df[np.arange(len(bmi_perc_df)) % 2 != 0]
bmi_perc_df.head(20)

Unnamed: 0,_RFBMI5,_STATE
1,80.175631,AK
3,83.073337,AL
5,82.00887,AR
7,79.431266,AZ
9,78.144318,CA
11,75.365871,CO
13,79.457053,CT
15,75.975897,DC
17,82.825151,DE
19,80.06358,FL


In [126]:

fig = go.Figure(data=go.Choropleth(
    locations=bmi_perc_df['_STATE'],
    z=bmi_perc_df['_RFBMI5'].astype(float),
    locationmode='USA-states',
    colorscale='tempo',
    autocolorscale=False,
    text=state_fruit_mean['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    colorbar_title="% of State Overweight or Obese"
))

fig.update_layout(
    title_text='Percentage of State Considered Overweight or Obese <br> Adults who have a body mass index greater than 25.00 <br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

fig.show()

In [163]:
##Make health behaviours df with columns for each
#Including, daily veg, daily fruit, daily fries, smoking, smoking, alcohol, phys activity, strength
hb_df = state_veg_mean.copy() #Daily Veg
hb_df['_FRUTSU1'] = state_fruit_mean['_FRUTSU1'].values # Daily Fruits
#daily fries
hb_df['_RFDRHV7'] = drink_perc_df['_RFDRHV7'].values #Alcohol
#smoking
hb_df['_PAINDX2'] = pass_state_phys_act['_PAINDX2'].values #Aerobic
#Strength

hb_df.head(20)

Unnamed: 0,_STATE,_VEGESU1,_FRUTSU1,_RFDRHV7,_PAINDX2
0,AK,2.332528,1.540369,16.314399,44.139785
1,AL,1.74413,1.232993,9.416196,29.598851
2,AR,2.422322,1.569661,8.420652,31.196054
3,AZ,2.650538,1.748288,11.080711,40.685358
4,CA,1.825288,1.435239,11.477863,38.18882
5,CO,1.876607,1.396317,11.503643,43.527172
6,CT,2.916833,1.946547,11.354294,40.344201
7,DC,2.085319,1.532979,13.768404,40.514776
8,DE,1.86482,1.426221,11.552535,34.872798
9,FL,1.984152,1.394397,13.031058,36.988839


In [164]:
behaviour_fig = go.Figure(data=go.Choropleth(
    locations=hb_df['_STATE'],
    z=hb_df['_VEGESU1'].astype(float),
    locationmode='USA-states',
    colorscale='tempo',
    autocolorscale=False,
    text=hb_df['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    #colorbar_title="Vegetables Eaten per Day"
))

behaviour_fig.update_layout(
    title_text='Average Number of Vegetables Eaten a Day<br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

behaviour_fig.show()

In [165]:
##HEALTH BEHAVIOUR DROP DOWN CHOROPLETH
#Including, daily veg, daily fruit, daily fries, alcohol, smoking, phys activity, strength
button1 = dict(method= 'update',
               label='Daily Veg',
               args=[
                    {"locations":[hb_df['_STATE']],
                     "z": [hb_df['_VEGESU1']],
                    "text":[hb_df['_STATE']], # hover text
                    "marker_line_color":['white'], # line markers between states
                    "colorbar_title": ["Vegetables per Day"]
                                    }, {'title.text' : 'Average Number of Vegetables Eaten per Day'}
                                 ])

button2 = dict(method= 'update',
               label='Daily Fruit',
               args=[
                    {"locations":[hb_df['_STATE']],
                     "z": [hb_df['_FRUTSU1']],
                    "text":[hb_df['_STATE']], # hover text
                    "marker_line_color":['white'], # line markers between states
                    "colorbar_title": ["Fruits per Day"]
                                    },  {'title.text' : 'Average Number of Fruits Eaten per Day'}
                                 ])
#button3 = dict(method= 'update',
#               label='Daily Fries',
#               args=[
#                    {"locations":[hb_df['_STATE']],
#                     "z": [hb_df['??']],
#                    "text":[hb_df['_STATE']], # hover text
#                    "marker_line_color":['white'], # line markers between states
#                    "colorbar_title": ["Servings of Fries per Day"]
#                                    },  {'title.text' : 'Average Number of Times French Fries are Eaten per Day'}
#                                 ])

button4 = dict(method= 'update',
               label='Alcohol',
               args=[
                    {"locations":[hb_df['_STATE']],
                     "z": [hb_df['_RFDRHV7']],
                    "text":[hb_df['_STATE']], # hover text
                    "marker_line_color":['white'], # line markers between states
                    "colorbar_title": ["% of Heavy Drinkers"]
                                    },  {'title.text' : 'Percentage of State Population Considered Heavy Drinkers<br>adult men having more than 14 drinks per week and adult women having more than 7 drinks per week<br>(Hover for breakdown)'}
                                 ])

#button5 = dict(method= 'update',
#               label='Smoking',
#               args=[
#                    {"locations":[hb_df['_STATE']],
#                     "z": [hb_df['??']],
#                    "text":[hb_df['_STATE']], # hover text
#                    "marker_line_color":['white'], # line markers between states
#                    "colorbar_title": ["% of Current Smokers"]
#                                   },  {'title.text' : 'Percentage of State Population Currently Smoking Cigarettes<br>adult men having more than 14 drinks per week and adult women having more than 7 drinks per week<br>(Hover for breakdown)'}
#                                 ])

button6 = dict(method= 'update',
               label='Aerobic Exc',
               args=[
                    {"locations":[hb_df['_STATE']],
                     "z": [hb_df['_PAINDX2']],
                    "text":[hb_df['_STATE']], # hover text
                    "marker_line_color":['white'], # line markers between states
                    "colorbar_title": ["% Meeting Recommendations"]
                                    },  {'title.text' : 'Percentage of State Population Meeting Weeekly Aerobic Exercise Recommendations<br>(Hover for breakdown)'}
                                 ])

#button7 = dict(method= 'update',
#               label='Strength Exc',
#               args=[
#                    {"locations":[hb_df['_STATE']],
#                     "z": [hb_df['??']],
#                    "text":[hb_df['_STATE']], # hover text
#                    "marker_line_color":['white'], # line markers between states
#                    "colorbar_title": ["% Meeting Recommendations"]
#                                    },  {'title.text' : 'Percentage of State Population Meeting Weeekly Strength Exercise Recommendationsk<br>(Hover for breakdown)'}
#                                 ])


In [166]:
behaviour_fig.update_layout(updatemenus=[dict(active=0,
                                    buttons= [button1, button2, button4, button6])]
                                     );
behaviour_fig.show()

In [None]:
import chart_studio.plotly as py
import chart_studio
chart_studio.tools.set_config_file(world_readable=False,
                             sharing='private')
py.iplot(fig, filename='update-choroplethmapbox')

## _RFSMOK3
Question:  Adults who are current smokers
Coded as:
- 1: No
- 2: Yes
- 9: Don't Know

Recoded as:
- 1: No
- 2: Yes
- Nan: Don't Know

In [None]:
#recode to new scheme
recoded_health_behaviour_df.loc[health_behaviour_df._RFSMOK3 == 9, "_RFSMOK3"] = np.NaN
recoded_health_behaviour_df['_RFSMOK3'].unique()

In [None]:
### % of state that is overwheight or obese
#TODO: group by state, find % of '2': heavy drinker
state_smoke_df = recoded_health_behaviour_df[['_STATE', '_RFSMOK3']].dropna()
state_smoke_df._RFSMOK3 = pd.to_numeric(state_smoke_df._RFSMOK3).astype('Int32')
state_smoke_df.head(20)

In [None]:
#group by _RFBMI5 and calculate sum of '2' within each state

def calc_perc(x): 
    return 100 * x / float(x.sum())

state_smoke = state_smoke_df.groupby(['_STATE', '_RFSMOK3' ], as_index=False).agg({'_RFSMOK3': 'sum'})
smoke_perc_df = state_smoke.groupby('_STATE').apply(calc_perc)
#add back in _STATE column for reference
smoke_perc_df['_STATE'] = state_smoke['_STATE']
smoke_perc_df.head(30)

In [None]:
smoke_perc_df = smoke_perc_df[np.arange(len(smoke_perc_df)) % 2 != 0]
smoke_perc_df.head(20)

In [None]:
fig = go.Figure(data=go.Choropleth(
    locations=smoke_perc_df['_STATE'],
    z=smoke_perc_df['_RFSMOK3'].astype(float),
    locationmode='USA-states',
    colorscale='dense',
    autocolorscale=False,
    text=state_fruit_mean['_STATE'], # hover text
    marker_line_color='white', # line markers between states
    colorbar_title="% of State Smoking in 2019"
))

fig.update_layout(
    title_text='Percentage of State Who Are Current Smokers <br>(Hover for breakdown)',
    geo = dict(
        scope='usa',
        projection=go.layout.geo.Projection(type = 'albers usa'),
        showlakes=True, # lakes
        lakecolor='rgb(255, 255, 255)'),
)

fig.show()

In [None]:
import plotly.express as px

In [None]:
## percieved health across age
df = recoded_health_behaviour_df[['_AGE80', 'PA2MIN_']].copy()
df.head()

In [None]:
df = df.dropna(axis=0, how = 'any')

In [None]:
##TODO: remove outliers of PA2MIN_
df = df[((df.PA2MIN_ - df.PA2MIN_.mean()) / df.PA2MIN_.std()).abs() < 3]

In [None]:
fig = px.scatter(df, x='_AGE80', y='PA2MIN_')
fig.show()

In [None]:
#TODO: this is no good obviously^
#new plan: make a stacked bar chart for age groupts with _PACAT2 showing % highly active, active, insufficient and inactive

In [None]:
#TODO: make a dataframe that contains the demographic information of the sample and their health outcomes 
#Draw up summary of sample

In [None]:
#map health outcomes accross states

In [None]:
#create a large df with demographic and health behaviour info and create model that can predict health outcomes