In [1]:
#import dependencies 
import psycopg2
import pandas as pd
import numpy as np

In [2]:
from getpass import getpass
password = getpass("Enter database password")

param_dic = {
    "host" : "localhost",
    "database" : "BRFSSAnalysis",
    "user" : "postgres",
    "password" : password
}

Enter database password········


In [3]:
def connect(params_dic):
    """
    Tranform a SELECT query into a pandas dataframe
    """
    conn = None
    try:
        # connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1) 
    print("Connection successful")
    return conn

In [4]:
def postgresql_to_dataframe(conn, select_query, column_names):
    """
    SELECT * from question_info
    """
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        cursor.close()
        return 1
    
    # Naturally we get a list of tupples
    tupples = cursor.fetchall()
    cursor.close()
    
    # We just need to turn it into a pandas dataframe
    df = pd.DataFrame(tupples, columns=column_names)
    return df

In [5]:
# Connect to the database
conn = connect(param_dic)
column_names = ["id", "var_name", "label", "text"]
# Execute the "SELECT *" query to save question_info as a datafram
question_info_df = postgresql_to_dataframe(conn, "select * from question_info", column_names)
question_info_df.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,id,var_name,label,text
0,1,_STATE,State FIPS Code,State FIPS Code
1,2,FMONTH,File Month,File Month
2,3,IDATE,Interview Date,Interview Date
3,4,IMONTH,Interview Month,Interview Month
4,5,IDAY,Interview Day,Interview Day


In [6]:
# Connect to the database and repeat process for question_values
conn = connect(param_dic)
column_names = ["id", "question_id", "label", "value", "value_end"]
# Execute the "SELECT *" query to save question_info as a datafram
answer_info_df = postgresql_to_dataframe(conn, "select * from question_values", column_names)
answer_info_df.head()

Connecting to the PostgreSQL database...
Connection successful


Unnamed: 0,id,question_id,label,value,value_end
0,1,1,Alabama,1,
1,2,1,Alaska,2,
2,3,1,Arizona,4,
3,4,1,Arkansas,5,
4,5,1,California,6,


In [7]:
#determing which columns of user answers (aka relevant answers) to bring over
a = question_info_df['label'].values.tolist()
b = question_info_df['var_name'].values.tolist()

In [8]:
zip_iterator = zip(b, a)
a_dictionary = dict(zip_iterator)

In [9]:
for key, value in a_dictionary.items():
    print(key, ' : ', value)

_STATE  :  State FIPS Code
FMONTH  :  File Month
IDATE  :  Interview Date
IMONTH  :  Interview Month
IDAY  :  Interview Day
IYEAR  :  Interview Year
DISPCODE  :  Final Disposition
SEQNO  :  Annual Sequence Number
_PSU  :  Primary Sampling Unit
CTELENM1  :  Correct telephone number?
PVTRESD1  :  Private Residence?
COLGHOUS  :  Do you live in college housing?
STATERE1  :  Resident of State
CELPHONE  :  Cellular Telephone
LADULT1  :  Are you 18 years of age or older?
COLGSEX  :  Are you male or female?
NUMADULT  :  Number of Adults in Household
LANDSEX  :  Are you male or female?
NUMMEN  :  Number of Adult men in Household
NUMWOMEN  :  Number of Adult women in Household
RESPSLCT  :  Respondent selection
SAFETIME  :  Safe time to talk
CTELNUM1  :  Correct Phone Number?
CELLFON5  :  Is this a cell phone?
CADULT1  :  Are you 18 years of age or older?
CELLSEX  :  Are you male or female?
PVTRESD3  :  Do you live in a private residence?
CCLGHOUS  :  Do you live in college housing?
CSTATE1  :  D

In [60]:
#select columns to include from list above for health behaviour features predicting general, mental and physical health
columns = ["id", "_STATE", "GENHLTH", "PHYSHLTH", "MENTHLTH", "POORHLTH", "EXRACT11", "PAFREQ1_", "_MINAC11", "ACTIN12_", "EXRACT21", "PAFREQ2_", "_MINAC21", "ACTIN22_",  "STRFREQ_", "PA2MIN_", "_METSTAT", "_URBSTAT", "HTIN4", "WTKG3", "_BMI5", "_SMOKER3", "_DRNKWK1", "FC601_", "FTJUDA2_",  "GRENDA1_", "VEGEDA2_", "POTADA1_", "FRNCHDA_", "_FRUTSU1", "_VEGESU1", "_PAINDX2", "_PASTRNG", "_AGE80"]                 

In [61]:
# Connect to the database and repeat process for user_answers: limit of 500
conn = connect(param_dic)
column_names = columns
# Execute the "SELECT *" query to save question_info as a datafram
health_behaviour_df = postgresql_to_dataframe(conn, f"select id, _STATE, GENHLTH, PHYSHLTH, MENTHLTH, POORHLTH, EXRACT11, PAFREQ1_, _MINAC11, ACTIN12_, EXRACT21, PAFREQ2_, _MINAC21, ACTIN22_,  STRFREQ_, PA2MIN_, _METSTAT, _URBSTAT, HTIN4, WTKG3, _BMI5, _SMOKER3, _DRNKWK1, FC601_, FTJUDA2_,  GRENDA1_, VEGEDA2_, POTADA1_, FRNCHDA_, _FRUTSU1, _VEGESU1,  _PAINDX2, _PASTRNG, _AGE80 from user_answers limit 500", column_names)


Connecting to the PostgreSQL database...
Connection successful


In [62]:
health_behaviour_df.head()

Unnamed: 0,id,_STATE,GENHLTH,PHYSHLTH,MENTHLTH,POORHLTH,EXRACT11,PAFREQ1_,_MINAC11,ACTIN12_,...,FTJUDA2_,GRENDA1_,VEGEDA2_,POTADA1_,FRNCHDA_,_FRUTSU1,_VEGESU1,_PAINDX2,_PASTRNG,_AGE80
0,139027,21.0,2.0,88.0,88.0,,69.0,1000.0,0.0,0.0,...,0.0,29.0,71.0,14.0,14.0,29.0,128.0,2.0,1.0,47.0
1,139028,21.0,1.0,88.0,88.0,,64.0,,,1.0,...,14.0,0.0,29.0,29.0,3.0,71.0,61.0,9.0,2.0,30.0
2,139029,21.0,3.0,88.0,15.0,3.0,,,,,...,17.0,0.0,33.0,7.0,0.0,17.0,40.0,2.0,2.0,42.0
3,139030,21.0,4.0,88.0,7.0,77.0,,,,,...,3.0,3.0,3.0,0.0,0.0,5.0,6.0,2.0,2.0,25.0
4,139031,21.0,3.0,88.0,88.0,,31.0,2000.0,60.0,1.0,...,0.0,14.0,43.0,7.0,7.0,43.0,71.0,2.0,2.0,41.0


In [63]:
#filling blank spaces with NaN
health_behaviour_df = health_behaviour_df.fillna(value=np.nan)

In [64]:
#Make a copy of health_behaviour_df to perform value recoding on
recoded_health_behaviour_df = health_behaviour_df.copy()

In [65]:
print(health_behaviour_df.isnull().sum())

id            0
_STATE        0
GENHLTH       0
PHYSHLTH      0
MENTHLTH      0
POORHLTH    202
EXRACT11    178
PAFREQ1_    326
_MINAC11    326
ACTIN12_    183
EXRACT21    183
PAFREQ2_    326
_MINAC21    196
ACTIN22_    197
STRFREQ_     22
PA2MIN_     200
_METSTAT      0
_URBSTAT      0
HTIN4         5
WTKG3        21
_BMI5        24
_SMOKER3      0
_DRNKWK1      0
FC601_        0
FTJUDA2_     26
GRENDA1_     29
VEGEDA2_     28
POTADA1_     30
FRNCHDA_     24
_FRUTSU1     34
_VEGESU1     42
_PAINDX2      0
_PASTRNG      0
_AGE80        0
dtype: int64


## TODO: go through features and correct poorly coded values (e.g. 9999 = no answer)
clean up DF for null values
clean up poorly encoded variables

### GENHLTH 
#### question: Would you say that in general your health is:

Answers currently coded as : 
   - 1: Excellent
   - 2: Very Good
   - 3: Good
   - 4: Fair
   - 5: Poor
   - 7: Don't know/Not Sure
   - 9: Refused
   - BLANK: Not asked or Missing
   
Recode to: 
   - Nan: Don't know/Not Sure
   - Nan: Refused
   - BLANK: Not asked or Missing 
   - 1: Poor
   - 2: Fair
   - 3: Good
   - 4: Very Good
   - 5: Excellent
   

In [66]:
#print out original gen health values for reference and check data type
health_behaviour_df['GENHLTH'].head(20)

0     2.0
1     1.0
2     3.0
3     4.0
4     3.0
5     1.0
6     3.0
7     4.0
8     3.0
9     4.0
10    4.0
11    2.0
12    1.0
13    2.0
14    2.0
15    3.0
16    3.0
17    3.0
18    2.0
19    1.0
Name: GENHLTH, dtype: object

In [67]:
#convert column to int
recoded_health_behaviour_df.GENHLTH = pd.to_numeric(recoded_health_behaviour_df.GENHLTH).astype(int)
recoded_health_behaviour_df['GENHLTH'].dtypes

dtype('int32')

In [68]:
#recode GENHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 1., "GENHLTH"] = 5
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 2., "GENHLTH"] = 4
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 3., "GENHLTH"] = 3
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 4., "GENHLTH"] = 2
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 5., "GENHLTH"] = 1
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 7., "GENHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.GENHLTH == 9., "GENHLTH"] = np.NaN

In [69]:
recoded_health_behaviour_df.GENHLTH = pd.to_numeric(recoded_health_behaviour_df.GENHLTH).astype('Int32')

In [70]:
#print to confirm values were recoded correctly
recoded_health_behaviour_df['GENHLTH'].head(20)

0     4
1     5
2     3
3     2
4     3
5     5
6     3
7     2
8     3
9     2
10    2
11    4
12    5
13    4
14    4
15    3
16    3
17    3
18    4
19    5
Name: GENHLTH, dtype: Int32

In [71]:
print(recoded_health_behaviour_df.isnull().sum())

id            0
_STATE        0
GENHLTH       2
PHYSHLTH      0
MENTHLTH      0
POORHLTH    202
EXRACT11    178
PAFREQ1_    326
_MINAC11    326
ACTIN12_    183
EXRACT21    183
PAFREQ2_    326
_MINAC21    196
ACTIN22_    197
STRFREQ_     22
PA2MIN_     200
_METSTAT      0
_URBSTAT      0
HTIN4         5
WTKG3        21
_BMI5        24
_SMOKER3      0
_DRNKWK1      0
FC601_        0
FTJUDA2_     26
GRENDA1_     29
VEGEDA2_     28
POTADA1_     30
FRNCHDA_     24
_FRUTSU1     34
_VEGESU1     42
_PAINDX2      0
_PASTRNG      0
_AGE80        0
dtype: int64


### PA2MIN_
#### Question:  Minutes of total Physical Activity per week
Originally coded as: 
- 0-99999: Minutes of Activity per week
- BLANK: not asked or missing
    

In [72]:
#checking column values and check data type
recoded_health_behaviour_df['PA2MIN_'].head(20)

0       60.0
1        0.0
2        NaN
3        NaN
4      120.0
5       60.0
6        NaN
7        0.0
8      224.0
9        0.0
10    2160.0
11      56.0
12     270.0
13    1440.0
14     120.0
15     420.0
16      90.0
17       NaN
18       NaN
19     210.0
Name: PA2MIN_, dtype: object

In [73]:
recoded_health_behaviour_df.PA2MIN_ = pd.to_numeric(recoded_health_behaviour_df.PA2MIN_).astype('Int32')
recoded_health_behaviour_df['PA2MIN_'].dtypes

Int32Dtype()

In [74]:
recoded_health_behaviour_df['PA2MIN_'].head(20)

0       60
1        0
2     <NA>
3     <NA>
4      120
5       60
6     <NA>
7        0
8      224
9        0
10    2160
11      56
12     270
13    1440
14     120
15     420
16      90
17    <NA>
18    <NA>
19     210
Name: PA2MIN_, dtype: Int32

### PHYSHLTH
#### Question:  Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good?
Answers originally coded as: 
- 1-30: Number of Days, numeric
- 88: None
- 77: Don't know/Not sure
- 99: Refused
- Blank: Not asked or missing

Recode to:

- 1-30: Number of Days, int
- 0: None
- Nan: Don't know/Not sure
- Nan: Refused
- Blank: Not asked or missing


In [75]:
#print column values for reference and check data type
recoded_health_behaviour_df['PHYSHLTH'].head(20)

0     88.0
1     88.0
2     88.0
3     88.0
4     88.0
5     88.0
6      1.0
7     88.0
8      9.0
9     88.0
10    15.0
11    88.0
12    88.0
13    88.0
14    88.0
15     7.0
16    77.0
17     2.0
18    88.0
19     2.0
Name: PHYSHLTH, dtype: object

In [76]:
# recode PHYSHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 88, "PHYSHLTH"] = 0
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 77, "PHYSHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.PHYSHLTH == 99, "PHYSHLTH"] = np.NaN

In [77]:
#convert column from object to int
recoded_health_behaviour_df.PHYSHLTH = pd.to_numeric(recoded_health_behaviour_df.PHYSHLTH).astype('Int32')

In [78]:
#reprint values to confirm they were correctly recoded
recoded_health_behaviour_df['PHYSHLTH'].head(20)

0        0
1        0
2        0
3        0
4        0
5        0
6        1
7        0
8        9
9        0
10      15
11       0
12       0
13       0
14       0
15       7
16    <NA>
17       2
18       0
19       2
Name: PHYSHLTH, dtype: Int32

### MENTHLTH
#### Question:  Now thinking about your mental health, which includes stress, depression, and problems with emotions, for how many days during the past 30 days was your mental health not good?
Answers originally coded as: 
- 1-30: Number of Days, numeric
- 88: None
- 77: Don't know/Not sure
- 99: Refused
- Blank: Not asked or missing

Recode to:

- 1-30: Number of Days, int
- 0: None
- Nan: Don't know/Not sure
- Nan: Refused
- Blank: Not asked or missing


In [79]:
#print column values for reference and check data type
recoded_health_behaviour_df['MENTHLTH'].head(20)

0     88.0
1     88.0
2     15.0
3      7.0
4     88.0
5      7.0
6     88.0
7     10.0
8     88.0
9     88.0
10     1.0
11    10.0
12    10.0
13    88.0
14     1.0
15    20.0
16    88.0
17     2.0
18    88.0
19    88.0
Name: MENTHLTH, dtype: object

In [80]:
# recode PHYSHLTH values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 88, "MENTHLTH"] = 0
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 77, "MENTHLTH"] = np.NaN
recoded_health_behaviour_df.loc[health_behaviour_df.MENTHLTH == 99, "MENTHLTH"] = np.NaN

In [81]:
#convert column from object to int
recoded_health_behaviour_df.MENTHLTH = pd.to_numeric(recoded_health_behaviour_df.MENTHLTH).astype('Int32')

In [82]:
recoded_health_behaviour_df['MENTHLTH'].dtypes

Int32Dtype()

In [83]:
#reprint values to confirm they were correctly recoded
recoded_health_behaviour_df['MENTHLTH'].head(20)

0      0
1      0
2     15
3      7
4      0
5      7
6      0
7     10
8      0
9      0
10     1
11    10
12    10
13     0
14     1
15    20
16     0
17     2
18     0
19     0
Name: MENTHLTH, dtype: Int32

In [84]:
recoded_health_behaviour_df['MENTHLTH'].unique()

<IntegerArray>
[0, 15, 7, 10, 1, 20, 2, 30, 3, 27, 14, 5, <NA>, 25, 17, 4, 6, 18, 13, 23]
Length: 20, dtype: Int32

### _PAINDX2
#### Question:  Physical Activity Index
Values originally coded as:
- 1: Meet Aerobic Recommendations
- 2: Did Not Meet Aerobic Recommendations
- 9: Don’t know/Not Sure/Refused/Missing

Recode to:
- 1: Meet Aerobic Recommendations
- 2: Did Not Meet Aerobic Recommendations
- Nan: Don’t know/Not Sure/Refused/Missing
    

In [85]:
recoded_health_behaviour_df['_PAINDX2'].head(20)

0     2.0
1     9.0
2     2.0
3     2.0
4     2.0
5     2.0
6     2.0
7     9.0
8     1.0
9     9.0
10    1.0
11    2.0
12    1.0
13    1.0
14    2.0
15    1.0
16    2.0
17    9.0
18    9.0
19    1.0
Name: _PAINDX2, dtype: object

In [86]:
# recode values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df._PAINDX2 == 9, "_PAINDX2"] = np.NaN
recoded_health_behaviour_df._PAINDX2 = pd.to_numeric(recoded_health_behaviour_df._PAINDX2).astype('Int32')

In [87]:
recoded_health_behaviour_df['_PAINDX2'].unique()

<IntegerArray>
[2, <NA>, 1]
Length: 3, dtype: Int32

### _PASTRNG
#### Question:  Muscle Strengthening Recommendation
Originally coded as:
- 1: Meet muscle strengthening recommendations
- 2: Did not meet muscle strengthening recommendations
- 9: Don’t know/Not Sure/Refused/Missing

Recode to:
- 1: Meet muscle strengthening recommendations
- 2: Did not meet muscle strengthening recommendations
- Nan: Don’t know/Not Sure/Refused/Missing

In [88]:
recoded_health_behaviour_df['_PASTRNG'].unique()

array([Decimal('1.0'), Decimal('2.0'), Decimal('9.0')], dtype=object)

In [89]:
# recode values to new coding scheme described above
recoded_health_behaviour_df.loc[health_behaviour_df._PASTRNG == 9, "_PASTRNG"] = np.NaN
recoded_health_behaviour_df._PASTRNG = pd.to_numeric(recoded_health_behaviour_df._PASTRNG).astype('Int32')

In [90]:
recoded_health_behaviour_df['_PASTRNG'].unique()

<IntegerArray>
[1, 2, <NA>]
Length: 3, dtype: Int32

### _FRUTSU1
#### Question:  Total fruits consumed per day
Originally coded as: 
- 0-99998: Number of Fruits consumed per day (two implied decimal places)
- BLANK: Not asked or Missing

Recode to:
- 0.00-999.98: Number of Fruits consumed per day
- Nan: Not asked or Missing

In [91]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0      29.0
1      71.0
2      17.0
3       5.0
4      43.0
5     100.0
6     100.0
7     243.0
8     400.0
9      93.0
10    114.0
11      0.0
12    200.0
13     53.0
14      7.0
15     29.0
16     29.0
17    100.0
18      NaN
19    300.0
Name: _FRUTSU1, dtype: object

In [92]:
#TODO: convert to int to remove decimal place, convert to string then interate through and insert decimal before last two digits, convert to float

In [93]:
recoded_health_behaviour_df['_FRUTSU1'] = recoded_health_behaviour_df['_FRUTSU1'].fillna(100000).astype(float)

In [94]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0         29.0
1         71.0
2         17.0
3          5.0
4         43.0
5        100.0
6        100.0
7        243.0
8        400.0
9         93.0
10       114.0
11         0.0
12       200.0
13        53.0
14         7.0
15        29.0
16        29.0
17       100.0
18    100000.0
19       300.0
Name: _FRUTSU1, dtype: float64

In [95]:
recoded_health_behaviour_df['_FRUTSU1'] = recoded_health_behaviour_df['_FRUTSU1'].div(100).round(2)

In [96]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0        0.29
1        0.71
2        0.17
3        0.05
4        0.43
5        1.00
6        1.00
7        2.43
8        4.00
9        0.93
10       1.14
11       0.00
12       2.00
13       0.53
14       0.07
15       0.29
16       0.29
17       1.00
18    1000.00
19       3.00
Name: _FRUTSU1, dtype: float64

In [97]:
recoded_health_behaviour_df['_FRUTSU1'] = recoded_health_behaviour_df['_FRUTSU1'].mask(np.isclose(recoded_health_behaviour_df['_FRUTSU1'].values, 1000.00))

In [98]:
recoded_health_behaviour_df['_FRUTSU1'].head(20)

0     0.29
1     0.71
2     0.17
3     0.05
4     0.43
5     1.00
6     1.00
7     2.43
8     4.00
9     0.93
10    1.14
11    0.00
12    2.00
13    0.53
14    0.07
15    0.29
16    0.29
17    1.00
18     NaN
19    3.00
Name: _FRUTSU1, dtype: float64

###  _VEGESU1
#### Question:  Total vegetables consumed per day
Originally coded as: 
- 0-99998: Number of Vegetables consumed per day (two implied decimal places)
- BLANK: Not asked or Missing

Recode to:
- 0.00-999.98: Number of Vegetables consumed per day
- Nan: Not asked or Missing

In [99]:
recoded_health_behaviour_df['_VEGESU1'].head(20)

0     128.0
1      61.0
2      40.0
3       6.0
4      71.0
5     286.0
6     300.0
7     172.0
8     257.0
9      40.0
10    300.0
11     67.0
12    329.0
13    122.0
14    158.0
15    100.0
16    215.0
17     88.0
18      NaN
19    279.0
Name: _VEGESU1, dtype: object

In [100]:
recoded_health_behaviour_df['_VEGESU1'] = recoded_health_behaviour_df['_VEGESU1'].fillna(100000).astype(float)
recoded_health_behaviour_df['_VEGESU1'] = recoded_health_behaviour_df['_VEGESU1'].div(100).round(2)
recoded_health_behaviour_df['_VEGESU1'] = recoded_health_behaviour_df['_VEGESU1'].mask(np.isclose(recoded_health_behaviour_df['_VEGESU1'].values, 1000.00))
recoded_health_behaviour_df['_VEGESU1'].head(20)

0     1.28
1     0.61
2     0.40
3     0.06
4     0.71
5     2.86
6     3.00
7     1.72
8     2.57
9     0.40
10    3.00
11    0.67
12    3.29
13    1.22
14    1.58
15    1.00
16    2.15
17    0.88
18     NaN
19    2.79
Name: _VEGESU1, dtype: float64

## TODO: get mean values per state for chloropleth maps

In [101]:
recoded_health_behaviour_df.groupby('_STATE')['_VEGESU1'].mean()

_STATE
21.0    1.706528
Name: _VEGESU1, dtype: float64

### State Codes
original code: 
- 1: Alabama
- 2	Alaska	
- 4	Arizona	
- 5	Arkansas	
- 6	California	
- 8	Colorado	
- 9	Connecticut	
- 10	Delaware	
- 11	District of Columbia	
- 12	Florida	
- 13	Georgia	
- 15	Hawaii	
- 16	Idaho	
- 17	Illinois	
- 18	Indiana	
- 19	Iowa	
- 20	Kansas	
- 21	Kentucky	
- 22	Louisiana	
- 23	Maine	
- 24	Maryland	
- 25	Massachusetts	
- 26	Michigan	
- 27	Minnesota	
- 28	Mississippi	
- 29	Missouri	
- 30	Montana	
- 31	Nebraska	
- 32	Nevada	
- 33	New Hampshire	
- 35	New Mexico	
- 36	New York	
- 37	North Carolina	
- 38	North Dakota	
- 39	Ohio	
- 40	Oklahoma	
- 41	Oregon	
- 42	Pennsylvania	
- 44	Rhode Island	
- 45	South Carolina	
- 46	South Dakota	
- 47	Tennessee	
- 48	Texas	
- 49	Utah	
- 50	Vermont	
- 51	Virginia	
- 53	Washington	
- 54	West Virginia	
- 55	Wisconsin	
- 56	Wyoming
- 66	Guam	
- 72	Puerto Rico
    

In [None]:
##TODO: test on larger dataset

### Health Behaviours Chloropleth maps by state

In [59]:
import plotly.express as px
#https://plotly.com/python/choropleth-maps/



fig = px.choropleth(locations=["KY"], locationmode="USA-states", color=[1.80927], scope="usa")
fig.show()

In [105]:
## percieved health across age
df = recoded_health_behaviour_df[['_AGE80', 'PA2MIN_']].copy()
df.head()

Unnamed: 0,_AGE80,PA2MIN_
0,47.0,60.0
1,30.0,0.0
2,42.0,
3,25.0,
4,41.0,120.0


In [116]:
df = df.dropna(axis=0, how = 'any')

In [118]:
df['PA2MIN_'].values

NameError: name 'PA2MIN_' is not defined

In [117]:
fig = px.scatter(df, x='_AGE80', y='PA2MIN_')
fig.show()

In [None]:
#TODO: make a dataframe that contains the demographic information of the sample and their health outcomes 
#Draw up summary of sample

In [None]:
#map health outcomes accross states

In [None]:
#create a large df with demographic and health behaviour info and create model that can predict health outcomes