# Getting all socioeconomic information

This is the aggregate notebook for all the socioeconomic information needed for the paper

NOTE: datasets are slightly smaller due to the exclusion of US territories and other FIPS codes that did not get SES information

## gathering data

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px # choropleth maps
from IPython.display import Image
import os
import sys
import json

## needed in tis notebook
import scipy

# n grams
from sklearn.feature_extraction.text import CountVectorizer

raw = pd.read_csv('flair.joined.tweets.csv')

# changing date to more readable format
raw['created_at'] = pd.to_datetime(raw['created_at'])

# getting seperated date and time columns
raw['date'] = raw['created_at'].dt.date
raw['week'] = raw['created_at'].dt.week
raw['time'] = raw['created_at'].dt.time
raw['hour'] = raw['created_at'].dt.hour

min_wk = raw['week'].min()
max_wk = raw['week'].max()

min_date = raw['date'].min()
max_date = raw['date'].max()

# changing na to None
raw = raw.fillna('None')

raw = raw[raw['sentiment'] != 'None']

raw['is_negative'] = [1 if 'NEGATIVE' in str(sent) else 0 for sent in raw['sentiment']]
raw['is_positive'] = [1 if 'POSITIVE' in str(sent) else 0 for sent in raw['sentiment']]


def convert_fips(unknown_fips_list, fin):
    
    def convert(fipsline):
        name, postal, fips= fipsline.strip().split('\t')

        return {fips: name}

    fips_dict = {f: n 
                for dic in [convert(line)for line in open(fin, 'r')]
                for f, n in dic.items()}

    return [fips_dict[f'{unknown:02d}'] for unknown in unknown_fips_list]

raw = raw[raw['statefips'] != 'None']

raw['state_name'] = convert_fips([int(float(fips)) for fips in raw['statefips']],
                     '../analysis/functions/state.fips.txt')

us_state_abbrev = json.load(open('state_abbreviations.json'))

raw['state_abv'] = [us_state_abbrev[s] for s in raw['state_name']]

raw['fips'] = [f'{int(float(state)):02d}{int(float(county)):03d}' for i, state, county in raw[['statefips', 'countyfips']].itertuples()]
raw['count'] = 1
# raw.head()

all_splits = pd.to_datetime(['2020-03-05', '2020-03-26', '2020-05-01'])
raw['stage'] = ['s:1' if date < all_splits[0] else 's:2' if date < all_splits[1] else 's:3' if date < all_splits[2] else 's:4' for date in raw['date']]



#########################
# thresholds
thresholds = [0.8, 0.9, 0.95]
for thresh in thresholds:
    raw[f't:{thresh}'] = [1 if float(conf) > thresh else 0 for conf in raw['sent_confidence']]

col = 't:0.8'
raw['category'] = ['NEUT' if t == 0 else 'POS' if p == 1 else 'NEG' for i, n, p, t in raw[['is_negative', 'is_positive', col]].itertuples()]
raw['is_neutral'] = [1 if cat == 'NEUT' else 0 for cat in raw['category']]
raw['is_positive'] = [1 if cat == 'POS' else 0 for cat in raw['category']]
raw['is_negative'] = [1 if cat == 'NEG' else 0 for cat in raw['category']]

raw['is_neutral2'] = raw['is_neutral']
raw['is_positive2'] = raw['is_positive']
raw['is_negative2'] = raw['is_negative']

raw['is_neutral3'] = raw['is_neutral']
raw['is_positive3'] = raw['is_positive']
raw['is_negative3'] = raw['is_negative']

##################
# cdc

keywords = ['social distancing', 'social distance', 'physical distance', '6 feet', 'stay at home', 'school isolation', 'isolation', 'stay home', 'avoid touching', 'mask', 'covering', 'face shield', 'wear a mask', 'surgical mask', 'N95 respirator', 'wearing gloves', 'face shields', 'facial covering', 'skin protection', 'eye protection', 'ppe', 'wash hands', 'hand sanitizer', 'disinfect', 'clean', 'detergent', 'handwashing', 'hand hygiene', 'prevention hygiene', 'sprays', 'concentrates', 'wipes', 'routine cleaning', 'bleach solution', 'test', 'business closure']

topics = []
indexes = {i:0 for i in range(len(raw.index))}

for i, topic in enumerate(keywords):
    topics.append([1 if topic in text else 0 for text in raw['ogtext']])

for i, kw in enumerate(topics):
    for data_ind, value in enumerate(topics[i]):
        indexes[data_ind] += value

raw['in_cdc'] = [indexes[i] if 0 <= indexes[i] <= 1 else 1 for i in indexes]

cdc = raw[raw['in_cdc'] == 1]

cdc_splits = pd.to_datetime(['2020-03-12', '2020-04-20', '2020-05-14'])
cdc['stage'] = ['s:1' if date < cdc_splits[0] else 's:2' if date < cdc_splits[1] else 's:3' if date < cdc_splits[2] else 's:4' for date in cdc['date']]

print('raw data loaded...')

  raw['week'] = raw['created_at'].dt.week


raw data loaded...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdc['stage'] = ['s:1' if date < cdc_splits[0] else 's:2' if date < cdc_splits[1] else 's:3' if date < cdc_splits[2] else 's:4' for date in cdc['date']]


In [2]:
income = pd.read_csv('fips_unemp_medhh.csv')
income.head()

Unnamed: 0,FIPStxt,Stabr,area_name,Rural_urban_continuum_code_2013,Urban_influence_code_2013,Metro_2013,Civilian_labor_force_2000,Employed_2000,Unemployed_2000,Unemployment_rate_2000,...,Civilian_labor_force_2018,Employed_2018,Unemployed_2018,Unemployment_rate_2018,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Median_Household_Income_2018,Med_HH_Income_Percent_of_State_Total_2018
0,0,US,United States,,,,142601667,136904680,5696987,4.0,...,161389026,155102319,6286707,3.9,163100055,157115247,5984808,3.7,61937,
1,1000,AL,Alabama,,,,2133223,2035594,97629,4.6,...,2216627,2130845,85782,3.9,2241747,2174483,67264,3.0,49881,100.0
2,1001,AL,"Autauga County, AL",2.0,2.0,1.0,21720,20846,874,4.0,...,26196,25261,935,3.6,26172,25458,714,2.7,59338,119.0
3,1003,AL,"Baldwin County, AL",3.0,2.0,1.0,69533,66971,2562,3.7,...,95233,91809,3424,3.6,97328,94675,2653,2.7,57588,115.5
4,1005,AL,"Barbour County, AL",6.0,6.0,0.0,11373,10748,625,5.5,...,8414,7987,427,5.1,8537,8213,324,3.8,34382,68.9


In [3]:
poverty = pd.read_csv('fips_poverty_est.csv')
poverty.head()

Unnamed: 0,State FIPS Code,County FIPS Code,Postal Code,Name,"Poverty Estimate, All Ages",90% CI Lower Bound,90% CI Upper Bound,"Poverty Percent, All Ages",90% CI Lower Bound.1,90% CI Upper Bound.1,...,90% CI Upper Bound.5,Median Household Income,90% CI Lower Bound.6,90% CI Upper Bound.6,"Poverty Estimate, Age 0-4",90% CI Lower Bound.7,90% CI Upper Bound.7,"Poverty Percent, Age 0-4",90% CI Lower Bound.8,90% CI Upper Bound.8
0,0,0,US,United States,39490096,39248096,39732096,12.3,12.2,12.4,...,16.0,65712,65594,65830,3457689,3405854,3509524,18.2,17.9,18.5
1,1,0,AL,Alabama,747478,730491,764465,15.6,15.2,16.0,...,21.6,51771,51179,52363,69236,65296,73176,24.2,22.8,25.6
2,1,1,AL,Autauga County,6723,5517,7929,12.1,9.9,14.3,...,19.4,58233,52517,63949,.,.,.,.,.,.
3,1,3,AL,Baldwin County,22360,18541,26179,10.1,8.4,11.8,...,17.2,59871,54593,65149,.,.,.,.,.,.
4,1,5,AL,Barbour County,5909,4787,7031,27.1,22.0,32.2,...,49.0,35972,31822,40122,.,.,.,.,.,.


## merging datasets with SES data

In [4]:
income['fips'] = [f'{int(float(state)):05d}' for i, state in income[['FIPStxt']].itertuples()]

stats_df = income[income['fips'].isin(raw['fips'].unique())][['fips', 'Unemployment_rate_2019', 'Median_Household_Income_2018', 'Med_HH_Income_Percent_of_State_Total_2018']]

stats_df.columns = ['fips', 'unemp_rate', 'median_hh', '%_state_total']

stats_df['median_hh'] = stats_df['median_hh'].replace(',','', regex = True)
stats_df['fips'] = stats_df['fips'].astype('str')
income_raw = pd.merge(raw, stats_df, on='fips')

# had to pull peurto rico stats
income_raw = income_raw[~income_raw['median_hh'].isna()]
income_raw['median_hh'] = income_raw['median_hh'].astype('int')

income_cdc = income_raw[income_raw['in_cdc'] == 1]

income_raw.head()

Unnamed: 0,id_str,created_at,follower_count,friends_count,statefips,countyfips,countyname,is_urban,ogtext,cleantext,...,is_neutral2,is_positive2,is_negative2,is_neutral3,is_positive3,is_negative3,in_cdc,unemp_rate,median_hh,%_state_total
0,id:1226763665036075008,2020-02-10 07:03:44+00:00,270.0,539.0,32.0,3.0,Clark,1,having a mocha and avoiding coronavirus at the...,have mocha avoid coronavirus airport,...,0,1,0,0,1,0,0,4.0,57155,97.3
1,id:1226773799342706689,2020-02-10 07:44:00+00:00,312.0,125.0,32.0,3.0,Clark,1,sadly nothing will be done to help the people ...,sadly help people china communist government u...,...,0,0,1,0,0,1,0,4.0,57155,97.3
2,id:1226327068389998592,2020-02-09 02:08:51+00:00,2887.0,831.0,32.0,3.0,Clark,1,the world is grieving for the wuhan doctor who...,world grieving wuhan doctor try warn colleague...,...,0,0,1,0,0,1,0,4.0,57155,97.3
3,id:1233697011758587907,2020-02-29 10:14:23+00:00,7.0,169.0,32.0,3.0,Clark,1,risk of global coronavirus spread very high wa...,risk global coronavirus spread high warn china...,...,0,0,1,0,0,1,0,4.0,57155,97.3
4,id:1233697205405437953,2020-02-29 10:15:09+00:00,7.0,169.0,32.0,3.0,Clark,1,coronavirus what are the chances of dying,coronavirus chances die,...,0,0,1,0,0,1,0,4.0,57155,97.3


In [5]:
poverty = poverty[['State FIPS Code', 'County FIPS Code', 'Poverty Percent, All Ages']]
poverty['statefips'] = [str(n).zfill(2) for n in poverty['State FIPS Code']]
poverty['countyfips'] = [str(n).zfill(3) for n in poverty['County FIPS Code']]
poverty['fips'] = [f'{s}{c}' for i,s,c in poverty[['statefips', 'countyfips']].itertuples()]
poverty = poverty[poverty['fips'].isin(raw['fips'].unique())]
poverty = poverty[['fips', 'Poverty Percent, All Ages']]
poverty.columns = ['fips', 'poverty_estimate']
poverty['poverty_estimate'] = poverty['poverty_estimate'].replace(',','', regex = True)
poverty_raw = pd.merge(raw, poverty, on='fips')
poverty_raw['poverty_estimate'] = poverty_raw['poverty_estimate'].astype(float)

poverty_cdc = poverty_raw[poverty_raw['in_cdc'] == 1]

poverty_raw.head()

Unnamed: 0,id_str,created_at,follower_count,friends_count,statefips,countyfips,countyname,is_urban,ogtext,cleantext,...,category,is_neutral,is_neutral2,is_positive2,is_negative2,is_neutral3,is_positive3,is_negative3,in_cdc,poverty_estimate
0,id:1226763665036075008,2020-02-10 07:03:44+00:00,270.0,539.0,32.0,3.0,Clark,1,having a mocha and avoiding coronavirus at the...,have mocha avoid coronavirus airport,...,POS,0,0,1,0,0,1,0,0,13.3
1,id:1226773799342706689,2020-02-10 07:44:00+00:00,312.0,125.0,32.0,3.0,Clark,1,sadly nothing will be done to help the people ...,sadly help people china communist government u...,...,NEG,0,0,0,1,0,0,1,0,13.3
2,id:1226327068389998592,2020-02-09 02:08:51+00:00,2887.0,831.0,32.0,3.0,Clark,1,the world is grieving for the wuhan doctor who...,world grieving wuhan doctor try warn colleague...,...,NEG,0,0,0,1,0,0,1,0,13.3
3,id:1233697011758587907,2020-02-29 10:14:23+00:00,7.0,169.0,32.0,3.0,Clark,1,risk of global coronavirus spread very high wa...,risk global coronavirus spread high warn china...,...,NEG,0,0,0,1,0,0,1,0,13.3
4,id:1233697205405437953,2020-02-29 10:15:09+00:00,7.0,169.0,32.0,3.0,Clark,1,coronavirus what are the chances of dying,coronavirus chances die,...,NEG,0,0,0,1,0,0,1,0,13.3


### stats on total dataframes

#### Unemployment Rates

In [6]:
print('unemployment rate stats ---------')
print('raw COVID-19 dataset:')
print(income_raw['unemp_rate'].describe())
print()
print('CDC dataset:')
print(income_cdc['unemp_rate'].describe())

unemployment rate stats ---------
raw COVID-19 dataset:
count    343657.000000
mean          3.669417
std           1.009196
min           1.700000
25%           3.000000
50%           3.500000
75%           4.100000
max          18.300000
Name: unemp_rate, dtype: float64

CDC dataset:
count    53215.000000
mean         3.670001
std          1.006160
min          1.800000
25%          3.000000
50%          3.500000
75%          4.100000
max         18.300000
Name: unemp_rate, dtype: float64


#### Median Household Income

In [7]:
print('median household income stats ---------')
print('raw COVID-19 dataset:')
print(income_raw['median_hh'].describe())
print()
print('CDC dataset:')
print(income_cdc['median_hh'].describe())

median household income stats ---------
raw COVID-19 dataset:
count    343657.000000
mean      67771.367116
std       18476.932479
min       28024.000000
25%       54210.000000
50%       63636.000000
75%       78714.000000
max      140382.000000
Name: median_hh, dtype: float64

CDC dataset:
count     53215.000000
mean      67896.560951
std       18445.246025
min       29380.000000
25%       54980.000000
50%       63755.000000
75%       78777.000000
max      140382.000000
Name: median_hh, dtype: float64


#### Poverty Rate Estimates

In [8]:
print('poverty estimate stats ---------')
print('raw COVID-19 dataset:')
print(poverty_raw['poverty_estimate'].describe())
print()
print('CDC dataset:')
print(poverty_cdc['poverty_estimate'].describe())

poverty estimate stats ---------
raw COVID-19 dataset:
count    343657.000000
mean         12.387980
std           4.045306
min           2.700000
25%           9.500000
50%          12.600000
75%          14.200000
max          47.700000
Name: poverty_estimate, dtype: float64

CDC dataset:
count    53215.000000
mean        12.364397
std          4.039146
min          2.700000
25%          9.500000
50%         12.600000
75%         14.100000
max         40.100000
Name: poverty_estimate, dtype: float64


## getting aggregated county dataframes

In [14]:
counties_income_raw = income_raw.groupby(['fips']).agg({'is_negative':'mean', 'is_positive':'mean', 'is_neutral':'mean', 'median_hh':'mean', 'unemp_rate':'mean', 'state_name':'max', 'count':'count'}).reset_index()
counties_income_raw = counties_income_raw[counties_income_raw['count'] > 15]
print(len(counties_income_raw.index))

counties_income_cdc = income_cdc.groupby(['fips']).agg({'is_negative':'mean', 'is_positive':'mean', 'is_neutral':'mean', 'median_hh':'mean', 'unemp_rate':'mean', 'state_name':'max', 'count':'count'}).reset_index()
counties_income_cdc = counties_income_cdc[counties_income_cdc['count'] > 15]
print(len(counties_income_cdc.index))

counties_poverty_raw = poverty_raw.groupby(['fips']).agg({'is_negative':'mean', 'is_positive':'mean', 'is_neutral':'mean', 'poverty_estimate':'mean', 'state_name':'max', 'count':'count'}).reset_index()
counties_poverty_raw = counties_poverty_raw[counties_poverty_raw['count'] > 15]
print(len(counties_poverty_raw.index))

counties_poverty_cdc = poverty_cdc.groupby(['fips']).agg({'is_negative':'mean', 'is_positive':'mean', 'is_neutral':'mean', 'poverty_estimate':'mean', 'state_name':'max', 'count':'count'}).reset_index()
counties_poverty_cdc = counties_poverty_cdc[counties_poverty_cdc['count'] > 15]
print(len(counties_poverty_cdc.index))

909
413
909
413


### stats on number of tweets per county

In [16]:
print('number of tweets per county ------------')
print('raw COVID-19 dataset:')
print(counties_income_raw['count'].describe())
print()
print('CDC dataset:')
print(counties_income_cdc['count'].describe())

number of tweets per county ------------
raw COVID-19 dataset:
count      909.000000
mean       371.941694
std       1052.947177
min         16.000000
25%         31.000000
50%         80.000000
75%        269.000000
max      20168.000000
Name: count, dtype: float64

CDC dataset:
count     413.000000
mean      119.242131
std       229.782536
min        16.000000
25%        24.000000
50%        47.000000
75%       120.000000
max      3157.000000
Name: count, dtype: float64


### stats on proportion of sentiment per county

In [17]:
print('proportion of negative sentiment per county ------------')
print('raw COVID-19 dataset:')
print(counties_income_raw['is_negative'].describe())
print()
print('CDC dataset:')
print(counties_income_cdc['is_negative'].describe())
print()

proportion of negative sentiment per county ------------
raw COVID-19 dataset:
count    909.000000
mean       0.576876
std        0.087701
min        0.260870
25%        0.531250
50%        0.574468
75%        0.621951
max        0.904762
Name: is_negative, dtype: float64

CDC dataset:
count    413.000000
mean       0.611819
std        0.088573
min        0.153846
25%        0.564057
50%        0.612121
75%        0.666667
max        0.875000
Name: is_negative, dtype: float64



In [18]:
print('proportion of positive sentiment per county ------------')
print('raw COVID-19 dataset:')
print(counties_income_raw['is_positive'].describe())
print()
print('CDC dataset:')
print(counties_income_cdc['is_positive'].describe())
print()

proportion of positive sentiment per county ------------
raw COVID-19 dataset:
count    909.000000
mean       0.292193
std        0.082306
min        0.000000
25%        0.250000
50%        0.293515
75%        0.328622
max        0.736842
Name: is_positive, dtype: float64

CDC dataset:
count    413.000000
mean       0.244578
std        0.081727
min        0.000000
25%        0.203252
50%        0.236559
75%        0.281250
max        0.807692
Name: is_positive, dtype: float64



In [19]:
print('proportion of neutral sentiment per county ------------')
print('raw COVID-19 dataset:')
print(counties_income_raw['is_neutral'].describe())
print()
print('CDC dataset:')
print(counties_income_cdc['is_neutral'].describe())
print()

proportion of neutral sentiment per county ------------
raw COVID-19 dataset:
count    909.000000
mean       0.130932
std        0.049360
min        0.000000
25%        0.106061
50%        0.129083
75%        0.154664
max        0.391304
Name: is_neutral, dtype: float64

CDC dataset:
count    413.000000
mean       0.143603
std        0.057125
min        0.000000
25%        0.108014
50%        0.139535
75%        0.175000
max        0.380952
Name: is_neutral, dtype: float64



### stats on aggregated county-wide information

#### Unemployment Rate

In [18]:
print('unemployment rate stats ---------')
print('raw COVID-19 dataset:')
print(counties_income_raw['unemp_rate'].describe())
print()
print('CDC dataset:')
print(counties_income_cdc['unemp_rate'].describe())

unemployment rate stats ---------
raw COVID-19 dataset:
count    909.000000
mean       3.780858
std        1.257632
min        1.800000
25%        3.000000
50%        3.600000
75%        4.300000
max       18.300000
Name: unemp_rate, dtype: float64

CDC dataset:
count    413.000000
mean       3.660775
std        1.202409
min        1.800000
25%        2.900000
50%        3.500000
75%        4.100000
max       16.400000
Name: unemp_rate, dtype: float64


#### Median Household Income

In [17]:
print('median household income stats ---------')
print('raw COVID-19 dataset:')
print(counties_income_raw['median_hh'].describe())
print()
print('CDC dataset:')
print(counties_income_cdc['median_hh'].describe())

median household income stats ---------
raw COVID-19 dataset:
count       909.000000
mean      61489.355336
std       16394.213113
min       31741.000000
25%       50285.000000
50%       58057.000000
75%       68734.000000
max      140382.000000
Name: median_hh, dtype: float64

CDC dataset:
count       413.000000
mean      66516.326877
std       17888.071611
min       33989.000000
25%       53161.000000
50%       62817.000000
75%       76067.000000
max      140382.000000
Name: median_hh, dtype: float64


#### Poverty Estimates

In [19]:
print('poverty estimate stats ---------')
print('raw COVID-19 dataset:')
print(counties_poverty_raw['poverty_estimate'].describe())
print()
print('CDC dataset:')
print(counties_poverty_cdc['poverty_estimate'].describe())

poverty estimate stats ---------
raw COVID-19 dataset:
count    909.000000
mean      12.460836
std        4.680864
min        2.700000
25%        9.100000
50%       12.000000
75%       15.200000
max       31.500000
Name: poverty_estimate, dtype: float64

CDC dataset:
count    413.000000
mean      11.852058
std        4.529271
min        2.700000
25%        8.500000
50%       11.500000
75%       14.200000
max       31.100000
Name: poverty_estimate, dtype: float64


# Getting R and P values for all SES

## Unemployment Rate

In [27]:
cats = 'negative positive neutral'.split(' ')

print('unemployment rate stats ---------')
print('raw COVID-19 dataset:')
for cat in cats:
    x= counties_income_raw[f'is_{cat}']
    y= counties_income_raw['unemp_rate']
    
    print(f'{cat}:   r                  p')
    print(scipy.stats.pearsonr(x, y))    # Pearson's r
    print()

print('--------------------------------------')
print('CDC dataset:')
for cat in cats:
    x= counties_income_cdc[f'is_{cat}']
    y= counties_income_cdc['unemp_rate']
    
    print(f'{cat}:   r                  p')
    print(scipy.stats.pearsonr(x, y))    # Pearson's r
    print()

unemployment rate stats ---------
raw COVID-19 dataset:
negative:   r                  p
(0.09824397122521507, 0.003025659075575843)

positive:   r                  p
(-0.1406501118556454, 2.0822276931001285e-05)

neutral:   r                  p
(0.05997204543099846, 0.07071961898324369)

--------------------------------------
CDC dataset:
negative:   r                  p
(0.07707123863340176, 0.11785225426695678)

positive:   r                  p
(-0.06611716406259854, 0.17990263619454538)

neutral:   r                  p
(-0.02490889629069099, 0.6137325646768036)



## Median Household Income

In [28]:
cats = 'negative positive neutral'.split(' ')

print('median household income stats ---------')
print('raw COVID-19 dataset:')
for cat in cats:
    x= counties_income_raw[f'is_{cat}']
    y= counties_income_raw['median_hh']
    
    print(f'{cat}:   r                  p')
    print(scipy.stats.pearsonr(x, y))    # Pearson's r
    print()

print('--------------------------------------')
print('CDC dataset:')
for cat in cats:
    x= counties_income_cdc[f'is_{cat}']
    y= counties_income_cdc['median_hh']
    
    print(f'{cat}:   r                  p')
    print(scipy.stats.pearsonr(x, y))    # Pearson's r
    print()

median household income stats ---------
raw COVID-19 dataset:
negative:   r                  p
(-0.1322271423532856, 6.369585252392486e-05)

positive:   r                  p
(0.1553959969454878, 2.510096662018396e-06)

neutral:   r                  p
(-0.02418039276691351, 0.4665320954982118)

--------------------------------------
CDC dataset:
negative:   r                  p
(-0.020333241392785074, 0.6803317411473615)

positive:   r                  p
(0.03288836067882694, 0.5050755507069037)

neutral:   r                  p
(-0.015525177852362982, 0.7530869494479194)



## Poverty Estimates

In [29]:
cats = 'negative positive neutral'.split(' ')

print('poverty estimate stats ---------')
print('raw COVID-19 dataset:')
for cat in cats:
    x= counties_poverty_raw[f'is_{cat}']
    y= counties_poverty_raw['poverty_estimate']
    

    print(f'{cat}:   r                  p')
    print(scipy.stats.pearsonr(x, y))    # Pearson's r
    print()

print('--------------------------------------')
print('CDC dataset:')
for cat in cats:
    x= counties_poverty_cdc[f'is_{cat}']
    y= counties_poverty_cdc['poverty_estimate']
    
    print(f'{cat}:   r                  p')
    print(scipy.stats.pearsonr(x, y))    # Pearson's r
    print()

poverty estimate stats ---------
raw COVID-19 dataset:
negative:   r                  p
(0.046101683642716614, 0.16490135669612332)

positive:   r                  p
(-0.08355731884828414, 0.011730025207650648)

neutral:   r                  p
(0.05741652104513095, 0.08360667388703547)

--------------------------------------
CDC dataset:
negative:   r                  p
(-0.026115491044656637, 0.596658951003852)

positive:   r                  p
(-0.0038761374518536656, 0.9374033108847634)

neutral:   r                  p
(0.046038058928351465, 0.350684115711116)



# Urban / Rural T-Test information

## COVID-19 dataset

In [32]:
cats = 'negative positive neutral'.split(' ')

print('raw COVID-19 dataset -------------')

for cat in cats:
    print(f'{cat}:')

    urban = raw[raw['is_urban']==1].groupby(['fips']).agg({f'count':'sum', f'is_{cat}':'mean'}).reset_index()
    rural = raw[raw['is_urban']==0].groupby(['fips']).agg({f'count':'sum', f'is_{cat}':'mean'}).reset_index()

    urban = urban[urban[f'count'] > 15]
    rural = rural[rural[f'count'] > 15]

    print(f'len urban= {len(urban.index)}, len rural= {len(rural.index)}')
    print(f'meanUrban= {urban[f"is_{cat}"].mean()}, stdUrban= {urban[f"is_{cat}"].std()}')
    print(f'meanRural= {rural[f"is_{cat}"].mean()}, stdRural= {rural[f"is_{cat}"].std()}')

    from scipy.stats import ttest_ind
    t, p = ttest_ind(urban[f'is_{cat}'], rural[f'is_{cat}'], equal_var=False)
    print('         t                p')
    print(t, p)
    print()

raw COVID-19 dataset -------------
negative:
len urban= 830, len rural= 182
meanUrban= 0.5768273405328975, stdUrban= 0.08968712292612052
meanRural= 0.5833472394796978, stdRural= 0.1063719154051495
         t                p
-0.7691173158706105 0.44257817384399456

positive:
len urban= 830, len rural= 182
meanUrban= 0.29213852840404303, stdUrban= 0.08271359810864314
meanRural= 0.29182402353422937, stdRural= 0.09948418419024763
         t                p
0.039743138239857694 0.9683311113931015

neutral:
len urban= 830, len rural= 182
meanUrban= 0.13103413106305942, stdUrban= 0.05051981034437689
meanRural= 0.12482873698607284, stdRural= 0.058787628429605734
         t                p
1.3210763469296816 0.18771866425205697



## CDC Dataset

In [33]:
cats = 'negative positive neutral'.split(' ')

print('CDC dataset -------------')

for cat in cats:
    print(f'{cat}:')

    urban = cdc[cdc['is_urban']==1].groupby(['fips']).agg({f'count':'sum', f'is_{cat}':'mean'}).reset_index()
    rural = cdc[cdc['is_urban']==0].groupby(['fips']).agg({f'count':'sum', f'is_{cat}':'mean'}).reset_index()

    urban = urban[urban[f'count'] > 15]
    rural = rural[rural[f'count'] > 15]

    print(f'len urban= {len(urban.index)}, len rural= {len(rural.index)}')
    print(f'meanUrban= {urban[f"is_{cat}"].mean()}, stdUrban= {urban[f"is_{cat}"].std()}')
    print(f'meanRural= {rural[f"is_{cat}"].mean()}, stdRural= {rural[f"is_{cat}"].std()}')

    from scipy.stats import ttest_ind
    t, p = ttest_ind(urban[f'is_{cat}'], rural[f'is_{cat}'], equal_var=False)
    print('         t                p')
    print(t, p)
    print()

CDC dataset -------------
negative:
len urban= 355, len rural= 52
meanUrban= 0.6112191650318727, stdUrban= 0.08908389553737095
meanRural= 0.6543482085240492, stdRural= 0.07851853162870684
         t                p
-3.6332091678131966 0.0005226805793239376

positive:
len urban= 355, len rural= 52
meanUrban= 0.24547082218192756, stdUrban= 0.08224891919147614
meanRural= 0.21730468428056784, stdRural= 0.07157451983033206
         t                p
2.5976012601735405 0.011369514256739112

neutral:
len urban= 355, len rural= 52
meanUrban= 0.1433100127861997, stdUrban= 0.05654815836476976
meanRural= 0.12834710719538311, stdRural= 0.04113606841943789
         t                p
2.321311160150001 0.02274569131763659

