# Gathering aggregated state data

## loading in raw covid-19 dataset

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px # choropleth maps
from IPython.display import Image
import os
import sys
import json
# plotly maps
# import plotly.figure_factory as ff
# from urllib.request import urlopen

# sys.path.append(os.path.abspath("C:/Users/Alex/Documents/My Data Files/COVID Research/analysis/mention_2_hashtag.py"))

# from analysis.functions.mention_2_hashtag import * 
# from analysis.functions.datamanip import *

# n grams
from sklearn.feature_extraction.text import CountVectorizer

raw = pd.read_csv('flair.joined.tweets.csv')

# changing date to more readable format
raw['created_at'] = pd.to_datetime(raw['created_at'])

# getting seperated date and time columns
raw['date'] = raw['created_at'].dt.date
raw['week'] = raw['created_at'].dt.week
raw['time'] = raw['created_at'].dt.time
raw['hour'] = raw['created_at'].dt.hour

min_wk = raw['week'].min()
max_wk = raw['week'].max()

min_date = raw['date'].min()
max_date = raw['date'].max()

# changing na to None
raw = raw.fillna('None')

raw = raw[raw['sentiment'] != 'None']

raw['is_negative'] = [1 if 'NEGATIVE' in str(sent) else 0 for sent in raw['sentiment']]
raw['is_positive'] = [1 if 'POSITIVE' in str(sent) else 0 for sent in raw['sentiment']]


def convert_fips(unknown_fips_list, fin):
    
    def convert(fipsline):
        name, postal, fips= fipsline.strip().split('\t')

        return {fips: name}

    fips_dict = {f: n 
                for dic in [convert(line)for line in open(fin, 'r')]
                for f, n in dic.items()}

    return [fips_dict[f'{unknown:02d}'] for unknown in unknown_fips_list]

raw = raw[raw['statefips'] != 'None']

raw['state_name'] = convert_fips([int(float(fips)) for fips in raw['statefips']],
                     '../analysis/functions/state.fips.txt')

us_state_abbrev = json.load(open('state_abbreviations.json'))

raw['state_abv'] = [us_state_abbrev[s] for s in raw['state_name']]

raw['fips'] = [f'{int(float(state)):02d}{int(float(county)):03d}' for i, state, county in raw[['statefips', 'countyfips']].itertuples()]
raw['count'] = 1
# raw.head()

all_splits = pd.to_datetime(['2020-03-05', '2020-03-28', '2020-04-29'])
raw['stage'] = ['s:1' if date < all_splits[0] else 's:2' if date < all_splits[1] else 's:3' if date < all_splits[2] else 's:4' for date in raw['date']]



#########################
# thresholds
thresholds = [0.8, 0.9, 0.95]
for thresh in thresholds:
    raw[f't:{thresh}'] = [1 if float(conf) > thresh else 0 for conf in raw['sent_confidence']]

col = 't:0.8'
raw['category'] = ['NEUT' if t == 0 else 'POS' if p == 1 else 'NEG' for i, n, p, t in raw[['is_negative', 'is_positive', col]].itertuples()]
raw['is_neutral'] = [1 if cat == 'NEUT' else 0 for cat in raw['category']]
raw['is_positive'] = [1 if cat == 'POS' else 0 for cat in raw['category']]
raw['is_negative'] = [1 if cat == 'NEG' else 0 for cat in raw['category']]

raw['is_neutral2'] = raw['is_neutral']
raw['is_positive2'] = raw['is_positive']
raw['is_negative2'] = raw['is_negative']

raw['is_neutral3'] = raw['is_neutral']
raw['is_positive3'] = raw['is_positive']
raw['is_negative3'] = raw['is_negative']

##################
# cdc

keywords = ['social distancing', 'social distance', 'physical distance', '6 feet', 'stay at home', 'school isolation', 'isolation', 'stay home', 'avoid touching', 'mask', 'covering', 'face shield', 'wear a mask', 'surgical mask', 'N95 respirator', 'wearing gloves', 'face shields', 'facial covering', 'skin protection', 'eye protection', 'ppe', 'wash hands', 'hand sanitizer', 'disinfect', 'clean', 'detergent', 'handwashing', 'hand hygiene', 'prevention hygiene', 'sprays', 'concentrates', 'wipes', 'routine cleaning', 'bleach solution', 'test', 'business closure']

topics = []
indexes = {i:0 for i in range(len(raw.index))}

for i, topic in enumerate(keywords):
    topics.append([1 if topic in text else 0 for text in raw['ogtext']])

for i, kw in enumerate(topics):
    for data_ind, value in enumerate(topics[i]):
        indexes[data_ind] += value

raw['in_cdc'] = [indexes[i] if 0 <= indexes[i] <= 1 else 1 for i in indexes]

cdc = raw[raw['in_cdc'] == 1]

cdc_splits = pd.to_datetime(['2020-03-15', '2020-04-29', '2020-05-22'])
cdc['stage'] = ['s:1' if date < cdc_splits[0] else 's:2' if date < cdc_splits[1] else 's:3' if date < cdc_splits[2] else 's:4' for date in cdc['date']]

# print(f'there are a total of {len(cdc.index)} tweets after subsetting for cdc keywords')
# print(raw)
# print(cdc)
print('loaded...')

  raw['week'] = raw['created_at'].dt.week


loaded...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cdc['stage'] = ['s:1' if date < cdc_splits[0] else 's:2' if date < cdc_splits[1] else 's:3' if date < cdc_splits[2] else 's:4' for date in cdc['date']]


## creating aggregated state data for COVID-19 and CDC dataset

In [2]:
raw_states = raw.groupby(['state_abv']).agg({'count':'count', 'is_negative':'mean', 'is_negative2':'std', 'is_positive':'mean', 'is_positive2':'std', 'is_neutral':'mean', 'is_neutral2':'std'}).reset_index()
raw_states.columns = ['state', 'n', 'mean Neg', 'std Neg', 'mean Pos', 'std Pos', 'mean Neut', 'std Neut']
raw_states['data'] = "C19"

cdc_states = cdc.groupby(['state_abv']).agg({'count':'count', 'is_negative':'mean', 'is_negative2':'std', 'is_positive':'mean', 'is_positive2':'std', 'is_neutral':'mean', 'is_neutral2':'std'}).reset_index()
cdc_states.columns = ['state', 'n', 'mean Neg', 'std Neg', 'mean Pos', 'std Pos', 'mean Neut', 'std Neut']
cdc_states['data'] = "CDC"

states = pd.concat([raw_states, cdc_states])
states = states[['state', 'data', 'n', 'mean Neg', 'std Neg', 'mean Pos', 'std Pos', 'mean Neut', 'std Neut']]
states = states.sort_values(by=['state'])
states.to_csv('agg_state_c19_cdc.csv', index=False)
states

Unnamed: 0,state,data,n,mean Neg,std Neg,mean Pos,std Pos,mean Neut,std Neut
0,AK,C19,465,0.591398,0.492105,0.288172,0.453399,0.120430,0.325814
0,AK,CDC,73,0.561644,0.499619,0.328767,0.473016,0.109589,0.314539
1,AL,C19,3299,0.555623,0.496972,0.308578,0.461977,0.135799,0.342627
1,AL,CDC,475,0.562105,0.496651,0.244211,0.430071,0.193684,0.395601
2,AR,C19,1864,0.567060,0.495615,0.311159,0.463092,0.121781,0.327121
...,...,...,...,...,...,...,...,...,...
51,WI,C19,3633,0.586568,0.492517,0.282136,0.450101,0.131296,0.337771
52,WV,CDC,158,0.563291,0.497555,0.329114,0.471386,0.107595,0.310853
52,WV,C19,905,0.562431,0.496361,0.302762,0.459707,0.134807,0.341706
53,WY,C19,267,0.599251,0.490971,0.250936,0.434366,0.149813,0.357558


## Generating state map images

In [6]:
for datatype, data in states.groupby(['data']):
    if datatype == "C19":
        t_d = "COVID-19"
    else:
        t_d = "CDC"    
            
    for sent in ['Neg', 'Pos', 'Neut']:

        if sent == 'Neg':
            title = f'{t_d} - Proportion of Negative'
            col = 'Negative'
        if sent == 'Pos':
            title = f'{t_d} - Proportion of Positive'
            col = 'Positive'
        if sent == 'Neut':
            title = f'{t_d} - Proportion of Neutral'
            col = 'Neutral'

        data = data[~data['state'].isin(['GU','PR', 'VI'])]
        plot_data = data.groupby(['state']).agg({f'mean {sent}':'max'}).reset_index()
        plot_data.rename(columns={f'mean {sent}':col}, inplace=True)

        # print(datatype, sent)
        # print(plot_data)
        # print('-----------------')
        
        pl_map = px.choropleth(plot_data, locations='state', color=col, 
                                    scope="usa", locationmode="USA-states", hover_data=[col],
                                    title = title, color_continuous_scale = 'RdBu_r', range_color=(min(plot_data[col]), max(plot_data[col])))

        pl_map.update_layout(font=dict(size=8))

        pl_map.write_image(f'images/{sent}.{datatype}.map_8.png', engine="kaleido", width=500, height=300)

    col = 'N Tweets'
    plot_data = data.groupby(['state']).agg({f'n':'max'}).reset_index()
    plot_data.rename(columns={'n':col}, inplace=True)
    pl_map = px.choropleth(plot_data, locations='state', color=col, 
                                    scope="usa", locationmode="USA-states", hover_data=[col], title = f'{t_d} - Count of Tweets', color_continuous_scale = 'RdBu_r')
    pl_map.update_layout(font=dict(size=8))
    pl_map.write_image(f'images/count.{datatype}.map_8.png', engine="kaleido", width=500, height=300)

print('all maps updated')

all maps updated
