## Understanding Seasonal Trends through NLP
- Abigail Bojorquez
- Zachary Romero

### Project Proposal: 
Analyzing Twitter data visualizes trends in common cold/flu symptoms vs pneumonia/bronchitis symptoms to educate health professionals.


In [3]:
## Standard Imports
import pandas as pd
import spacy 
import csv
import tweepy as tweepy
from spacy.lang.en.stop_words import STOP_WORDS

from textblob import TextBlob
## Importing the OS and JSON Modules
import os
import json

nlp = spacy.load('en_core_web_sm')

## Importing API Credentials

In [5]:
with open('/Users/zach/Documents/.secret/twitter_api.json') as f:
    login = json.load(f)
    
    login.keys()

In [6]:
auth = tweepy.OAuthHandler(login['consumer_key'], login['consumer_secret'])
auth.set_access_token(login['access_token'], login['access_secret'])
 
twitter_api = tweepy.API(auth)

In [7]:
# Quick Test Query
search_words = 'cold OR flu OR pneumonia OR bronchitis'

results = tweepy.Cursor(twitter_api.search_tweets, 
                        q=search_words, 
                        lang='en', 
                        since_id='2023-02-20').items(5)
for tweets in results:
    print(tweets.text)

@PlayoffShai Literally said “im back” that’s cold
RT @quickbitcrypto: watch as we review the @Trezor hardware wallet and show you how to set it up. The Trezor is a cold storage device for B…
RT @DrLoupis: Where did the flu go during COVID? https://t.co/HG8cOV7n0s
RT @KabukiSage: After a very long cold winter... We finally eating this year, brothers! https://t.co/7wAiW5vYZI
RT @RobMayeda: Now &amp; Next: Cold air pattern continues through midweek with showers at times, snow on the higher mtns and some hail/isolated…


In [8]:
# this will tell us how many
limits = twitter_api.rate_limit_status()
print('Calls Left: ')
print(limits['resources']['search']['/search/tweets']['remaining'])
reset_time = limits['resources']['search']['/search/tweets']['reset']
dt = datetime.fromtimestamp(reset_time)
print('\nTime Until Reset: ')
print(dt)

Calls Left: 
179

Time Until Reset: 
2023-03-05 16:37:17


In [6]:
# Quick Test Query

search_words = 'cold OR flu OR pneumonia OR bronchitis since:2022-01-01 -filter:retweets'
location = "39.8,-95.583068847656,2500km"  # Geographical center of the US with 2500km radius
language = "en"
result_type = "recent"

tweets = tweepy.Cursor(twitter_api.search_tweets, 
                        q=search_words, 
                        geocode=location,
                        lang=language).items(1000)

### Filters Tweets and Import into Data Frame

In [7]:
filtered_tweets = []

for tweet in tweets:
    if tweet.place is not None:
        filtered_tweets.append({
            'text': tweet.text,
            'location': tweet.place.full_name,
            'date': tweet.created_at.date()
        })
        
tweets_with_loc = pd.DataFrame(filtered_tweets)
tweets_with_loc.to_csv('test_tweets.csv', index=False)

## Creating Data Frame of Test_tweets

In [17]:
cf_tweets = pd.read_csv('./cold_flu.csv')
pb_tweets =pd.read_csv('./pneu_bronc.csv')

In [18]:
cf_tweets[['city', 'state']] = cf_tweets['location'].str.split(', ', expand=True)
pb_tweets[['city', 'state']] = pb_tweets['location'].str.split(', ', expand=True)
cf_tweets.drop('location', axis=1)
pb_tweets.drop('location', axis=1)

Unnamed: 0,tweet_id,text,date,city,state
0,1629997714531397635,So don’t ask how I know this…. But just becaus...,2023-02-27,Staten Island,NY
1,1629991117495123968,@JuniorPalmtrees Had me sweating not going front,2023-02-26,Laurel,VA
2,1629988525503553536,H pylori I got chills and my head start itchin...,2023-02-26,Santa Monica,CA
3,1629986231869136897,"@amandadavey23 Fever, chills and body aches?? ...",2023-02-26,New York,USA
4,1629985968139776001,CHRIS. MF. KIRK. \n\nSweating that in person a...,2023-02-26,Palm Beach Gardens,FL
...,...,...,...,...,...
445,1632256680481157120,He put a little extra sauce on that Jon Jones ...,2023-03-05,Toronto,Ontario
446,1632254763768115201,Jones already look out of breath lol #UFC285,2023-03-05,Miami,FL
447,1632249475426136064,I HAVE CHILLS HOLY SHIT #UFC285,2023-03-05,Plano,TX
448,1632248904908513280,i leave for cancun in 11 days chills,2023-03-05,Overland Park,KS


In [19]:
cf_state_counts = cf_tweets.groupby(['state', 'city']).size().reset_index(name='low_risk')
pb_state_counts = pb_tweets.groupby(['state', 'city']).size().reset_index(name='high_risk')

In [20]:
city_state_counts = pd.merge(cf_state_counts, pb_state_counts, on=['state', 'city'])
city_state_counts.head()

Unnamed: 0,state,city,low_risk,high_risk
0,AZ,Phoenix,33,8
1,AZ,Tempe,4,1
2,AZ,Tucson,9,1
3,Alberta,Edmonton,10,3
4,CA,Anaheim,15,2


In [21]:
county_df = pd.read_csv('./uscities.csv')
county_df.head()

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,density,source,military,incorporated,timezone,ranking,zips,id
0,New York,New York,NY,New York,36081,Queens,40.6943,-73.9249,18972871,10768.2,shape,False,True,America/New_York,1,11229 11226 11225 11224 11223 11221 11220 1138...,1840034016
1,Los Angeles,Los Angeles,CA,California,6037,Los Angeles,34.1141,-118.4068,12121244,3267.6,shape,False,True,America/Los_Angeles,1,90291 90293 90292 91316 91311 90035 90034 9003...,1840020491
2,Chicago,Chicago,IL,Illinois,17031,Cook,41.8375,-87.6866,8595181,4576.6,shape,False,True,America/Chicago,1,60018 60649 60641 60640 60643 60642 60645 6064...,1840000494
3,Miami,Miami,FL,Florida,12086,Miami-Dade,25.784,-80.2101,5711945,4945.7,shape,False,True,America/New_York,1,33128 33129 33125 33126 33127 33149 33144 3314...,1840015149
4,Dallas,Dallas,TX,Texas,48113,Dallas,32.7935,-96.7667,5668165,1522.2,shape,False,True,America/Chicago,1,75287 75098 75234 75254 75251 75252 75253 7503...,1840019440


In [22]:
county_df = county_df.drop(['city','lat','population','density','source','military','incorporated','timezone','ranking','zips','id'], axis=1)
county_df = county_df.rename(columns={'state_id': 'state'})
county_df = county_df.rename(columns={'city_ascii': 'city'})
county_df.head()

Unnamed: 0,city,state,state_name,county_fips,county_name,lng
0,New York,NY,New York,36081,Queens,-73.9249
1,Los Angeles,CA,California,6037,Los Angeles,-118.4068
2,Chicago,IL,Illinois,17031,Cook,-87.6866
3,Miami,FL,Florida,12086,Miami-Dade,-80.2101
4,Dallas,TX,Texas,48113,Dallas,-96.7667


In [55]:
merged_df = pd.merge(city_state_counts, county_df, on=['city','state'])
merged_df = merged_df.rename(columns={'county_name':'county', 'state_name' :'state'})
merged_df = merged_df.drop(['county_fips', 'lng'], axis=1)
merged_state_counts = merged_df.groupby(['county'], as_index=False).sum()
merged_df.head()

  merged_state_counts = merged_df.groupby(['county'], as_index=False).sum()


Unnamed: 0,state,city,low_risk,high_risk,state.1,county
0,AZ,Phoenix,33,8,Arizona,Maricopa
1,AZ,Tempe,4,1,Arizona,Maricopa
2,AZ,Tucson,9,1,Arizona,Pima
3,CA,Anaheim,15,2,California,Orange
4,CA,Beverly Hills,3,1,California,Los Angeles


In [53]:
json_obj = merged_state_counts.to_json(orient='records', indent=1)
with open('low_high_counts_county.json', 'w') as f:
    f.write(json_obj)