In [1]:
## Standard Imports
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import spacy 
import tweepy as tweepy
import csv, json
from spacy.lang.en.stop_words import STOP_WORDS
from textblob import TextBlob
## Importing the OS and JSON Modules
import os

#geolocator = Nominatim(user_agent="UCI_Tweet_Illness_App")
# Load spaCy's small English pipeline (`en_core_web_sm`); the model package
# must already be installed in the kernel's environment for this to succeed.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Authenticate against the Twitter API.
#
# Credentials live in a JSON file kept outside the repository with the keys
# `consumer_key`, `consumer_secret`, `access_token` and `access_secret`.
# The file location can be overridden with the TWITTER_API_CREDENTIALS
# environment variable so the notebook is not tied to one machine; when the
# variable is unset the original path is used.
credentials_path = os.environ.get(
    'TWITTER_API_CREDENTIALS',
    '/Users/Zach/Documents/.secret/twitter_api.json',
)

with open(credentials_path) as f:
    login = json.load(f)

auth = tweepy.OAuthHandler(login['consumer_key'], login['consumer_secret'])
auth.set_access_token(login['access_token'], login['access_secret'])

# Client object used by every API call below.
twitter_api = tweepy.API(auth)

In [3]:
# Report the remaining quota for the search endpoint and when it resets.
limits = twitter_api.rate_limit_status()
search_quota = limits['resources']['search']['/search/tweets']

print('Calls Left: ', search_quota['remaining'], sep='\n')

# `reset` is an epoch timestamp; convert it to a local datetime.
# Note: despite the label, the value printed is the moment the quota resets,
# not a duration.
reset_time = search_quota['reset']
dt = datetime.fromtimestamp(reset_time)
print('\nTime Until Reset: ', dt, sep='\n')

Calls Left: 
45

Time Until Reset: 
2023-02-27 16:47:21


In [14]:
# Search configuration. The two query strings were built by the health team.
# Both queries OR together symptom keywords/phrases and exclude retweets
# via `-filter:retweets`.

# Low-risk symptoms (cold / flu).
# NOTE(review): unlike the query below, this one has no `since:` operator -
# confirm whether that difference is intentional.
cold_flu_query = 'cold OR flu OR cough OR (runny nose) OR (stuffy nose) OR (sore throat) OR (muscle aches) OR headaches OR (body aches) OR (itchy throat) -filter:retweets'
# High-risk symptoms (pneumonia / bronchitis), restricted with `since:2022-01-01`.
pneu_bronc_query = 'pneumonia OR bronchitis OR sweating OR chills OR (shortness of breath) OR (difficulty breathing) OR (out of breath) OR (blood AND cough) since:2022-01-01 -filter:retweets'
# Geocode filter in "latitude,longitude,radius" form.
location = "39.8,-95.583068847656,2500km"  # Geographical center of the US with 2500km radius
# Language filter for the search.
language = "en"
# Presumably meant for the search's `result_type` option - TODO confirm it is
# actually passed to the search call.
result_type = "recent"

In [15]:
# Two searches: one for low-risk (cold/flu) symptoms and one for high-risk
# (pneumonia/bronchitis) symptoms.

def _search_symptom_tweets(query, max_items=1000):
    """Return a lazy iterator over up to `max_items` tweets matching `query`.

    The search is restricted by the notebook-level `location` (geocode),
    `language` and `result_type` settings. No request is made until the
    returned iterator is consumed.

    Changes from the earlier inline calls:
    * `since_id` is no longer passed. That parameter expects a tweet ID, not
      a date, so `'2022-01-01'` was not a meaningful value. Date bounds
      belong in the query string (`since:YYYY-MM-DD`).
    * `result_type` is now forwarded; it was configured but never used.
    * `count=100` requests the largest page the endpoint allows, so fetching
      `max_items` tweets spends far fewer rate-limited calls.
    """
    return tweepy.Cursor(
        twitter_api.search_tweets,
        q=query,
        geocode=location,
        lang=language,
        result_type=result_type,
        count=100,
    ).items(max_items)


cold_flu_tweets = _search_symptom_tweets(cold_flu_query)
pneu_bronc_tweets = _search_symptom_tweets(pneu_bronc_query)

In [16]:
# Keep only tweets that carry a place and flatten each one into a plain
# record (dict) ready to be loaded into a data frame.

def _geotagged_records(statuses):
    """Yield one record per status whose `place` is set; skip the rest."""
    for status in statuses:
        if status.place is None:
            continue
        yield {
            'tweet_id': status.id,
            'text': status.text,
            'location': status.place.full_name,
            'date': status.created_at.date(),
        }


# Lists are created first and filled incrementally, so anything collected
# before an error partway through the iteration is still available.
filtered_cold_flu_tweets = []
filtered_cold_flu_tweets.extend(_geotagged_records(cold_flu_tweets))

filtered_pneu_bronc_tweets = []
filtered_pneu_bronc_tweets.extend(_geotagged_records(pneu_bronc_tweets))

In [23]:
# Merge the freshly fetched tweets into the tweets already saved on disk,
# keeping exactly one row per tweet.
#
# Duplicates are detected on `tweet_id` alone. Comparing whole rows does not
# work here: `date` is read back from the CSV as a string, while freshly
# fetched rows hold `datetime.date` objects, so a re-fetched tweet never
# compares equal to its saved copy. Deduplicating on the ID after a single
# concat also avoids any nested loop over old and new tweets.

TWEET_COLUMNS = ['tweet_id', 'text', 'location', 'date']


def _load_saved_tweets(csv_path):
    """Read previously saved tweets, or return an empty frame on first run.

    A missing or completely empty file is treated as "nothing saved yet"
    instead of raising, so the notebook works before any CSV exists.
    """
    try:
        return pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        return pd.DataFrame(columns=TWEET_COLUMNS)


def _merge_tweets(saved, fetched):
    """Stack `saved` on top of `fetched` and drop repeated `tweet_id`s.

    Saved rows come first, so when a tweet appears in both frames the saved
    copy is the one kept. Empty frames are skipped before concatenating; if
    both are empty, an empty frame with the expected columns is returned.
    The index is not reset after dropping rows.
    """
    frames = [frame for frame in (saved, fetched) if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=TWEET_COLUMNS)
    merged = pd.concat(frames, ignore_index=True)
    return merged.drop_duplicates(subset='tweet_id')


cf_file = _load_saved_tweets('cold_flu.csv')
pb_file = _load_saved_tweets('pneu_bronc.csv')

low_risk_tweets = pd.DataFrame(filtered_cold_flu_tweets)
high_risk_tweets = pd.DataFrame(filtered_pneu_bronc_tweets)

new_cf_file = _merge_tweets(cf_file, low_risk_tweets)
new_pb_file = _merge_tweets(pb_file, high_risk_tweets)

In [24]:
# Preview the first rows of the merged cold/flu tweet table.
new_cf_file.head()

Unnamed: 0,tweet_id,text,location,date
0,1630003285267472386,@softcorgamegirl Aren’t your legs cold? Lol Th...,"New York, USA",2023-02-27
1,1630003194783756288,Getting sick of the cold and snow. 1554 infuse...,"Eaton, CO",2023-02-27
2,1630003166514233345,I need a comforting #truecrime hug while I con...,"Union, KY",2023-02-27
3,1630002720605106176,Ever since I had this flu😒 my back is killing ...,"Miami, FL",2023-02-27
4,1630002532704751616,Shoutout to @djgeogeo for experiencing his fir...,"Fontana, CA",2023-02-27


In [25]:
# Row count, column dtypes and null counts for the merged cold/flu table.
new_cf_file.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  138 non-null    int64 
 1   text      138 non-null    object
 2   location  138 non-null    object
 3   date      138 non-null    object
dtypes: int64(1), object(3)
memory usage: 5.4+ KB


In [26]:
# Preview the first rows of the merged pneumonia/bronchitis tweet table.
new_pb_file.head()

Unnamed: 0,tweet_id,text,location,date
0,1629997714531397635,So don’t ask how I know this…. But just becaus...,"Staten Island, NY",2023-02-27
1,1629991117495123968,@JuniorPalmtrees Had me sweating not going front,"Laurel, VA",2023-02-26
2,1629988525503553536,H pylori I got chills and my head start itchin...,"Santa Monica, CA",2023-02-26
3,1629986231869136897,"@amandadavey23 Fever, chills and body aches?? ...","New York, USA",2023-02-26
4,1629985968139776001,CHRIS. MF. KIRK. \n\nSweating that in person a...,"Palm Beach Gardens, FL",2023-02-26


In [27]:
# Row count, column dtypes and null counts for the merged pneumonia/bronchitis table.
new_pb_file.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 91 entries, 0 to 132
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tweet_id  91 non-null     int64 
 1   text      91 non-null     object
 2   location  91 non-null     object
 3   date      91 non-null     object
dtypes: int64(1), object(3)
memory usage: 3.6+ KB


In [22]:
# Persist both merged tables back to their CSV files (index column omitted),
# overwriting the previous contents.
for merged_frame, csv_path in (
    (new_cf_file, 'cold_flu.csv'),
    (new_pb_file, 'pneu_bronc.csv'),
):
    merged_frame.to_csv(csv_path, index=False)