In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import GetOldTweets3 as got
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

## The function itself

I'll probably nest this in a functions.py file in the final project so it doesn't take up notebook space, but leaving it here for now so you can look through it easily, if you'd like! Scroll to the bottom to use it :)

In [2]:
def state_tweets_to_csv(query:str, tweets_per_iter:int, cities:dict, date_range:tuple, state:str):
    '''
    Scrape tweets matching a query around each city in a state/region
    and collect them in one timestamped .csv inside ./data, to create
    a representative sample of that state/region.

    Parameters
    ----------
    query : string, a search query to be passed through
    Twitter's advanced search. Can use booleans within
    the query!

    tweets_per_iter : int, number of tweets to pull per iteration.
    This is also used to calculate the sleep time between iterations.
    Values greater than 17999 raise an Exception
    (recommend using 8000 as an upper limit).

    cities : dict, dictionary where the keys are city names
    and the values are the distance in miles around the city to search.
    Keys should be strings, values can be strings or integers.
    Not case-sensitive
    Example: {'chicago': 10, 'sPringfield': '20'}

    date_range : tuple, a range of dates as strings to pull
    tweets from, formatted as 'YYYY-MM-DD'. Put earliest date first.
    Example: ('2020-03-20', '2020-03-25')

    state : string, enter the two-letter state code you are pulling info from.
    Not case-sensitive.

    Returns
    -------
    None. Results are written to
    ./data/{STATE}_scrape_data_{timestamp}.csv (one header row, all cities).

    Raises
    ------
    Exception : if tweets_per_iter is greater than 17999.
    '''
    # Check to make sure we won't trigger a timeout on Twitter
    if tweets_per_iter > 17999:
        raise Exception("Your max Tweet per iter must 17999 or lower")

    # Makes the data folder in the directory if you don't already have it
    os.makedirs('data', exist_ok=True)

    # Create a static timestamp to use for versioning
    timestamp = str(time.ctime().replace(' ', '_').replace(':', '_'))

    # Set state to uppercase for filenaming uniformity
    state = state.upper()

    # Every city's tweets end up appended to this single file
    state_csv = f'./data/{state}_scrape_data_{timestamp}.csv'

    #-----------------------------------------------------------------------------
    # Main search loop, developed by Eric Heidbreder, Haley Taft, Irene Anibogwu, and Steven Markoe

    # First, we need to set some variables and constants
    start_max_id = 1295148306117476352 # Set a starting max_id, thanks Steven Markoe for figuring out this approach!

    # Twitter's request timer resets every 15 minutes (900 seconds); 810 seconds is
    # 10% less than that to be safe, as tweets_per_iter seems to have some variation.
    time_window = 810
    max_tweets_per_time_window = 17999
    sleep_time = time_window * (tweets_per_iter / max_tweets_per_time_window)

    def city_csv():
        '''Path of the temporary .csv for the city currently being scraped.'''
        return f'./data/{city}_scrape_data.csv'

    def csv_store(resultsAux):
        '''
        A function that is used within getTweets() as a receive buffer.
        This function stores a city's info in a .csv so if you hit a
        rate limit, your data gets saved.
        '''
        # Create dataframe from the temporary variable, resultsAux (comes from getTweets() source code)
        df = pd.DataFrame([t.__dict__ for t in resultsAux])

        # Add new columns to the df
        df['city'] = city
        df['query'] = query
        df['date_range'] = str(date_range)
        df['state'] = state
        df['date'] = pd.to_datetime(df['date'], utc=True)
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day

        # The buffer can be flushed several times per search: write the header only
        # when the file is created, so every city's file has exactly one header row
        path = city_csv()
        df.to_csv(path, index=False, mode='a', header=not os.path.exists(path))

    def flush_city(frame):
        '''Append the current city's tweets to the state .csv and delete the city's temporary .csv.'''
        # Header goes in only when the state file is created, whichever city gets there first
        frame.to_csv(state_csv, mode='a', index=False, header=not os.path.exists(state_csv))
        # Clean up the directory by removing the city's .csv
        os.remove(city_csv())

    def pause():
        '''Rest a random amount to try not to be detected as a bot.'''
        # The random draw can dip below zero for tiny sleep times and time.sleep() rejects negatives
        time.sleep(max(0.0, np.random.normal(sleep_time, 0.1)))

    last_position = len(cities) - 1

    for position, (city_key, area) in enumerate(cities.items()):
        # Make city lowercase for consistent file naming
        city = city_key.lower()

        # Every city starts paging from the newest tweets again; carrying the previous
        # city's max_id over would skip everything newer than that city's last page
        max_id = start_max_id

        while True:
            # Try to get all tweets as determined by tweets_per_iter
            try:
                tweetCriteria = got.manager.TweetCriteria().setQuerySearch(f'{query} max_id:{max_id}')\
                                                   .setSince(date_range[0])\
                                                   .setUntil(date_range[1])\
                                                   .setMaxTweets(tweets_per_iter)\
                                                   .setNear(f'{city}, {state}')\
                                                   .setEmoji('unicode')\
                                                   .setWithin(f'{str(area)}mi')
                # This receive buffer goes into the csv_store function defined above
                got.manager.TweetManager.getTweets(tweetCriteria, receiveBuffer=csv_store)

                # Let's get the current city's csv that was created from the getTweets() receiveBuffer
                current_city = pd.read_csv(city_csv())

                # Is this a full page of tweets? If not it means it's the last page
                if len(current_city) < (tweets_per_iter / 2):
                    print(f'Returned {len(current_city)} tweets, wrapping up work on {city}!')
                    flush_city(current_city)
                    break

                # Tell me how many tweets we collected
                print(f'Finished current iteration for {city}, we got {len(current_city)} tweets.')
                print(f'Waiting {sleep_time} seconds before next iteration')

                # Set the new max id to the last id in this page. This will start the next query at this id.
                max_id = int(current_city.tail(1)['id'].values[0]) # HALEY TAFT FIGURED THIS OUT!

                flush_city(current_city)

                # Is this the last city? Don't sleep after it!
                if position != last_position:
                    pause()

            # If one of the searches didn't return anything, it won't create a .csv and will throw an error, let's account for that
            except FileNotFoundError:
                print(f'Found no tweets remaining for {city}, moving on to next city!')
                break

            # This is just a general catch-all for any other issues (including timeouts).
            # SystemExit is included because GetOldTweets3 appears to call sys.exit() when an
            # HTTP request fails (NOTE(review): confirm against the installed version).
            # KeyboardInterrupt is deliberately NOT caught so the run can be stopped by hand.
            except (Exception, SystemExit):

                # If there were errors above, we'll have to account for the missing .csvs with another try/except
                try:
                    # Let's get the current city's csv that was created above
                    current_city = pd.read_csv(city_csv())

                    # Tell me how many tweets we collected
                    print(f'Encountered error, storing {len(current_city)} tweets from {city} and moving on.')

                    # Is this a full page of tweets? If not, it means it's the last page
                    if len(current_city) < (tweets_per_iter / 2):
                        flush_city(current_city)
                        break

                    max_id = int(current_city.tail(1)['id'].values[0])

                    flush_city(current_city)

                    # As before, only cities after the first one rest here
                    if position != 0:
                        pause()

                # If the .csv didn't exist, just sleep and go on to the next city!
                except Exception:
                    pause()
                    break

    try:
        # Clean up final df. Everything is read as text so that rewriting the file
        # cannot alter values (e.g. large tweet ids being turned into floats).
        df_full = pd.read_csv(state_csv, dtype=str)
        # Drop any stray header rows that ended up in the data
        df_full = df_full[(df_full['username'] != 'username')]
        # There were some nulls in the text and date column that are likely the result of deleted/private tweets
        df_full = df_full.dropna(subset=['text', 'date'])
        # Persist the cleaned data; without this the clean-up has no effect
        df_full.to_csv(state_csv, index=False)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f'No tweets were stored for {state}, so there is nothing to clean up.')
    except KeyError as missing_column:
        print(f'Skipped the final clean-up, expected column not found: {missing_column}')
        

## Picking a search query

We tried to **grab every tweet from each city**, but soon found that we wouldn't be able to do that (Chicago returned 20,000 tweets for just one day, and we want to search across a **2-week period**). We're also limited in the **number of terms** we can include in a query (testing showed the cap was somewhere around 25-40 words; it may be character-based). We chose to build our query with words derived from the `top_words_il` dataframe, which contains the top words from Illinois after running the corpus through `CountVectorizer`. Illinois happened to be the first large dataframe we performed EDA on.

In [3]:
# Top words per the CountVectorizer run, read from the project's data folder
top_words_il = pd.read_csv(filepath_or_buffer='./data/top_words_cvec.csv')

In [4]:
# Words from the first 20 rows of column '0', minus the state's own name.
# list.remove() raises ValueError if 'illinois' is not among those 20 words.
term_list = top_words_il['0'].head(20).tolist()
term_list.remove('illinois')

Our query contains **24 terms**: 
* 50% are neutral words with stop words removed
* 50% are Covid-19-related terms.

In [5]:
term_list = (
    # 12 neutral, everyday words
    ['get', 'one', 'time', 'people', 'day', 'know',
     'today', 'need', 'go', 'home', 'right', 'going']
    # 12 Covid-19-related terms
    + ['pandemic', 'coronavirus', 'news', 'health', 'covid19', '19',
       'quarantine', 'governor', 'capitol', 'capital', 'corona', 'virus']
)

## Use this area to collect tweets!

In [60]:
# Building Illinois query
query = ' OR '.join(term_list) # Joining with OR so that we get tweets that contain those individual words rather than phrases
tweets_per_iter = 3600

# Picking wider ranges for more rural areas, shallower ranges for cities, used google maps to try not to overlap, but we can also check for duplicates afterward.
# Keys are city names, values are the search radius in miles around that city.
cities = {
          'springfield': 10,
          'chIcago': 10,
          'kewanee': 30,
          'rockford': 10,
          'freeport': 20,
          'vandalia': 50,
          'vermont': 20,
          'onarga': 20,
          'dixon': 20,
          # NOTE(review): 'peoria' used to be listed twice (10 here, 5 further down).
          # A dict literal keeps the last value at the first position, so the search
          # always ran with 5 -- kept as-is, confirm that 5 is the intended radius.
          'peoria': 5,
          'marion': 30,
          'marissa': 20,
          'highland park': 13,
          'gurnee': 10,
          'round lake': 5,
          'fox lake': 5,
          'marengo': 10,
          'galena': 2,
          'sterling': 15,
          'paw paw': 17,
          'naperville': 3,
          'aurora': 3,
          'bolingbrook': 3,
          'elgin': 10,
          'bristol': 10,
          'orland park': 5,
          'blue island': 5,
          'streator': 30,
          'monmouth': 15,
          'macomb': 13,
          'ripley': 18,
          'jacksonville': 2,
          'san jose': 34,
          'farmington': 10,
          'bloomington': 8,
          'melvin': 25,
          'champaign': 5,
          'tuscola': 15,
          'decatur': 5,
          'island grove': 26,
          'sumner': 10,
          'oblong': 10,
          'marshall': 2,
          'paris': 3,
          'danville': 2,
          'fairfield': 16,
          'mt carmel': 2,
          'enfield': 8,
          'harrisburg': 18,
          'pleasant grove': 12,
          'carbondale': 6,
          'pickneyville': 15, # NOTE(review): possibly meant 'pinckneyville' -- verify the search resolves this spelling
          'hecker': 11,
          'st jacob': 20,
          'nokomis': 20
         }
# 2 weeks total, starting 3 days before governor announced state shutdown.
# The 'Until' date is exclusive, so although this range looks like 15 days it covers 14 (Mar 17 through Mar 30).
date_range = ('2020-03-17', '2020-03-31')
state = 'il'

In [None]:
# # Uncomment to run
# state_tweets_to_csv(query, tweets_per_iter, cities, date_range, state)

In [6]:
# Georgia search settings
state = 'ga'

# Two weeks in total, beginning 3 days before the governor announced the state shutdown.
# The end ('Until') date is exclusive, which is why the range looks like 15 days.
date_range = ('2020-03-30', '2020-04-13')

tweets_per_iter = 3600

# OR-joined so a tweet matches on any single term instead of on the whole phrase
query = ' OR '.join(term_list)

# City name -> search radius in miles. Rural areas get wider radii and cities get tighter ones;
# radii were picked with Google Maps to limit overlap, and duplicates can still be checked for afterward.
cities = {
    'resaca': 19,
    'east ellijay': 14,
    'cleveland': 30,
    'nicholson': 14,
    'athens': 6,
    'carlton': 17,
    'philomath': 31,
    'grovetown': 15,
    'Herndon': 25,
    'glennville': 20,
    'georgetown': 7,
    'brunswick': 6,
    'atkinson': 15,
    'sunnyside': 18,
    'douglas': 25,
    'tifton': 18,
    'rebecca': 8,
    'pineview': 12,
    'warner robins': 10,
    'macon': 4,
    'mcintyre': 10,
    'deepstep': 15,
    'round oak': 15,
    'jersey': 16,
    'windsor': 10,
    'cumming': 13,
    'atlanta': 5,
    'cartersville': 21,
    'temple': 16,
    'hogansville': 20,
    'waverly hall': 15,
    'buena vista': 10,
    'shellman': 15,
    'albany': 5,
    'rowena': 12,
    'branchville': 20,
    'bainbridge': 10,
    'pavo': 20,
    'valdosta': 8,
    'dublin': 5,
    'alamo': 5,
}

In [None]:
# # uncomment to run
# state_tweets_to_csv(query, tweets_per_iter, cities, date_range, state)

Returned 318 tweets, wrapping up work on resaca!
Returned 72 tweets, wrapping up work on east ellijay!
Returned 178 tweets, wrapping up work on cleveland!
