In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import GetOldTweets3 as got
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import time
from datetime import datetime

## The function itself

I'll probably nest this in a functions.py file in the final project so it doesn't take up notebook space, but leaving it here for now so you can look through it easily, if you'd like! Scroll to the bottom to use it :)

In [11]:
def state_tweets_to_csv(query:str, max_tweets:int, cities:dict, date_range:tuple, state:str, sleep_time:float=1.5):
    '''
    A function for returning search results on a query
    to create a representative sample of a state/region
    
    Parameters
    ----------
    query : string, a search query to be passed through
    Twitter's advanced search. Can use booleans within
    the query!
    
    max_tweets : int, number of tweets to pull, recommend
    staying within the boundaries of the twitter API limitations
    (recommend using 8000 as an upper limit)
    
    cities : dict, dictionary where the keys are [city, state abbreviation] 
    and the values are the distance around the city to search.
    Keys should be strings, values can be strings or integers.
    Not case-sensitive
    Example: {'chicago': 10, 'sPringfield': '20'}
    
    date_range : tuple, a range of dates as stringts to pull 
    tweets from, formatted as 'YYYY-MM-DD'. Put earliest date first. 
    Example: ('2020-03-20', '2020-03-25')
    
    state : string, enter the two-letter state code you are pulling info from.
    Not case-sensitive.
    '''
    # Makes the data folder in the directory if you don't already have it
    os.makedirs('data', exist_ok=True)
    
    def csv_store(resultsAux):
        '''
        A function that is used within getTweets() as a receive buffer.
        This function stores a city's info in a .csv so if you hit a
        rate limit, your data gets saved.
        '''
        # Create dataframe from the temporary variable, resultsAux (comes from getTweets() source code)
        df = pd.DataFrame(t.__dict__ for t in resultsAux)

        # Is this the first city?
        if city == list(cities.keys())[0]:
            # Add city column to this df and write to new .csv, 
            # the .csv will be removed at the end of the whole function
            df['city'] = city
            df['query'] = query
            df['date_range'] = str(date_range)
            df.to_csv(f'./data/{city}_scrape_data.csv', index=False, mode='a')

        else:
            # Don't need header for anything but the first city
            df['city'] = city
            df['query'] = query
            df['date_range'] = str(date_range)
            df.to_csv(f'./data/{city}_scrape_data.csv', index=False, mode='a', header=False)
            
        
    
    # Create a static timestamp to use for versioning
    timestamp = str(time.ctime().replace(' ', '_').replace(':', '_'))
    
    # Set state to uppercase for filenaming uniformity
    state = state.upper()
    
    for city, area in cities.items():
        # Make city lowercase for consitent file naming
        city = city.lower()
        
        # Try to get all tweets as determined by max_tweets
        try:
            tweetCriteria = got.manager.TweetCriteria().setQuerySearch(query)\
                                               .setSince(date_range[0])\
                                               .setUntil(date_range[1])\
                                               .setMaxTweets(max_tweets)\
                                               .setNear(f'{city}, {state}')\
                                               .setWithin(f'{str(area)}mi')
            tweets = got.manager.TweetManager.getTweets(tweetCriteria, 
                                                        receiveBuffer=csv_store) # This receive buffer goes into the csv_store function defined above

            # Let's get the current city's csv that was created above
            current_city = pd.read_csv(f'./data/{city}_scrape_data.csv')
            
            # Tell me how many tweets we collected
            print(f'Finished collecting tweets from {city}, we got {len(current_city)} tweets')

            # Is this the first city?
            if city == list(cities.keys())[0]:
                # Create a .csv and put each city's data inside
                current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                # Clean up the directory by removing the city's .csv
                os.remove(f'./data/{city}_scrape_data.csv')
                
                # Rest a random amount to try not to be detected as a bot
                time.sleep(np.random.normal(sleep_time, 0.1))
            
            # Is this the last city? Don't sleep after it!
            elif city == list(cities.keys())[-1]:  
                # Don't need header for anything but the first city
                current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False, header=False)
                # Clean up the directory by removing the city's .csv
                os.remove(f'./data/{city}_scrape_data.csv')             
            
            else:
                # Don't need header for anything but the first city
                current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False, header=False)
                os.remove(f'./data/{city}_scrape_data.csv')
                
                # Rest a random amount to try not to be detected as a bot
                time.sleep(np.random.normal(sleep_time, 0.1))
                
           
        
        # If one of the searches didn't return anything, it won't create a .csv and will throw an error, let's account for that
        except FileNotFoundError:
            pass
        
        # This is just a general catch-all for any other issues (including timeouts)
        except:
        
            # If there were errors above, we'll have to account for the missing .csvs with another try/except
            try:
                # Let's get the current city's csv that was created above
                current_city = pd.read_csv(f'./data/{city}_scrape_data.csv')

                # Tell me how many tweets we collected
                print(f'Finished collecting tweets from {city}, we got {len(current_city)} tweets')

                # Is this the first city?
                if city == list(cities.keys())[0]:
                    # Create a .csv and put each city's data inside
                    current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False)
                    # Clean up the directory by removing the city's .csv
                    os.remove(f'./data/{city}_scrape_data.csv')

                else:
                    # Don't need header for anything but the first city
                    current_city.to_csv(f'./data/{state}_scrape_data_{timestamp}.csv', mode='a', index=False, header=False)
                    os.remove(f'./data/{city}_scrape_data.csv')

                    # Rest a random amount to try not to be detected as a bot
                    time.sleep(np.random.normal(sleep_time, 0.1))
            
            # If the .csv didn't exist, just sleep and go on to the next city!
            except:
                time.sleep(np.random.normal(sleep_time, 0.1))

## Use this area to collect tweets!

I haven't been able to grab many tweets from rural areas in a short date range, thinking about expanding date range before and after an announcement so that non-urban areas are better represented.

In [9]:
query = ''
max_tweets = 3600

# Picking wider ranges for more rural areas, shallower ranges for cities, used google maps to try not to overlap, but we can also check for duplicates afterward.
cities = {
          'springfield': 10,
          'chIcago': 10,
          'kewanee': 30,
          'rockford': 10,
          'freeport': 20,
          'vandalia': 50,
          'vermont': 20,
          'onarga': 20,
          'dixon': 20,
          'peoria': 10,
          'marion': 30,
          'marissa': 20,
          'highland park': 13,
          'gurnee': 10,
          'round lake': 5,
          'fox lake': 5,
          'marengo': 10,
          'galena': 2,
          'sterling': 15,
          'paw paw': 17,
          'naperville': 3,
          'aurora': 3,
          'bolingbrook': 3,
          'elgin': 10,
          'bristol': 10,
          'orland park': 5,
          'blue island': 5,
          'streator': 30,
          'monmouth': 15,
          'macomb': 13,
          'ripley': 18,
          'jacksonville': 2,
          'san jose': 34,
          'peoria': 5,
          'farmington': 10,
          'bloomington': 8,
          'melvin': 25,
          'champaign': 5,
          'tuscola': 15,
          'decatur': 5,
          'island grove': 26,
          'sumner': 10,
          'oblong': 10,
          'marshall': 2,
          'paris': 3,
          'danville': 2,
          'fairfield': 16,
          'mt carmel': 2,
          'enfield': 8,
          'harrisburg': 18,
          'pleasant grove': 12,
          'carbondale': 6,
          'pickneyville': 15,
          'hecker': 11,
          'st jacob': 20,
          'nokomis': 20
         }
date_range = ('2020-03-13', '2020-03-28')
state = 'il'

In [10]:
state_tweets_to_csv(query, max_tweets, cities, date_range, state, 180)

Finished collecting tweets from springfield, we got 3635 tweets
Finished collecting tweets from chicago, we got 3635 tweets
Finished collecting tweets from kewanee, we got 146 tweets
Finished collecting tweets from rockford, we got 3635 tweets
Finished collecting tweets from freeport, we got 467 tweets
Finished collecting tweets from vandalia, we got 158 tweets
Finished collecting tweets from vermont, we got 39 tweets
Finished collecting tweets from onarga, we got 71 tweets
Finished collecting tweets from dixon, we got 892 tweets
Finished collecting tweets from peoria, we got 2634 tweets
Finished collecting tweets from marion, we got 2904 tweets
Finished collecting tweets from marissa, we got 260 tweets
Finished collecting tweets from highland park, we got 3635 tweets
Finished collecting tweets from gurnee, we got 3635 tweets
Finished collecting tweets from round lake, we got 3635 tweets
Finished collecting tweets from fox lake, we got 3635 tweets
Finished collecting tweets from mareng

In [12]:
query = ''
max_tweets = 3600

# Picking wider ranges for more rural areas, shallower ranges for cities, used google maps to try not to overlap, but we can also check for duplicates afterward.
cities = {
          'resaca': 19,
          'east ellijay': 14,
          'cleveland': 30,
          'nicholson': 14,
          'athens': 6,
          'carlton': 17,
          'philomath': 31,
          'grovetown': 15,
          'Herndon': 25,
          'glennville': 20,
          'georgetown': 7,
          'brunswick': 6,
          'atkinson': 15,
          'sunnyside': 18,
          'douglas': 25,
          'tifton': 18,
          'rebecca': 8,
          'pineview': 12,
          'warner robins': 10,
          'macon': 4,
          'mcintyre': 10,
          'deepstep': 15,
          'round oak': 15,
          'jersey': 16,
          'windsor': 10,
          'cumming': 13,
          'atlanta': 5,
          'cartersville': 21,
          'temple': 16,
          'hogansville': 20,
          'waverly hall': 15,
          'buena vista': 10,
          'shellman': 15,
          'albany': 5,
          'rowena': 12,
          'branchville': 20,
          'bainbridge': 10,
          'pavo': 20,
          'valdosta': 8,
          'dublin': 5,
          'alamo': 5
         }
date_range = ('2020-03-25', '2020-04-09')
state = 'ga'

In [13]:
state_tweets_to_csv(query, max_tweets, cities, date_range, state, 180)

Finished collecting tweets from resaca, we got 1541 tweets
Finished collecting tweets from east ellijay, we got 220 tweets
Finished collecting tweets from cleveland, we got 663 tweets
Finished collecting tweets from nicholson, we got 3599 tweets
An error occured during an HTTP request: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond
Try to open in browser: https://twitter.com/search?q=%20near%3A%22athens%2C%20GA%22%20within%3A6mi%20since%3A2020-03-25%20until%3A2020-04-09&src=typd
Finished collecting tweets from athens, we got 1299 tweets
Finished collecting tweets from carlton, we got 503 tweets
Finished collecting tweets from philomath, we got 295 tweets
Finished collecting tweets from grovetown, we got 3599 tweets
Finished collecting tweets from herndon, we got 3599 tweets
Finished collecting tweets from glennville, we got 2892 tweets