In [31]:
#Import requisite modules
import sys
import pandas as pd
import numpy as np
import datetime
import os #Library module for .csv file check
from twitterscraper import query_tweets #if you haven't installed this module, run 'pip install twitterscraper' in your notebook

#---------------------------------------------------------------------

#Default search terms for the non-dictionary queries -- add or remove entries as you see fit!
query_list = [
    #Plain keywords
    'COVID', 'COVID-19', 'Corona', 'Coronavirus', 'Rona', 'Quarantine',
    #Hashtags
    '#COVID', '#COVID-19', '#quarantine', '#Quarantine', '#covid19',
]
#--------------------------------------------------------------------

#Credit to Danielle Medellin, DSI11-NYC for the below implementation of custom parameter dictionary support
#One entry per city; every entry carries the same keys:
#  'city'    -> label written to the 'city' field of each scraped tweet
#  'lat'     -> latitude of the search centre
#  'long'    -> longitude of the search centre
#  'radius'  -> search radius, number plus unit (e.g. '15mi')
#  'queries' -> list of search terms to run for that city
custom_params = {
    'Houston': {
        'city': 'Houston', #previously missing, which left the two entries with different keys
        'lat': 29.760427,
        'long': -95.369804,
        'radius': '15mi',
        'queries': ['rona', 'corona', 'covid'],
    },
    'Detroit': {
        'city': 'Detroit',
        'lat': 42.331429,
        'long': -83.045753,
        'radius': '10mi',
        'queries': ['stonks', 'tom nook', 'animal crossing'],
    },
}
#--------------------------------------------------------------------

#Get tweets without geolocation
def get_tweets(query):
    """Scrape tweets for ``query`` without any geolocation filter.

    Calls ``query_tweets`` with a fixed begin date of 2019-12-01 and copies
    the attributes of each returned tweet object into a plain dictionary.

    Parameters
    ----------
    query : str
        Search term handed to ``query_tweets``; also stored in every row's
        ``search_term`` field.

    Returns
    -------
    dict
        Maps a running integer index (0, 1, 2, ...) to one dictionary per
        tweet with the keys ``tweet_id``, ``username``, ``text``,
        ``tweet_date``, ``search_term``, ``city``, ``lat``, ``long`` and
        ``radius``. The four location fields are always NaN here.
    """
    tweets = {}
    #enumerate supplies the running index that later becomes the dataframe index
    for count, tweet in enumerate(query_tweets(query, begindate=datetime.date(2019,12,1))):
        tweets[count] = {
            'tweet_id': tweet.tweet_id,
            'username': tweet.username,
            'text': tweet.text,
            'tweet_date': tweet.timestamp,
            'search_term': query,
            #Location fields are NaN (not empty strings) so that they are flagged as
            #missing values during EDA. np.nan is used because the np.NaN alias was
            #removed in NumPy 2.0.
            'city': np.nan,
            'lat': np.nan,
            'long': np.nan,
            'radius': np.nan,
        }
    return tweets
#--------------------------------------------------------------------

#Get tweets with geolocation
def get_tweets_geoloc(query, city, lat, long, radius): #Geolocation parameters come from the master function or the parameter dictionary
    """Scrape tweets for ``query`` restricted to a geocode search area.

    The search string sent to ``query_tweets`` is the term followed by a
    ``geocode:lat,long,radius`` clause, with a fixed begin date of 2019-12-01.

    Parameters
    ----------
    query : str
        Search term; also stored in every row's ``search_term`` field.
    city : object
        Label copied verbatim into every row's ``city`` field.
    lat, long : float
        Coordinates interpolated into the geocode clause and stored per row.
    radius : str
        Radius with unit (e.g. ``'15mi'``), interpolated and stored per row.

    Returns
    -------
    dict
        Maps a running integer index (0, 1, 2, ...) to one dictionary per tweet.
    """
    #NOTE(review): the comma after the search term is preserved as-is -- confirm the search backend treats it as intended
    search_string = f"{query}, geocode:{lat},{long},{radius}"
    start_date = datetime.date(2019,12,1)
    #Location fields are identical for every tweet of this query
    location = {'city': city, 'lat': lat, 'long': long, 'radius': radius}
    return {
        position: {
            'tweet_id': tweet.tweet_id,
            'username': tweet.username,
            'text': tweet.text,
            'tweet_date': tweet.timestamp,
            'search_term': query,
            **location,
        }
        for position, tweet in enumerate(query_tweets(search_string, begindate=start_date))
    }
#--------------------------------------------------------------------

#Generate dataframe from "tweets" dictionary generated after each query
def make_dataframe(dictionary):
    """Turn a ``{row_index: {column: value}}`` dictionary into a dataframe.

    Each outer key becomes a row label and each inner dictionary supplies that
    row's column values; the result is a temporary frame meant to be
    concatenated with the frames of other queries.
    """
    return pd.DataFrame.from_dict(dictionary, orient='index')
#--------------------------------------------------------------------

#Query function using custom parameters
#Credit Danielle Medellin for this code block section
def get_query_dataframe_cp(custom_params):
    """Run every query of every city in a custom parameter dictionary.

    Parameters
    ----------
    custom_params : dict
        Maps a city key to a dictionary with the entries ``'lat'``, ``'long'``,
        ``'radius'`` and ``'queries'`` (a list of search terms), plus an
        optional ``'city'`` label.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets stacked into one frame with a fresh 0..n-1 index;
        an empty frame when there is nothing to query.
    """
    frames = []
    for key, params in custom_params.items():
        #Label rows with the entry's 'city' value, falling back to the dictionary key.
        #(Passing the whole parameter dictionary here would fill the 'city' column with dicts.)
        city = params.get('city', key)
        for query in params['queries']: #One independent query per search term
            tweets = get_tweets_geoloc(query, city, params['lat'], params['long'], params['radius'])
            frames.append(make_dataframe(tweets))
    if not frames: #pd.concat raises on an empty list
        return pd.DataFrame()
    #Concatenate once at the end instead of re-copying the growing frame on every iteration
    return pd.concat(frames, ignore_index = True)
#---------------------------------------------------------------

#Query function with geolocation but no custom parameters
def get_query_dataframe_geo(list_of_queries,city,lat,long,radius):
    """Run each query against one geolocation and stack the results.

    Parameters
    ----------
    list_of_queries : iterable of str
        Search terms; one geolocated query is run per term.
    city, lat, long, radius
        Forwarded unchanged to ``get_tweets_geoloc`` for every term.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets stacked into one frame with a fresh 0..n-1 index;
        an empty frame when ``list_of_queries`` is empty.
    """
    frames = [
        make_dataframe(get_tweets_geoloc(query, city, lat, long, radius))
        for query in list_of_queries
    ]
    if not frames: #pd.concat raises on an empty list
        return pd.DataFrame()
    #Concatenate once at the end instead of re-copying the growing frame on every iteration
    return pd.concat(frames, ignore_index = True)
#-------------------------------------------------------------------

#Query function with no geolocation and no custom parameter dictionary
def get_query_dataframe(list_of_queries):
    """Run each query without geolocation and stack the results.

    Parameters
    ----------
    list_of_queries : iterable of str
        Search terms; one query is run per term via ``get_tweets``.

    Returns
    -------
    pandas.DataFrame
        All scraped tweets stacked into one frame with a fresh 0..n-1 index;
        an empty frame when ``list_of_queries`` is empty.
    """
    frames = [make_dataframe(get_tweets(query)) for query in list_of_queries]
    if not frames: #pd.concat raises on an empty list
        return pd.DataFrame()
    #Concatenate once at the end instead of re-copying the growing frame on every iteration
    return pd.concat(frames, ignore_index = True)
#------------------------------------------------------------------

#Master function
def _answered_yes(answer):
    #True when the reply is "yes", ignoring letter case and surrounding whitespace
    return answer.strip().lower() == 'yes'


def _choose_csv_name():
    #Prompts for an export file name and resolves clashes with files already in datasets/
    csv_name = input("Input CSV export file name:")
    #Every proposed name is checked on disk, so a replacement name that also
    #exists cannot silently overwrite another file
    while os.path.exists(f'datasets/{csv_name}.csv'):
        overwrite_check = input ("File already exists--do you want to overwrite?")
        if _answered_yes(overwrite_check):
            break #User accepted overwriting this file
        csv_name = input("Input new output file name:")
    return csv_name


def get_dataset():
    """Interactively scrape tweets and optionally export them to CSV.

    Prompts, in order, for:

    1. whether to use the module-level ``custom_params`` dictionary;
    2. if not, whether to use geolocation with the module-level ``query_list``;
    3. whether to export the result to ``./datasets/<name>.csv`` and, if so,
       the file name (asking again whenever the name already exists and the
       user declines to overwrite it);
    4. latitude, longitude, city label and radius when geolocation was chosen.

    Only the answer ``yes`` (any letter case, surrounding whitespace ignored)
    counts as yes.

    Returns
    -------
    pandas.DataFrame
        The scraped tweets. The frame is returned whether or not it was also
        exported, so the data remains available in the notebook afterwards.

    Raises
    ------
    ValueError
        If the latitude or longitude entered cannot be converted to ``float``.
    """
    #Main function switches
    use_custom_params = _answered_yes(input("Are you using a custom parameter dictionary?"))
    use_geo = False
    if not use_custom_params: #Geolocation is only asked about when querying from the plain query list
        use_geo = _answered_yes(input("Are you using geolocation?"))

    #CSV parameter switches (asked before scraping so the long-running query can run unattended)
    export_csv = _answered_yes(input("Do you want to export the final dataframe to csv?"))
    custom_csv_name = _choose_csv_name() if export_csv else None

    #Query parameter switch block
    if use_custom_params:
        dataset = get_query_dataframe_cp(custom_params)
    elif use_geo:
        lat = float(input("Input Latitude:")) #Converts string input latitude to float value
        long = float(input("Input Longitude:"))
        city = input("Input city or neighborhood corresponding to coordinates:") #Allows filling of city values
        radius = input("Input radius and unit:")
        dataset = get_query_dataframe_geo(query_list, city, lat, long, radius)
    else:
        dataset = get_query_dataframe(query_list)

    #CSV export block
    if export_csv:
        os.makedirs('datasets', exist_ok=True) #Creates the folder only when it is missing
        dataset.to_csv(f"./datasets/{custom_csv_name}.csv", index = False) #write csv to datasets folder
        print(f"Export complete, scraped {len(dataset.index)} tweets") #Prints completion statement including total tweets scraped

    return dataset

In [32]:
#Runs the interactive scraper; answer the input prompts that appear below the cell
get_dataset()

Are you using a custom parameter dictionary? no
Do you want to export the final dataframe to csv? yes
Are you using geolocation? no


INFO: queries: ['COVID since:2019-12-01 until:2019-12-08', 'COVID since:2019-12-08 until:2019-12-16', 'COVID since:2019-12-16 until:2019-12-24', 'COVID since:2019-12-24 until:2020-01-01', 'COVID since:2020-01-01 until:2020-01-09', 'COVID since:2020-01-09 until:2020-01-17', 'COVID since:2020-01-17 until:2020-01-25', 'COVID since:2020-01-25 until:2020-02-02', 'COVID since:2020-02-02 until:2020-02-10', 'COVID since:2020-02-10 until:2020-02-18', 'COVID since:2020-02-18 until:2020-02-25', 'COVID since:2020-02-25 until:2020-03-04', 'COVID since:2020-03-04 until:2020-03-12', 'COVID since:2020-03-12 until:2020-03-20', 'COVID since:2020-03-20 until:2020-03-28', 'COVID since:2020-03-28 until:2020-04-05', 'COVID since:2020-04-05 until:2020-04-13', 'COVID since:2020-04-13 until:2020-04-21', 'COVID since:2020-04-21 until:2020-04-29', 'COVID since:2020-04-29 until:2020-05-07']
INFO: Got 1 tweets (1 new).
INFO: Got 2 tweets (1 new).
INFO: Got 3 tweets (1 new).
INFO: Got 6 tweets (3 new).
INFO: Got 24

Input CSV export file name: 2


Export complete, scraped 3526 tweets
