# Imports

In [1]:
import numpy as np
import pandas as pd

import twint

# Fixes runtime errors with twint
import nest_asyncio
nest_asyncio.apply()

import matplotlib
%matplotlib inline

# Scrape Tweets

In [2]:
def twint_search(search, username=None, since=None, until=None, limit=None):
    '''
    Run a single twint search and hand back the scraped tweets as a pandas dataframe.

    Required parameter:
        search: term (or query string) to look for.
    Optional parameters:
        username: value passed to twint's Username setting.
        since / until: start and end date of the search window.
        limit: maximum number of tweets.

    The search language is set to English ('en') and twint's per-tweet printing is hidden.
    Returns whatever twint left in twint.storage.panda.Tweets_df after the search.
    '''
    settings = {
        'Lang': 'en',
        'Search': search,
        'Username': username,
        'Since': since,
        'Until': until,
        'Limit': limit,
        # Have twint keep the results in its pandas storage so they can be read back below
        'Pandas': True,
        # Hide the printing of every tweet during scrape
        'Hide_output': True,
    }
    config = twint.Config()
    for option, value in settings.items():
        setattr(config, option, value)
    twint.run.Search(config)
    return twint.storage.panda.Tweets_df

In [3]:
def search_loop(start_date, end_date, search, username=None, limit=None):
    '''
    Function to loop over date range and perform twint_search function for each day, returning one combined dataframe.
    Required parameters: start date, end date, search term.
    Optional parameters: username, maximum number of tweets per day (limit).

    Each search covers the window from one date to the next, so a range of N dates
    produces N - 1 searches. A range with fewer than two dates performs no search and
    returns an empty dataframe. The combined dataframe gets a fresh 0..n-1 index.
    '''
    days = pd.date_range(start_date, end_date).strftime('%Y-%m-%d')
    # Collect the daily frames and concatenate once at the end: concatenating inside
    # the loop re-copies every previously gathered row on each iteration.
    daily_frames = []
    for since, until in zip(days[:-1], days[1:]):
        day_df = twint_search(search=search, username=username, since=since, until=until, limit=limit)
        # Skip a missing result so one day without data cannot break the final concat
        if day_df is not None:
            daily_frames.append(day_df)
    if not daily_frames:
        # pd.concat raises ValueError when given nothing to concatenate
        return pd.DataFrame()
    return pd.concat(daily_frames, ignore_index=True)

In [4]:
# Scrape tweets matching any of the mask keywords, one search per daily window
# between the two dates, capped at 20 tweets per day.
df = search_loop(
    start_date='2020-01-01',
    end_date='2020-02-01',
    search='(mask OR masks OR facemask OR facemasks)',
    limit=20,
)

In [5]:
# Row count of the combined scrape; the 620 recorded below equals 31 daily windows x limit=20.
len(df)

620

In [6]:
# Preview the first rows of the combined dataframe to check the scraped columns.
df.head()

Unnamed: 0,id,conversation_id,created_at,date,timezone,place,tweet,hashtags,cashtags,user_id,...,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1212523886329380865,1212523886329380865,1577923196000,2020-01-01 18:59:56,EDT,,SpongeBob: slides down Sand Mountain\nThe tree...,[],[],1099209894295678977,...,,,,,,"[{'user_id': '1099209894295678977', 'username'...",,,,
1,1212523880558026753,1212523880558026753,1577923195000,2020-01-01 18:59:55,EDT,,"Started 2020 with freshly washed bed sheets, a...",[],[],57777541,...,,,,,,"[{'user_id': '57777541', 'username': 'standtal...",,,,
2,1212523828888449024,1212421340281614338,1577923183000,2020-01-01 18:59:43,EDT,,Using the coverage of his yellow card to mask ...,"[#classicjose, #coverup]",[],346609834,...,,,,,,"[{'user_id': '346609834', 'username': '5_Times...",,,,
3,1212523807128186882,1212402852011937792,1577923177000,2020-01-01 18:59:37,EDT,,The mask pic.twitter.com/vHi9pC9S62,[],[],1070005555777945600,...,,,,,,"[{'user_id': '1070005555777945600', 'username'...",,,,
4,1212523752925229056,1212512506838519811,1577923164000,2020-01-01 18:59:24,EDT,,"I know what you mean, I've been depressed too ...",[#hugs],[],48659042,...,,,,,,"[{'user_id': '48659042', 'username': 'IBdaSwee...",,,,


# Clean Tweets

In [7]:
# filter_words = [
#     'boxing',
#     'fencing',
#     #'football',
#     'hockey',
#     'ski',
#     'skiing']