# Imports

In [1]:
import numpy as np
import pandas as pd

from datetime import datetime

import twint

# Fixes runtime errors with twint
import nest_asyncio
nest_asyncio.apply()

from textblob import TextBlob

import matplotlib
%matplotlib inline

# Scrape Tweets

In [2]:
def twint_search(search, username=None, since=None, until=None, drop_cols=None, limit=None):
    '''
    Function to return a pandas dataframe of tweets in English containing search terms using twint.
    Required parameter: search term.
    Optional parameters: username, start date (since) and end date (until) to search, maximum number of tweets (limit).
    drop_cols is accepted so existing callers keep working, but it is NOT applied here: search_loop drops
    those columns itself after calling this function.
    Returns a copy of the frame twint stored for this search, with the 'date' column converted to
    datetime.date objects. If the stored frame has no 'date' column (e.g. nothing was scraped) it is
    returned without conversion; if twint stored no frame at all, an empty dataframe is returned.
    '''
    c = twint.Config()
    c.Lang = 'en'
    c.Search = search
    c.Username = username
    c.Since = since
    c.Until = until
    c.Limit = limit
    c.Pandas = True
    # Hide the printing of every tweet during scrape
    c.Hide_output = True
    twint.run.Search(c)
    stored = twint.storage.panda.Tweets_df
    # Defensive guard: nothing to copy or convert if twint left the attribute unset
    if stored is None:
        return pd.DataFrame()
    # Work on a copy so neither this function nor its callers mutate twint's module-level frame
    df = stored.copy()
    # Transform date string into date object; skipped when the column is absent (empty scrape)
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date']).dt.date
    return df

In [3]:
def search_loop(start_date, end_date, search, filename, username=None, drop_cols=None, limit=None):
    '''
    Function to loop over date range and perform twint_search function for each day, returning one combined dataframe.
    Each pair of consecutive dates between start_date and end_date is searched as one (since, until) window.
    Saves progress to Datasets/{filename}.csv after every daily search that returned tweets
    (the Datasets directory is created if it does not exist).
    Required parameters: start date, end date, search term, filename (CSV name without extension).
    Optional parameters: username, columns to drop, maximum number of tweets per day (limit).
    Columns listed in drop_cols that are not present in a daily result are ignored.
    Returns an empty dataframe if the date range has fewer than two dates or no day returned tweets.
    '''
    import os  # local import: only needed to make sure the output directory exists

    out_dir = 'Datasets'
    os.makedirs(out_dir, exist_ok=True)
    out_path = f'{out_dir}/{filename}.csv'

    days = pd.date_range(start_date, end_date)
    daily_frames = []
    df = pd.DataFrame()
    # Walk consecutive (day, next day) pairs; the last date only ever serves as an `until` bound
    for day_start, day_end in zip(days[:-1], days[1:]):
        since = day_start.strftime('%Y-%m-%d')
        until = day_end.strftime('%Y-%m-%d')
        day_df = twint_search(search=search, username=username, since=since, until=until, drop_cols=drop_cols, limit=limit)
        # A day without tweets adds nothing to the combined frame or the CSV
        if day_df is None or day_df.empty:
            continue
        # Drop unwanted columns; a non-inplace drop leaves the frame returned by twint_search untouched,
        # and errors='ignore' tolerates columns that this day's result does not contain
        if drop_cols:
            day_df = day_df.drop(columns=drop_cols, errors='ignore')
        daily_frames.append(day_df)
        # Rebuild the combined frame with a fresh 0..n-1 index and checkpoint it to CSV
        df = pd.concat(daily_frames, ignore_index=True)
        df.to_csv(out_path)
    return df

In [8]:
# Inputs for the January 2020 scrape, consumed by the search_loop call in the next cell

# Date bounds handed to search_loop; its final daily window is 2020-01-31 -> 2020-02-01
start_date, end_date = '2020-01-01', '2020-02-01'

# Search string passed through to twint
search = '(mask OR masks OR facemask OR facemasks)'

# Base name (no extension) of the CSV that search_loop writes
filename = 'january_tweets'

# Columns removed from each daily result before it is added to the combined frame
drop_cols = [
    'timezone',
    'place',
    'cashtags',
    'user_id_str',
    'day',
    'hour',
    'search',
    'near',
    'geo',
    'source',
    'user_rt_id',
    'user_rt',
    'retweet_date',
    'translate',
    'trans_src',
    'trans_dest',
]

In [9]:
# Run the January scrape (limit=50 requested per daily window) and report how long it takes.
# search_loop also writes its progress to Datasets/january_tweets.csv as it goes.
%time jan_df = search_loop(start_date=start_date, end_date=end_date, search=search, filename=filename, drop_cols=drop_cols, limit=50)

CPU times: user 28.5 s, sys: 181 ms, total: 28.7 s
Wall time: 1min 11s


In [10]:
# Total number of tweets collected across the 31 daily windows.
# NOTE(review): 1860 rows / 31 windows = 60 per window, which is more than limit=50 -
# presumably twint fetches in fixed-size batches and overshoots the limit; confirm.
len(jan_df)

1860

In [11]:
# Preview the first rows of the combined frame.
# Note: the rendered output contains real usernames and tweet text - review before sharing.
jan_df.head()

Unnamed: 0,id,conversation_id,created_at,date,tweet,hashtags,user_id,username,name,link,retweet,nlikes,nreplies,nretweets,quote_url,retweet_id,reply_to
0,1212523886329380865,1212523886329380865,1577923196000,2020-01-01,SpongeBob: slides down Sand Mountain\nThe tree...,[],1099209894295678977,Matt04181,Matt0417 (CEO of POOP),https://twitter.com/Matt04181/status/121252388...,False,4,0,0,,,"[{'user_id': '1099209894295678977', 'username'..."
1,1212523880558026753,1212523880558026753,1577923195000,2020-01-01,"Started 2020 with freshly washed bed sheets, a...",[],57777541,standtallx,Emma 🧛🏻‍♀️,https://twitter.com/standtallx/status/12125238...,False,3,2,0,,,"[{'user_id': '57777541', 'username': 'standtal..."
2,1212523828888449024,1212421340281614338,1577923183000,2020-01-01,Using the coverage of his yellow card to mask ...,"[#classicjose, #coverup]",346609834,5_Times_LFC,6 Times LFC,https://twitter.com/5_Times_LFC/status/1212523...,False,0,0,0,,,"[{'user_id': '346609834', 'username': '5_Times..."
3,1212523807128186882,1212402852011937792,1577923177000,2020-01-01,The mask pic.twitter.com/vHi9pC9S62,[],1070005555777945600,kagura_shelby,mugiwara no,https://twitter.com/kagura_shelby/status/12125...,False,3,0,0,,,"[{'user_id': '1070005555777945600', 'username'..."
4,1212523752925229056,1212512506838519811,1577923164000,2020-01-01,"I know what you mean, I've been depressed too ...",[#hugs],48659042,IBdaSweet1,Luwamba L.J.Taylor,https://twitter.com/IBdaSweet1/status/12125237...,False,1,0,0,,,"[{'user_id': '48659042', 'username': 'IBdaSwee..."


In [12]:
# List the columns that remain after search_loop dropped the drop_cols entries
jan_df.keys()

Index(['id', 'conversation_id', 'created_at', 'date', 'tweet', 'hashtags',
       'user_id', 'username', 'name', 'link', 'retweet', 'nlikes', 'nreplies',
       'nretweets', 'quote_url', 'retweet_id', 'reply_to'],
      dtype='object')

# Analysis

In [41]:
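# TODO: sentiment analysis is currently disabled. The lines below refer to `df`, but the
# scraped frame created in the cells above is named `jan_df` - update the name before re-enabling.
# df['polarity'] = df['tweet'].apply(lambda x: TextBlob(x).sentiment.polarity)
# df['subjectivity'] = df['tweet'].apply(lambda x: TextBlob(x).sentiment.subjectivity)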

In [42]:
# df['polarity'].describe()

In [43]:
# df['subjectivity'].describe()