In [None]:
# Import all needed libraries

import csv
import json
import os
import time
from datetime import date
from datetime import datetime
from getpass import getpass

import pandas as pd
import tweepy as tw                  # Python wrapper around Twitter API
from google.colab import drive  # to mount Drive to Colab notebook

In [None]:
# Connect Google Drive to Colab
drive.mount('/content/gdrive')

# Create a variable to store the data path on your drive.
# The path is anchored at the absolute mount point used above so it resolves
# regardless of the notebook's current working directory.
path = '/content/gdrive/My Drive/TwitterData'

Mounted at /content/gdrive


In [None]:
# Twitter API credentials.
# Secrets are read from environment variables so they are never stored in the
# notebook; any variable that is unset (or empty) is requested interactively
# with a prompt that does not echo the typed value.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY') or getpass('Twitter consumer key: ')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET') or getpass('Twitter consumer secret: ')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN') or getpass('Twitter access token: ')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET') or getpass('Twitter access token secret: ')

In [None]:
# Connect to Twitter API using the secrets
# The handler is created from the consumer key/secret pair and then given the
# access token pair defined in the credentials cell.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# `api` is the client object whose `search` method is passed to tw.Cursor in
# the cells below.
# NOTE(review): wait_on_rate_limit=True presumably makes tweepy wait for the
# rate limit to reset instead of raising - confirm against the tweepy docs.
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
def scraptweets(search_words, date_since, numTweets, numRuns,
                wait_seconds=900, output_dir='./gdrive/My Drive/TwitterData/data'):
    """Scrape tweets matching a query in several timed runs and save them to one CSV.

    Each run issues the same search through the module-level ``api`` client,
    extracts ten fields per tweet and accumulates them. After the last run all
    rows are written to ``<output_dir>/<YYYYmmdd_HHMMSS>endsars_tweets.csv``.

    Arguments:
        search_words: query string of keywords/hashtags to search for.
        date_since: date string passed to the search as ``since``.
        numTweets: maximum number of tweets to request per run.
        numRuns: number of runs to perform.
        wait_seconds: pause between consecutive runs, in seconds. Defaults to
            900 (15 minutes), the spacing the original notebook used between
            API runs. No pause is made after the final run.
        output_dir: directory the CSV is written to; created if missing. The
            default is relative to the current working directory.

    Returns:
        None. Progress is printed and the result is written to disk.

    Columns written, and the tweet attribute each comes from:
        username        <- user.screen_name
        location        <- user.location
        following       <- user.friends_count
        followers       <- user.followers_count
        totaltweets     <- user.statuses_count
        usercreatedts   <- user.created_at
        tweetcreatedts  <- created_at
        retweetcount    <- retweet_count
        text            <- retweeted_status.full_text for retweets, else full_text
        hashtags        <- entities['hashtags']

    NOTE(review): every run repeats the same query and rows are not
    de-duplicated, so consecutive runs may store the same tweet more than
    once - confirm whether that is intended.
    """
    columns = ['username', 'location', 'following',
               'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts',
               'retweetcount', 'text', 'hashtags']

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; appending to a DataFrame row by row re-allocates on every insert.
    rows = []

    for i in range(numRuns):
        # Time how long each run takes (fetching plus field extraction).
        start_run = time.time()

        # Cursor pages through the search results; items() caps the total.
        tweets = tw.Cursor(api.search, q=search_words, lang="en", since=date_since, tweet_mode='extended').items(numTweets)

        noTweets = 0
        for tweet in tweets:
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text

            user = tweet.user
            rows.append([user.screen_name, user.location, user.friends_count,
                         user.followers_count, user.statuses_count,
                         user.created_at, tweet.created_at, tweet.retweet_count,
                         text, tweet.entities['hashtags']])
            noTweets += 1

        duration_run = round(time.time() - start_run, 2)

        print('no. of tweets scraped for run {} is {}'.format(i, noTweets))
        print('time take for {} run to complete is {}'.format(i, duration_run))

        # Only wait when another run follows; sleeping after the final run
        # would just delay writing the CSV.
        if i < numRuns - 1:
            time.sleep(wait_seconds)

    db_tweets = pd.DataFrame(rows, columns=columns)

    # Timestamp in a readable, sortable format for the file name.
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')

    # Make sure the target directory exists so to_csv does not fail on a fresh Drive.
    os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, to_csv_timestamp + 'endsars_tweets.csv')

    # Store dataframe in csv with creation date timestamp
    db_tweets.to_csv(filename, index=False)

    print('Scraping has completed!')

In [None]:
# Search configuration for the scraper.
# The hashtags are OR-combined into a single query string.
search_words = " OR ".join([
    "#EndSARS",
    "#Endpolicebrutality",
    "#Reformthepolice",
    "#Sorosoke",
    "#ReformSARS",
    "#LekkiMassacre",
    "#LekkiGenocide",
    "#AwkuzuSARS",
    "#EndSWAT",
    "#SARSMUSTEND",
])
date_since = "2020-10-01"     # passed to the search as `since`
numTweets, numRuns = 2500, 5  # tweets requested per run, number of runs


In [None]:
# Run the scraper with the configuration defined in the previous cell
scraptweets(
    search_words=search_words,
    date_since=date_since,
    numTweets=numTweets,
    numRuns=numRuns,
)

no. of tweets scraped for run 0 is 2500
time take for 0 run to complete is 50.32
no. of tweets scraped for run 1 is 2500
time take for 1 run to complete is 55.1
no. of tweets scraped for run 2 is 2500
time take for 2 run to complete is 60.96
no. of tweets scraped for run 3 is 2500
time take for 3 run to complete is 65.32
no. of tweets scraped for run 4 is 2500
time take for 4 run to complete is 71.12
Scraping has completed!


In [None]:
# Collect 1000 tweets.
# tweet_mode='extended' requests the untruncated text; without it the `text`
# attribute is cut short and ends in an ellipsis plus a t.co link.
tweets = tw.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since,
              tweet_mode='extended').items(1000)

# Collect a list of tweet texts. For a retweet, take the text of the original
# tweet (same rule scraptweets applies); otherwise use the tweet's own text.
all_tweets = [getattr(tweet, 'retweeted_status', tweet).full_text for tweet in tweets]

# Return first 10 tweets
all_tweets[:10]

["But wait ooo, is @MBuhari @NigeriaGov @NGRPresident actually aware students haven't been going to school for 9 mont… https://t.co/iMw35JiDRg",
 'Over 70 killed during #EndSARS in Lagos unclaimed – SAN @realFFK @UKinNigeria @USinNigeria @AmnestyNigeria… https://t.co/eUuqkFgPZ9',
 'Yes...governmental pestilence!\n#EndSARS \n#EndBadGoveranceInNigeria https://t.co/t4ZGkt1hX5',
 '@sars_watch @Icon_Gadgets You’re getting this for your help during the #EndSARS moments \nStreet will never forget👍',
 '@SaharaReporters #EndSARS look at some of the craziest things we have experienced happening only in nigeria… https://t.co/FJP8e2OKkj',
 'This is a police officer here harrasing my younger sister for standing up against police brutality #EndSARS… https://t.co/brb6EvxxVW',
 '@vanguardngrnews Always Quick To Okay Bills About Billions and Their Pockets. \n\n#EndSARS  NDLEA | #NIMC | #MakeItHappen',
 "Thanks to @BBCAfrica's @briticoyemo @DukeU's @SFCDaly @NAMCHoutonTX's Kelechukwu Anyanwu, @DukeU's A

#### Text Cleanup

In [None]:
# Define a function that strips URLs and all non-alphanumeric characters from a tweet.
import re
def remove_url(txt: str) -> str:
    """Strip URLs and non-alphanumeric characters from a piece of text.

    Anything of the form ``scheme://...`` up to the next whitespace is removed,
    as is every character that is not an ASCII letter, a digit or whitespace
    (so punctuation, ``#``, ``@``, emoji and apostrophes all disappear).
    Runs of whitespace, including newlines, collapse to a single space and the
    result has no leading or trailing whitespace.

    Parameters:
        txt: the raw text to clean.

    Returns:
        The cleaned text; an empty string if nothing remains.
    """
    # `\s` (rather than only space and tab) is excluded from deletion so that
    # newlines keep acting as word separators instead of gluing words together.
    # The pattern is a raw string so the backslashes reach the regex engine intact.
    cleaned = re.sub(r"([^0-9A-Za-z\s])|(\w+://\S+)", "", txt)
    return " ".join(cleaned.split())

In [None]:
# Apply remove_url to every collected tweet, then preview the first 10 results
all_tweets_no_urls = list(map(remove_url, all_tweets))
all_tweets_no_urls[:10]

['But wait ooo is MBuhari NigeriaGov NGRPresident actually aware students havent been going to school for 9 mont',
 'Over 70 killed during EndSARS in Lagos unclaimed SAN realFFK UKinNigeria USinNigeria AmnestyNigeria',
 'Yesgovernmental pestilenceEndSARS EndBadGoveranceInNigeria',
 'sarswatch IconGadgets Youre getting this for your help during the EndSARS moments Street will never forget',
 'SaharaReporters EndSARS look at some of the craziest things we have experienced happening only in nigeria',
 'This is a police officer here harrasing my younger sister for standing up against police brutality EndSARS',
 'vanguardngrnews Always Quick To Okay Bills About Billions and Their Pockets EndSARS NDLEA NIMC MakeItHappen',
 'Thanks to BBCAfricas briticoyemo DukeUs SFCDaly NAMCHoutonTXs Kelechukwu Anyanwu DukeUs Abimbdu Adelakun',
 'instablog9ja Please yall should help view and listen to my track titled Blank Surface I did it for the',
 'Its not titles than honour men but men that honour title

In [None]:
#Addressing case issues for text

# Lowercase each cleaned tweet and split it on whitespace, giving one list of
# words per tweet; preview the first two.
words_in_tweet = [cleaned_text.lower().split() for cleaned_text in all_tweets_no_urls]
words_in_tweet[:2]

[['but',
  'wait',
  'ooo',
  'is',
  'mbuhari',
  'nigeriagov',
  'ngrpresident',
  'actually',
  'aware',
  'students',
  'havent',
  'been',
  'going',
  'to',
  'school',
  'for',
  '9',
  'mont'],
 ['over',
  '70',
  'killed',
  'during',
  'endsars',
  'in',
  'lagos',
  'unclaimed',
  'san',
  'realffk',
  'ukinnigeria',
  'usinnigeria',
  'amnestynigeria']]

In [None]:
# Calculating and plotting word frequency

import itertools
import collections

# Flatten the per-tweet word lists into one list holding every word
all_words_no_urls = list(itertools.chain.from_iterable(words_in_tweet))

# Tally how often each word occurs
counts_no_urls = collections.Counter()
counts_no_urls.update(all_words_no_urls)

# Show the 15 most frequent words with their counts
counts_no_urls.most_common(15)

[('endsars', 557),
 ('the', 548),
 ('to', 314),
 ('of', 283),
 ('in', 240),
 ('is', 237),
 ('and', 219),
 ('we', 182),
 ('a', 182),
 ('this', 174),
 ('for', 167),
 ('you', 156),
 ('are', 128),
 ('have', 120),
 ('that', 107)]