In [2]:
# Code source 
# https://python.plainenglish.io/scraping-tweets-with-tweepy-python-59413046e788

# Import the libraries
from tweepy import Stream
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from datetime import datetime
import tweepy
import json
import pandas as pd
import csv
import re #regular expression
from textblob import TextBlob
import string
import preprocessor as p
import os
import time
import keys

# Twitter API authentication.
# The four credentials are read from the local `keys` module, so no secret is
# written into the notebook itself.

consumer_key, consumer_secret = keys.CONSUMER_KEY, keys.CONSUMER_SECRET
access_key, access_secret = keys.ACCESS_KEY, keys.ACCESS_SECRET

# Build the OAuth handler from the consumer pair, attach the access token pair,
# then wrap it in the tweepy API client used by the cells below.

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

In [6]:
## Automating Scraping
# Calls API every 15 minutes to prevent overcalling

# 1. define a for-loop
# 2. define search parameter
# 3. define date period
# 4. define no. of tweets to pull

def scraptweets(search_words, date_since, numTweets, numRuns, f_name, language, time_sleep):
    """Scrape tweets in several spaced-out runs and save them all to one CSV file.

    Arguments:
        search_words: query string passed to the search endpoint as ``q``.
        date_since: accepted for backward compatibility but currently NOT applied;
            the ``since=date_since`` filter is disabled in the Cursor call below.
        numTweets: maximum number of tweets to pull per run.
        numRuns: number of runs to perform.
        f_name: label inserted into the output file name right after the timestamp.
        language: language code passed to the search endpoint as ``lang``.
        time_sleep: seconds to pause between two consecutive runs (e.g. 900 for
            15 minutes); no pause happens after the last run.

    Side effects:
        Uses the module-level ``api`` client, prints per-run statistics, and
        writes ``<cwd>/data/<timestamp><f_name>_noTweets=<count>.csv`` where
        ``<count>`` is the number of tweets collected by the final run
        (0 when ``numRuns`` is 0). The ``data`` directory is created if missing.

    Returns:
        None.
    """
    columns = ['tweet_id', 'username', 'acctdesc', 'location', 'following',
               'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts',
               'retweetcount', 'text', 'hashtags']

    # Rows are gathered in a plain list and turned into a DataFrame once at the
    # end; growing a DataFrame one row at a time gets slower with every row.
    rows = []

    # Defined up front so the file name can still be built when numRuns is 0.
    noTweets = 0

    for i in range(0, numRuns):
        # Time each run, and take the timestamp now so that the value printed
        # as "start time" really is the start of the run.
        start_run = time.time()
        run_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')

        # Cursor pages through the search results and yields one status per tweet.
        # since=date_since is intentionally left out (see docstring).
        tweets = tweepy.Cursor(api.search, q=search_words, lang=language,
                               tweet_mode='extended').items(numTweets)

        # Count of tweets collected in this run only.
        noTweets = 0

        for tweet in tweets:
            user = tweet.user

            # Retweets carry the full original text on retweeted_status;
            # plain tweets do not have that attribute at all.
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text

            # The 12 values, in the same order as `columns`.
            rows.append([
                tweet.id,
                user.screen_name,       # twitter handle
                user.description,       # description of account
                user.location,
                user.friends_count,     # accounts this user follows
                user.followers_count,   # accounts following this user
                user.statuses_count,    # total tweets by user
                user.created_at,        # when the account was created
                tweet.created_at,       # when the tweet was created
                tweet.retweet_count,
                text,
                tweet.entities['hashtags'],
            ])
            noTweets += 1

        # Run ended:
        duration_run = round(time.time() - start_run, 2)

        print('run {} start time: {}'.format(i, run_timestamp))
        print('no. of tweets scraped for run {} is {}'.format(i, noTweets))
        print('time take for {} run to complete is {}'.format(i, duration_run))

        # Pause between runs, but not after the final one.
        if i != numRuns - 1:
            time.sleep(time_sleep)

    # Once all runs have completed, save them to a single csv file.
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')

    # Make sure the target folder exists so the collected data is not lost
    # to a missing-directory error at the very last step.
    out_dir = os.path.join(os.getcwd(), 'data')
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(
        out_dir,
        to_csv_timestamp + f_name + "_noTweets=" + str(noTweets) + '.csv',
    )

    # Store dataframe in csv with creation date timestamp
    db_tweets = pd.DataFrame(rows, columns=columns)
    db_tweets.to_csv(filename, index=False, encoding='utf-8-sig')

    print('Scraping completed!')

In [7]:
# Settings for this scraping session; every value below is handed to scraptweets.

# Alternative: search by country instead of by keywords
# place_id = api.geo_search(query="libya", granularity="country")[0].id
# search_words = "place:%s" % place_id

search_words = "social media break"
date_since = "2021-08-03"
numTweets = 399
numRuns = 5
f_name = "_break_collection"
language = "en"
time_sleep = 900

# Start the scrape, passing each setting by name.
scraptweets(
    search_words=search_words,
    date_since=date_since,
    numTweets=numTweets,
    numRuns=numRuns,
    f_name=f_name,
    language=language,
    time_sleep=time_sleep,
)

run 0 start time: 20210811_160306
no. of tweets scraped for run 0 is 399
time take for 0 run to complete is 26.21
run 1 start time: 20210811_161925
no. of tweets scraped for run 1 is 399
time take for 1 run to complete is 79.39
run 2 start time: 20210811_163452
no. of tweets scraped for run 2 is 399
time take for 2 run to complete is 27.03
run 3 start time: 20210811_165017
no. of tweets scraped for run 3 is 399
time take for 3 run to complete is 24.86
run 4 start time: 20210811_170605
no. of tweets scraped for run 4 is 399
time take for 4 run to complete is 47.94
Scraping completed!


In [8]:
#"social media break OR break from social media OR disconnect to reconnect OR Digital detox  OR social media fast OR unplugging social media OR social media addiction OR time off social media OR social media detox" 