# Retrieve Data from Twitter API

## Import necessary libraries

In [1]:
import tweepy
import json
import pandas as pd
import numpy as np
import time

import twitter_credentials

## Use authentication credentials to connect to the Twitter API

In [2]:
def twitter_api_authentication():
    """Build and return an authenticated tweepy API client.

    The consumer key/secret and the access token/secret are read from the
    ``twitter_credentials`` module.
    """
    handler = tweepy.OAuthHandler(
        twitter_credentials.CONSUMER_KEY,
        twitter_credentials.CONSUMER_SECRET,
    )
    handler.set_access_token(
        twitter_credentials.ACCESS_TOKEN,
        twitter_credentials.ACCESS_TOKEN_SECRET,
    )
    return tweepy.API(handler)

In [3]:
# module-level API client; the retrieval functions in later cells use this name
api = twitter_api_authentication()

## Get the data from one tweet as an example


In [4]:
# get the latest tweet of Elon Musk using user_timeline

# screen_name is the account handle that follows the "@", e.g. "elonmusk" for @elonmusk
tweet_list = api.user_timeline(screen_name = "elonmusk", count = 1, tweet_mode="extended")
print(type(tweet_list)) # a ResultSet; _json is read from its items, not from the ResultSet itself
print(type(tweet_list[0])) # a Status object, which exposes the raw payload as _json

<class 'tweepy.models.ResultSet'>
<class 'tweepy.models.Status'>


In [5]:
# display the ResultSet to inspect the Status object it contains
tweet_list

[Status(_api=<tweepy.api.API object at 0x7fcf242f72e0>, _json={'created_at': 'Thu Mar 31 18:07:52 +0000 2022', 'id': 1509593334192517126, 'id_str': '1509593334192517126', 'full_text': '@DriveTeslaca Yes, but we take a very cautious approach to safety. As we confirm functionality on Canadian roads, the beta will expand.', 'truncated': False, 'display_text_range': [14, 135], 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'DriveTeslaca', 'name': 'Drive Tesla 🇨🇦', 'id': 1169348503975317504, 'id_str': '1169348503975317504', 'indices': [0, 13]}], 'urls': []}, 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'in_reply_to_status_id': 1509568826035224577, 'in_reply_to_status_id_str': '1509568826035224577', 'in_reply_to_user_id': 1169348503975317504, 'in_reply_to_user_id_str': '1169348503975317504', 'in_reply_to_screen_name': 'DriveTeslaca', 'user': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_na

In [6]:
# list all the keys available in the tweet's JSON dictionary
tweet_list[0]._json.keys()

dict_keys(['created_at', 'id', 'id_str', 'full_text', 'truncated', 'display_text_range', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])

In [7]:
# retrieve the full text of the latest tweet
tweet_list[0]._json['full_text']

'@DriveTeslaca Yes, but we take a very cautious approach to safety. As we confirm functionality on Canadian roads, the beta will expand.'

In [8]:
# retrieve the creation time of the tweet (exposed as an attribute of the Status object)
tweet_list[0].created_at

datetime.datetime(2022, 3, 31, 18, 7, 52, tzinfo=datetime.timezone.utc)

## Retrieve tweets data

In [9]:
# more API calls can be found in tweepy, https://docs.tweepy.org/en/latest/api.html

# get user timeline tweets
def get_user_timeline_tweets(user_handle, num_tweets):
    """Return up to num_tweets items from user_handle's timeline as a list.

    Tweets are requested through the module-level ``api`` client with
    tweet_mode 'extended'.
    """
    timeline = tweepy.Cursor(
        api.user_timeline,
        screen_name=user_handle,
        tweet_mode='extended',
    )
    return list(timeline.items(num_tweets))

# get followers
def get_followers(user_handle, num_tweets):
    """Return up to num_tweets followers of user_handle as a list.

    Despite its name, num_tweets caps the number of followers collected.
    """
    follower_cursor = tweepy.Cursor(api.get_followers, screen_name=user_handle)
    return list(follower_cursor.items(num_tweets))

In [10]:
# fetch up to 3 tweets from the @elonmusk timeline and report how many came back;
# user_handle is reused by a later cell
user_handle = 'elonmusk'
num_tweets = 3
tweets = get_user_timeline_tweets(user_handle, num_tweets)
print(len(tweets))

3


## Retrieve followers with rate-limit handling

In [13]:
# get followers
def get_followers(user_handle, num_tweets):
    """Return up to num_tweets followers of user_handle, stopping early on errors.

    Followers are fetched through the module-level ``api`` client. If the
    Twitter rate limit (or any other error) is hit, a message is printed and
    the followers collected so far are returned instead of raising.

    Despite its name, num_tweets caps the number of followers collected.
    """

    def limit_handled(cursor):
        """Yield items from cursor until it is exhausted or an error occurs.

        A dot is printed before each fetch as a progress indicator.
        On a rate-limit error the number of items yielded so far and the
        error message are printed; any other error is reported as unknown.
        In both cases iteration simply ends.
        """
        n = 0
        while True:
            print(".", end="")
            try:
                item = cursor.next()
            except StopIteration:
                # The cursor is exhausted: this is the normal end of the data,
                # not an error, so finish silently.
                return
            except tweepy.TooManyRequests as e:
                print(f"Reached rate limits after {n} iterations. Please wait 15 minutes for the next try.")
                print(f'Error message: {e}')
                return
            except Exception as e:
                print("Some unknown reason occured, HEEEEELP!")
                print(f'Error message: {e}')
                return
            # Yield outside the try block so only the fetch itself is guarded.
            yield item
            n += 1

    followers = []

    # NOTE(review): tweet_mode is kept from the original call; it does not look
    # like a documented get_followers parameter -- confirm whether it is needed.
    for follower in limit_handled(tweepy.Cursor(api.get_followers, screen_name = user_handle, tweet_mode = 'extended').items(num_tweets)):
        followers.append(follower)

    return followers

In [14]:
# try to fetch up to 300 followers of user_handle (set in an earlier cell)
# and report how many were actually retrieved
followers = get_followers(user_handle, 300)
print(len(followers))

.Reached rate limits after 0 iterations. Please wait 15 minutes for the next try.
Error message: 429 Too Many Requests
88 - Rate limit exceeded
0


## Retrieve a large number of user timeline tweets

In [15]:
# support retrieve large number of tweets
def get_user_timeline_tweets_advanced(user_handle, num_tweets):
    """
    Retrieve up to num_tweets recent tweets of user_handle.

    Tweets are fetched in one or more runs through the module-level ``api``
    client. Each run after the first continues below the id of the last tweet
    already collected, so no tweet is collected twice. Retrieval stops when
    num_tweets tweets were collected, when a run returns no new tweets
    (timeline exhausted), or when an error such as the rate limit occurs.
    Errors are printed, not raised; the tweets collected so far are returned.

    Print a short summary of the total number of tweets retrieved and time spent.

    Return a list of retrieved tweets (each tweet is a tweepy Status object)
    """

    tweets = []
    run_num = 0
    program_start = time.time()

    while len(tweets) < num_tweets:
        collected_before_run = len(tweets)
        cursor_kwargs = {'screen_name': user_handle, 'tweet_mode': 'extended'}
        if tweets:
            # Resume below the last collected tweet instead of restarting from
            # the newest one (assumes the timeline is returned newest first).
            cursor_kwargs['max_id'] = tweets[-1].id - 1

        try:
            remaining = num_tweets - collected_before_run
            for status in tweepy.Cursor(api.user_timeline, **cursor_kwargs).items(remaining):
                tweets.append(status)
        except tweepy.TooManyRequests as e:
            print(f"Reached rate limits after {len(tweets)} iterations. Please sleep 15 minutes for the next try.")
            print(f'Error message: {e}')
            break
        except Exception as e:
            print("Some unknown reason occured, HEEEEELP!")
            print(f'Error message: {e}')
            break
        run_num += 1

        if len(tweets) == collected_before_run:
            # Nothing new came back: the timeline has no more tweets to offer,
            # so looping again would never reach num_tweets.
            break

    # print a summary
    tweet_num = len(tweets)
    program_end = time.time()
    program_duration_run = round((program_end-program_start)/60, 2)
    print('-------------')
    print('Summary: {} tweets retrieved after {} runs, taking {} mins in total.'.format(tweet_num, run_num, program_duration_run))

    return tweets

In [16]:
# request up to 3000 tweets from the 'Oprah' timeline and report how many were retrieved
large_tweets = get_user_timeline_tweets_advanced('Oprah', 3000)
print(len(large_tweets))

-------------
Summary: 3000 tweets retrieved after 1 runs, taking 0.7 mins in total.
3000


## Save retrieved tweets data to a json file

In [17]:
# save retrieved tweets data as json object
def save_tweets_data_to_json_file(tweets, json_file_path):
    """Save retrieved tweets data to a json file and return the file path.

    tweets: iterable of tweets; each one is either an object exposing its raw
        data as ``_json`` (e.g. a tweepy Status) or an already-extracted dict.
    json_file_path: path of the file to write; an existing file is overwritten.

    The tweets are written as a single JSON list. Returns json_file_path.
    """
    # collect the raw JSON dictionary of every tweet
    tweets_list = [
        tweet if isinstance(tweet, dict) else tweet._json
        for tweet in tweets
    ]

    # write retrieved tweets to json file
    with open(json_file_path, 'w') as json_file:
        json.dump(tweets_list, json_file)

    return json_file_path
    

# read json file 
def read_json_file(json_file_path):
    """Load the JSON document stored at json_file_path and return it.

    The result is whatever the file contains once parsed; for files written
    by save_tweets_data_to_json_file this is a list of tweet dictionaries.
    """
    with open(json_file_path) as source:
        return json.loads(source.read())

In [18]:
# round trip: write the retrieved tweets to disk, read them back,
# and check that the number of tweets read matches
json_file_path = 'oprah_retrieved_tweets_data.json'
save_tweets_data_to_json_file(large_tweets, json_file_path)
json_tweets = read_json_file(json_file_path)
print(len(json_tweets))
# uncomment to inspect the first tweet read back from the file
# print(json_tweets[0])

3000


## Create a twitter dataframe 

In [19]:
def create_tweets_dataframe(tweets, key_list):
    """
    Create a dataframe from a given sequence of tweets, with key_list as columns.

    tweets: iterable of tweets; each one is either a json object (dictionary)
        or an object exposing its raw data as ``_json`` (e.g. a tweepy Status).
        An empty input produces an empty dataframe with the requested columns.
    key_list: keys to extract from every tweet, in column order.

    When 'created_at' is one of the requested keys, that column is converted
    to datetimes; otherwise no conversion is attempted.

    Raises KeyError if a tweet lacks one of the requested keys.
    """
    columns = list(key_list)

    rows = []
    for tweet in tweets:
        # dictionaries come from a tweets json file; anything else is treated
        # as a raw tweet object carrying its data in ``_json``
        data = tweet if isinstance(tweet, dict) else tweet._json
        rows.append([data[key] for key in columns])

    t_df = pd.DataFrame(rows, columns=columns)
    if 'created_at' in columns:
        t_df['created_at'] = pd.to_datetime(t_df['created_at'])

    return t_df

In [20]:
# build a dataframe from up to 20 tweets of the 'elonmusk' timeline,
# keeping only the columns named in key_list, and preview the first rows
tweets = get_user_timeline_tweets('elonmusk',20)
key_list =['created_at','full_text','retweet_count', 'favorite_count','lang']
tweets_df = create_tweets_dataframe(tweets, key_list)
tweets_df.head()

Unnamed: 0,created_at,full_text,retweet_count,favorite_count,lang
0,2022-03-31 18:07:52+00:00,"@DriveTeslaca Yes, but we take a very cautious...",104,1270,en
1,2022-03-31 10:16:14+00:00,@teslaownersSV @PPathole @SpaceX Will take of it,107,1861,en
2,2022-03-31 06:27:05+00:00,@teslaownersSV @SpaceX You may be in an area t...,315,6145,en
3,2022-03-30 23:16:54+00:00,@BillyM2k 🤣,712,24954,und
4,2022-03-30 08:37:09+00:00,@JohnnaCrider1 Ok,509,17208,und
