In [1]:
import tweepy
from twitter_api_xanda import TWITTER_API_KEY, TWITTER_API_SECRET, TWITTER_API_BEARER
import pandas as pd
import datetime

In [2]:
# Twitter handle (without the leading "@") of the company explored in this notebook.
company_handle = "yumbrands"

# Tweet fields requested from the API on every tweet lookup below.
FIELDS = [
    "created_at",
    "text",
    "public_metrics",
    "context_annotations",
    "entities",
    "referenced_tweets",
    "author_id",
]

In [3]:
# Bearer-token client shared by the API calls in this notebook.
client = tweepy.Client(bearer_token=TWITTER_API_BEARER)

# Resolve the company handle to a user, then fetch a sample of its timeline
# with the extra tweet fields so the parsers below have something to inspect.
user = client.get_user(username=company_handle)
tweets = client.get_users_tweets(
    id=user.data.id,
    tweet_fields=FIELDS,
)

In [4]:
def parse_referenced_tweets(tweet):
    """Summarise the tweets that ``tweet`` references.

    Args:
        tweet: object exposing ``referenced_tweets``, an iterable of items
            indexable by ``'id'`` and ``'type'`` (or a falsy value).

    Returns:
        A set of ``(id, type)`` tuples, or ``None`` when the tweet references
        nothing.
    """
    references = tweet.referenced_tweets
    if not references:
        return None

    return {(ref['id'], ref['type']) for ref in references}

def parse_entity_annotations(tweet):
    """Extract the entity annotations attached to ``tweet``.

    Args:
        tweet: object exposing ``entities``, either a mapping that may contain
            an ``'annotations'`` list or ``None`` when the tweet carries no
            entities at all.

    Returns:
        A set of ``(type, normalized_text)`` tuples, or ``None`` when the
        tweet has no ``'annotations'`` entry.
    """
    # `entities` is None (not an empty dict) for tweets without any entities;
    # a bare `in` test against None would raise TypeError.
    entities = tweet.entities or {}
    if "annotations" not in entities:
        return None

    return {
        (annotation['type'], annotation['normalized_text'])
        for annotation in entities['annotations']
    }

def parse_entity_hashtags(tweet):
    """Extract the hashtags used in ``tweet``.

    Args:
        tweet: object exposing ``entities``, either a mapping that may contain
            a ``'hashtags'`` list or ``None`` when the tweet carries no
            entities at all.

    Returns:
        A set of hashtag strings (each item's ``'tag'`` value), or ``None``
        when the tweet has no ``'hashtags'`` entry.
    """
    # `entities` is None (not an empty dict) for tweets without any entities;
    # a bare `in` test against None would raise TypeError.
    entities = tweet.entities or {}
    if "hashtags" not in entities:
        return None

    return {hashtag['tag'] for hashtag in entities['hashtags']}

def parse_context_annotations(tweet):
    """Flatten the context annotations of ``tweet``.

    Args:
        tweet: object exposing ``context_annotations``, an iterable of items
            with nested ``['domain']['id']``, ``['domain']['name']`` and
            ``['entity']['name']`` values.

    Returns:
        A set of ``(domain_id, domain_name, entity_name)`` tuples, or ``None``
        when there are no context annotations.
    """
    flattened = {
        (
            annotation['domain']['id'],
            annotation['domain']['name'],
            annotation['entity']['name'],
        )
        for annotation in tweet.context_annotations
    }

    # An empty set is falsy, so "no annotations" collapses to None.
    return flattened or None

In [5]:
# Scratch cell: take one sample tweet from the fetched page and pull it apart
# by hand. Every name assigned here is a module-level global.
tweet = tweets.data[2]

tweet_id, text = tweet.id, tweet.text
created_at = str(tweet.created_at)

# Engagement counters live in the public_metrics mapping.
metrics = tweet.public_metrics
like_count, reply_count, retweet_count = (
    metrics[key] for key in ("like_count", "reply_count", "retweet_count")
)

# Structured fields go through the parsers defined above.
referenced_tweets = parse_referenced_tweets(tweet)
context_annotations = parse_context_annotations(tweet)
hashtags = parse_entity_hashtags(tweet)
entities = parse_entity_annotations(tweet)

In [6]:
def parse_tweet_data(username, tweet, is_quoted_tweet = False):
    """Flatten one tweet into a row whose order matches ``COLUMNS``.

    Args:
        username: handle of the tweet's author. When falsy, the author is
            looked up through ``client.get_user`` using ``tweet.author_id``.
        tweet: tweet object exposing ``id``, ``created_at``, ``text``,
            ``public_metrics``, ``entities``, ``context_annotations``,
            ``referenced_tweets`` and ``author_id``.
        is_quoted_tweet: flag copied verbatim into the row.

    Returns:
        ``None`` when the tweet is a reply (replies are excluded), otherwise
        the list ``[username, is_quoted_tweet, tweet_id, created_at, text,
        hashtags, like_count, reply_count, retweet_count, referenced_tweets,
        context_annotations, entities]``.
    """
    referenced_tweets = parse_referenced_tweets(tweet)

    # Don't include replies. Checked first so a discarded tweet never costs
    # an author lookup against the API.
    if referenced_tweets and any(
        reference_type == "replied_to" for _, reference_type in referenced_tweets
    ):
        return None

    if not username:
        user = client.get_user(id=tweet.author_id)
        # The lookup yields no data when the author account is unavailable.
        username = user.data.username if user.data else None

    # Read the counters from *this* tweet; a notebook-level `metrics` variable
    # would describe whichever tweet was last inspected by hand.
    metrics = tweet.public_metrics

    return [
        username,
        is_quoted_tweet,
        tweet.id,
        str(tweet.created_at),
        tweet.text,
        parse_entity_hashtags(tweet),
        metrics["like_count"],
        metrics["reply_count"],
        metrics["retweet_count"],
        referenced_tweets,
        parse_context_annotations(tweet),
        parse_entity_annotations(tweet),
    ]

In [7]:
# Column names of the output DataFrame, in the same order as the row lists
# produced by parse_tweet_data.
COLUMNS = [
    "username",
    "is_quoted_tweet",
    "tweet_id",
    "created_at",
    "text",
    "hashtags",
    "like_count",
    "reply_count",
    "retweet_count",
    "referenced_tweets",
    "context_annotations",
    "entity_annotations",
]

# Number of tweets requested per timeline API call (passed as max_results).
TWEET_COUNT = 100

def get_tweets_for_user(username: str):
    """Collect the last 365 days of non-reply tweets for ``username``.

    The user's timeline is requested ``TWEET_COUNT`` tweets at a time and
    walked with the pagination token returned in each response's ``meta``.
    Every tweet goes through ``parse_tweet_data``, which drops replies. The
    tweets quoted by the kept tweets are then fetched one by one and appended
    with ``is_quoted_tweet=True``.

    Args:
        username: Twitter handle passed to ``client.get_user``.

    Returns:
        pandas.DataFrame with one row per kept tweet and the columns listed
        in ``COLUMNS``.

    Raises:
        ValueError: if the user lookup returns no data for ``username``.
    """
    user = client.get_user(username=username)
    if user.data is None:
        raise ValueError(f"Twitter user not found: {username!r}")

    end_time = datetime.datetime.now(datetime.timezone.utc).replace(microsecond=0)
    start_time = end_time - datetime.timedelta(days=365)

    request_kwargs = dict(
        id=user.data.id,
        tweet_fields=FIELDS,
        start_time=start_time.isoformat(),
        end_time=end_time.isoformat(),
        max_results=TWEET_COUNT,
    )

    referenced_idx = COLUMNS.index("referenced_tweets")
    rows = []
    quoted_tweet_ids = set()

    # Get tweets for username. Paging follows the API's next_token rather than
    # re-deriving end_time from the last kept row: that approach fails when a
    # page holds only replies (nothing kept, so the window never moves) and
    # can repeat or skip tweets that share a timestamp second.
    while True:
        page = client.get_users_tweets(**request_kwargs)

        for tweet in page.data or []:
            parsed_tweet = parse_tweet_data(username, tweet)
            if not parsed_tweet:
                continue
            rows.append(parsed_tweet)

            # If quote tweet, remember the ID of the quoted tweet.
            for referenced_id, referenced_type in parsed_tweet[referenced_idx] or ():
                if referenced_type == "quoted":
                    quoted_tweet_ids.add(referenced_id)

        next_token = (page.meta or {}).get("next_token")
        if not next_token:
            # No further pages between start_time and end_time.
            break
        request_kwargs["pagination_token"] = next_token

    # Get quoted tweets
    for quoted_id in quoted_tweet_ids:
        response = client.get_tweet(id=quoted_id, tweet_fields=FIELDS)
        if response.data is None:
            # The lookup returned no tweet (e.g. it is no longer available).
            continue
        parsed_tweet = parse_tweet_data(None, response.data, is_quoted_tweet=True)
        if parsed_tweet:
            rows.append(parsed_tweet)

    return pd.DataFrame(rows, columns=COLUMNS)

In [8]:
# Build the dataset for the Yum! Brands account and persist it as CSV
# alongside the notebook.
yum_tweets_df = get_tweets_for_user(username="yumbrands")
yum_tweets_df.to_csv(path_or_buf="yum_tweets.csv")