# Fetching Tweets for Training Data from Twitter API

## Twitter API Premium v1.1: Search Tweets 30-Day and Tweet Counts Endpoints

In [1]:
# Import dependencies
import tweepy
import pandas as pd
import json

In [2]:
# Import API client token
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Twitter API token read from the 'twit_token' environment variable;
# this is None when the variable is not set.
twit_token = os.environ.get('twit_token')

In [95]:
# Build a bearer-token auth handler from the token loaded from the environment.
auth = tweepy.OAuth2BearerHandler(twit_token)
# API client used for all requests below. wait_on_rate_limit=True is passed so
# that, per tweepy's documentation, the client waits for the rate limit to
# replenish instead of failing the request.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [447]:
# Code reproduced from Twitter developer API documentation [4]
def determine_tweet_type(tweet):
    """Classify a Tweet payload dict by how it was posted.

    Args:
        tweet: Tweet JSON as a dict. Must contain the keys
            "in_reply_to_status_id", "is_quote_status" and "text";
            "retweeted_status" is optional.

    Returns:
        One of "Reply Tweet", "Quote Tweet", "Retweet" or "Original Tweet".
        The checks run in that order, so a reply always wins.
    """
    # A reply indicator takes priority over every other signal.
    if tweet["in_reply_to_status_id"] is not None:
        return "Reply Tweet"

    flagged_as_quote = tweet["is_quote_status"] is True
    has_rt_prefix = tweet["text"].startswith("RT")

    # The quote flag is also set on a Retweet of a Quote Tweet, so the
    # "RT" prefix is used to rule that case out.
    if flagged_as_quote and not has_rt_prefix:
        return "Quote Tweet"

    # A Retweet needs both indicators: the prefix and the embedded original.
    if has_rt_prefix and tweet.get("retweeted_status") is not None:
        return "Retweet"

    return "Original Tweet"

# Module-level accumulators that parse_tweets() appends to.
rawTweets = []     # unmodified JSON payload of every processed Tweet
parsedTweets = []  # one flattened dict per processed Tweet

def _extract_text_and_entities(payload):
    """Return ``(full_text, hashtags, mentions)`` for one Tweet payload dict.

    The text and entities are read from ``payload["extended_tweet"]`` when
    that key is present, otherwise from the payload's own "text" and
    "entities" keys. Hashtags are prefixed with "#" and mentions with "@".
    An empty hashtag or mention list is returned as None.
    """
    if "extended_tweet" in payload:
        extended = payload["extended_tweet"]
        full_text = extended["full_text"]
        entities = extended["entities"]
    else:
        full_text = payload["text"]
        entities = payload["entities"]

    # `or None` collapses an empty list to None so that tweets without
    # hashtags/mentions are stored as missing values rather than [].
    hashtags = ["#" + tag["text"] for tag in entities["hashtags"]] or None
    mentions = ["@" + user["screen_name"] for user in entities["user_mentions"]] or None
    return full_text, hashtags, mentions


def parse_tweets(status):
    """Flatten Tweets into dicts and collect them in the module-level lists.

    For every item, the raw JSON is appended to ``rawTweets`` and a flattened
    dict (text, type, hashtags, mentions, geodata, engagement counts, link)
    is appended to ``parsedTweets``.

    The text, hashtags and mentions are taken from the embedded original for
    a Retweet ("retweeted_status"), from the quoted Tweet for a Quote Tweet
    ("quoted_status"), and from the Tweet itself otherwise.

    Args:
        status: Iterable of objects exposing the Tweet JSON as a dict in
            their ``_json`` attribute (as the tweepy Cursor items used in
            this file do).

    Returns:
        None. Results are delivered through ``rawTweets`` and ``parsedTweets``.

    Raises:
        KeyError: If a payload lacks one of the required Tweet fields.
    """
    for tweet in status:
        data = tweet._json
        # Keep the raw payload first so it is retained even if parsing fails.
        rawTweets.append(data)

        # Classify once and reuse the result below.
        tweet_type = determine_tweet_type(data)

        if tweet_type == 'Retweet':
            # determine_tweet_type only reports a Retweet when this is not None.
            source = data['retweeted_status']
        elif tweet_type == 'Quote Tweet':
            # The quote flag can be set without a "quoted_status" payload
            # (presumably when the quoted Tweet is unavailable); fall back to
            # the quoting Tweet itself instead of raising KeyError.
            source = data.get('quoted_status') or data
        else:
            source = data

        full_text, hashtags, mentions = _extract_text_and_entities(source)

        # The "coordinates" object is GeoJSON: its inner "coordinates" list
        # is ordered [longitude, latitude], not [latitude, longitude].
        coordinates = data["coordinates"]
        if coordinates is not None:
            coord_lng = coordinates["coordinates"][0]
            coord_lat = coordinates["coordinates"][1]
        else:
            coord_lat = None
            coord_lng = None

        # "place" is None when the Tweet is not tagged with a place.
        place = data["place"]
        has_place = place is not None

        mydict = {
            "tweet_id": data["id_str"],  # Tweet ID
            "date": data["created_at"],  # Timestamp of tweet creation
            "full_text": full_text,  # Full tweet text
            "tweet_type": tweet_type,  # Type of tweet
            "hashtags": hashtags,  # List of hashtags used in the tweet
            "mentions": mentions,  # List of mentions used in the tweet
            "user_location": data["user"]["location"],  # The user's listed location
            "geo": data["geo"],  # Geodata 'geo' attribute
            "lat_coordinates": coord_lat,  # Latitude from 'coordinates' attribute
            "lng_coordinates": coord_lng,  # Longitude from 'coordinates' attribute
            "place_type": place["place_type"] if has_place else None,  # 'place_type' from 'place'
            "place_name": place["name"] if has_place else None,  # 'name' from 'place'
            "place_full_name": place["full_name"] if has_place else None,  # 'full_name' from 'place'
            "country_code": place["country_code"] if has_place else None,  # 'country_code' from 'place'
            "country": place["country"] if has_place else None,  # 'country' from 'place'
            "reply_count": data["reply_count"],  # Number of times Tweet has been replied to
            "quote_count": data["quote_count"],  # Number of times Tweet has been quoted
            "likes_count": data["favorite_count"],  # Number of times Tweet has been liked
            "retweet_counts": data["retweet_count"],  # Number of times Tweet has been retweeted
            "hyperlink": "https://twitter.com/twitter/status/" + data["id_str"],  # Link to tweet
        }

        parsedTweets.append(mydict)  # Add Tweet to parsedTweets list

In [208]:
# Search query for gun-debate Tweets.
# The OR-ed hashtags/phrases are wrapped in parentheses because Twitter's
# query grammar combines space-separated (AND) operators before OR; without
# the grouping, the language/retweet/quote filters would only apply to the
# last phrase. Adjacent string literals are used instead of backslash
# continuations so no indentation whitespace ends up inside the query.
query = (
    '(#guncontrol OR #gunrights OR #gunsense OR #2a OR #gunviolence OR #nra '
    'OR #guns OR #shooting OR #firearms OR #gunsafety OR #uvalde '
    'OR "gun control" OR "2nd amendment" OR "gun violence" OR "right to bear" '
    'OR "nra" OR "gun safety" OR "gun rights" OR "good guy with a gun" '
    'OR "assault weapons" OR "ban guns" OR "ban assault" OR "school shootings" '
    'OR "ar-15" OR "mass Shootings") '
    'lang:en -is:retweet -is:quote'
)

# Rebind the accumulators to fresh lists so re-running this cell does not
# append to results from an earlier run.
rawTweets = []
parsedTweets = []

# Lazily iterate up to 100 Tweets from the 30-day search endpoint for the
# one-day window 2022-11-03 00:00 to 2022-11-04 00:00.
status = tweepy.Cursor(
    api.search_30_day,
    label='tweetsentiment',
    query=query,
    fromDate='202211030000',
    toDate='202211040000',
).items(100)

# Fills rawTweets and parsedTweets as a side effect.
parse_tweets(status)


# convert raw tweets json responses array into .json file
with open("../res/training_tweets_raw.json", "w") as training_tweets:
    json.dump(rawTweets, training_tweets)


# One row per parsed Tweet, one column per key of the flattened dicts.
tweet_df = pd.DataFrame(parsedTweets)

In [209]:
# Show pandas' summary of the parsed-tweets DataFrame (columns, dtypes, non-null counts).
tweet_df.info()

'𝐅𝐢𝐫𝐬𝐭 𝐍𝐚𝐭𝐢𝐨𝐧𝐬 𝐂𝐡𝐢𝐞𝐟 𝐓𝐞𝐫𝐫𝐲 𝐓𝐞𝐞𝐠𝐞𝐞 𝐞𝐱𝐩𝐫𝐞𝐬𝐬𝐞𝐬 𝐜𝐨𝐧𝐜𝐞𝐫𝐧𝐬 𝐰𝐢𝐭𝐡 𝐂-𝟐𝟏\nRegional Chief @Terry_Teegee of @BCAFN had concerns #C21 could restrict access to guns historically used by First Nations #cdnpoli #guncontrol\nhttps://t.co/vmSOcIUWSI'