In [1]:
import pandas as pd
import numpy as np
import json
import tweepy

In [2]:
import config

# Configure the initial hurricane datasets

In [3]:
# Hurricane Matthew (2016) training split, stored as a tab-separated file.
df1 = pd.read_csv(
    '../DisasterTweets/ALL_Huricane/hurricane_matthew_2016/hurricane_matthew_2016_train.tsv',
    sep='\t',
)
df1.shape

(1157, 3)

In [4]:
# Hurricane Irma (2017) training split, stored as a tab-separated file.
# `on_bad_lines='warn'` skips malformed rows and reports them; it replaces the
# deprecated `error_bad_lines=False` (removed in pandas 2.0), whose default
# companion `warn_bad_lines=True` had the same skip-and-warn behaviour.
df2 = pd.read_csv(
    '../DisasterTweets/ALL_Huricane/hurricane_irma_2017/hurricane_irma_2017_train.tsv',
    delimiter='\t',
    on_bad_lines='warn',
)
df2.shape



  df2 = pd.read_csv('../DisasterTweets/ALL_Huricane/hurricane_irma_2017/hurricane_irma_2017_train.tsv', delimiter='\t', error_bad_lines=False)


(6579, 3)

In [5]:
# Hurricane Maria (2017) training split, stored as a tab-separated file.
# `on_bad_lines='warn'` skips malformed rows and reports them; it replaces the
# deprecated `error_bad_lines=False` (removed in pandas 2.0), whose default
# companion `warn_bad_lines=True` had the same skip-and-warn behaviour.
df3 = pd.read_csv(
    '../DisasterTweets/ALL_Huricane/hurricane_maria_2017/hurricane_maria_2017_train.tsv',
    delimiter='\t',
    on_bad_lines='warn',
)
df3.shape



  df3 = pd.read_csv('../DisasterTweets/ALL_Huricane/hurricane_maria_2017/hurricane_maria_2017_train.tsv', delimiter='\t', error_bad_lines=False)


(5094, 3)

In [6]:
# Hurricane Harvey (2017) training split, stored as a tab-separated file.
# `on_bad_lines='warn'` skips malformed rows and reports them; it replaces the
# deprecated `error_bad_lines=False` (removed in pandas 2.0), whose default
# companion `warn_bad_lines=True` had the same skip-and-warn behaviour.
df4 = pd.read_csv(
    '../DisasterTweets/ALL_Huricane/hurricane_harvey_2017/hurricane_harvey_2017_train.tsv',
    delimiter='\t',
    on_bad_lines='warn',
)
df4.shape



  df4 = pd.read_csv('../DisasterTweets/ALL_Huricane/hurricane_harvey_2017/hurricane_harvey_2017_train.tsv', delimiter='\t', error_bad_lines=False)


(6378, 3)

In [7]:
# Stack the four per-hurricane frames row-wise and renumber the rows from 0.
df = pd.concat(
    objs=[df1, df2, df3, df4],
    axis=0,
    ignore_index=True,
)

In [8]:
# labels and ids
df_labels = df[['tweet_id', 'class_label']]
df_labels.shape

(19208, 2)

In [9]:
# Plain Python list of every tweet id, in the same order as df_labels.
index_list = [tweet_id for tweet_id in df_labels['tweet_id']]
len(index_list)

19208

# Load the additional Tweet features

In [10]:
# Twitter API client; every credential is read from the local `config` module
# so that no secret is written into the notebook itself.
client = tweepy.Client(
    bearer_token=config.Bearer_token,
    consumer_key=config.API_Key,
    consumer_secret=config.API_Secret,
    access_token=config.Access_Token,
    access_token_secret=config.Access_Token_Secret,
    wait_on_rate_limit=True,
)

In [11]:
# Tweet attributes requested from the API for each tweet id.
# "non_public_metrics", "organic_metrics" and "promoted_metrics" are kept
# disabled (they were commented out of the request list).
tweet_fields = [
    "author_id", "created_at", "conversation_id",
    "context_annotations", "edit_history_tweet_ids",
    "attachments", "entities", "in_reply_to_user_id",
    # "non_public_metrics", "organic_metrics",
    "possibly_sensitive",
    # "promoted_metrics",
    "public_metrics", "lang", "referenced_tweets",
    "reply_settings", "source",
]

In [12]:
# Optional tweet attributes copied into every record, in output-column order.
_OPTIONAL_TWEET_ATTRS = (
    'created_at', 'conversation_id', 'entities', 'context_annotations',
    'edit_history_tweet_ids', 'in_reply_to_user_id', 'attachments',
    'lang', 'possibly_sensitive', 'public_metrics', 'referenced_tweets',
    'reply_settings', 'source',
)


def _nan_if_missing(value):
    """Return NaN for None or an empty container/string; otherwise return value unchanged.

    Unlike `value or np.nan`, legitimate falsy scalars such as `False`
    (e.g. possibly_sensitive) and `0` are preserved.
    """
    if value is None:
        return np.nan
    if isinstance(value, (str, bytes, list, tuple, dict, set)) and not value:
        return np.nan
    return value


def retrieve_tweets(index_list):
    """Fetch one batch of tweets and return them as a DataFrame.

    Uses the notebook-level `client` and `tweet_fields`.

    Parameters
    ----------
    index_list : sequence of tweet ids
        Ids passed to `client.get_tweets`; must not be empty.

    Returns
    -------
    pandas.DataFrame or bool
        One row per tweet found in the response's `data`, with columns
        'tweet_id', 'text', 'author_id' followed by the optional attributes
        (NaN where an attribute is absent or empty). Rows are indexed 0..n-1.
        The frame is empty (columns only) when the response carries no
        tweets. `False` is returned, after printing the error, when the API
        call itself fails.

    Raises
    ------
    ValueError
        If `index_list` is empty.
    """
    # Reject an empty batch up front instead of sending a request that cannot succeed.
    if len(index_list) == 0:
        raise ValueError("index_list must contain at least one tweet id")

    # Only the remote call is guarded, so mistakes in the record-building code
    # below are not silently reported as API errors.
    try:
        tweets = client.get_tweets(ids=index_list, tweet_fields=tweet_fields)
    except Exception as e:
        print("Error:", e)
        return False

    records = []
    # `tweets.data` may be None when none of the requested tweets came back.
    for tweet in tweets.data or []:
        record = {
            'tweet_id': tweet.id,
            'text': tweet.text,
            'author_id': tweet.author_id,
        }
        for attr in _OPTIONAL_TWEET_ATTRS:
            record[attr] = _nan_if_missing(getattr(tweet, attr))
        records.append(record)

    # Build the frame once from all records; passing `columns` keeps the schema
    # stable even when there are no rows.
    columns = ['tweet_id', 'text', 'author_id', *_OPTIONAL_TWEET_ATTRS]
    return pd.DataFrame(records, columns=columns)

In [13]:
def collect_tweet_dfs(index_list, start_twt):
    """Retrieve tweets in batches of 100 ids and concatenate the results.

    Parameters
    ----------
    index_list : sequence of tweet ids
        All ids to look up.
    start_twt : int
        Position in `index_list` of the first id to request.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-batch frames returned by `retrieve_tweets`.
        An empty DataFrame when no batch was retrieved.

    Notes
    -----
    Collection stops at the first failing batch, whether `retrieve_tweets`
    raises or returns something that is not a DataFrame. The end position of
    that batch is printed and the batches gathered so far are returned.
    """
    batch_size = 100
    tweet_dfs = []

    # The range bounds the loop by the length of the list, so reaching the end
    # is a normal exit rather than something signalled by an exception.
    for batch_start in range(start_twt, len(index_list), batch_size):
        batch_end = batch_start + batch_size
        try:
            batch_df = retrieve_tweets(index_list[batch_start:batch_end])
        except Exception as e:
            print(batch_end)
            print(e)
            break

        # A non-DataFrame result (e.g. False) marks a failed batch; appending
        # it would make the final concat raise.
        if not isinstance(batch_df, pd.DataFrame):
            print(batch_end)
            break

        tweet_dfs.append(batch_df)

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not tweet_dfs:
        return pd.DataFrame()
    return pd.concat(tweet_dfs)

In [14]:
# %%time
# df1 = collect_tweet_dfs(index_list, 0)

In [15]:
# df2 = df1.merge(df_labels, on = 'tweet_id', how = 'inner')
# df2.shape

In [17]:
# df2.to_csv('all_hurricanes_tweets_train.csv')