In [1]:
import pandas as pd
import numpy as np
import json
import tweepy

In [2]:
import config

In [3]:
# training df of tweet ids and labels 
df = pd.read_csv('../all_combined/all_train.tsv', sep='\t')
df.shape

(53531, 2)

In [4]:
df_train = df.copy()

In [5]:
# to avoid scientific notation 
# df_train['string_ids'] = df['tweet_id'].astype('string')

In [6]:
# list of ids to collect text, author id and created at date
tweet_ids = list(df_train['tweet_id'].astype('string'))

## Tweepy Client

In [7]:
bearer = config.Bearer_token

In [8]:
client = tweepy.Client(bearer_token=bearer, wait_on_rate_limit=True)

## Get the Text

In [9]:
def retrieve_text(index_list):
    list_of_tweets = []
    try:
        tweets = client.get_tweets(ids = index_list, tweet_fields = ["author_id", "created_at"])
        for tweet in tweets.data:
            current_tweet = {
                'tweet_id': tweet.id,
                'text': tweet.text,
                'author_id': tweet.author_id,
                'created_at': tweet.created_at
            }
            current_df = pd.DataFrame([current_tweet])
            list_of_tweets.append(current_df)
        df = pd.concat(list_of_tweets)
        return df

    except Error as e:
        print("Error:", e)
        return False

In [10]:
def collect_tweet_dfs(index_list, start_twt):
    end_twt = start_twt + 100

    tweet_dfs = []

    working = True

    while working: 
        try:
            batch = index_list[start_twt: end_twt]
            df =  retrieve_text(batch)
            tweet_dfs.append(df)
            start_twt += 100
            end_twt += 100

        except Exception as e:
            working = False
            print(end_twt)
            print(e)
                 
    
    total_dfs = pd.concat(tweet_dfs)
    return total_dfs

In [11]:
# %%time
# df_text = collect_tweet_dfs(tweet_ids, 0)

In [12]:
# df_text.head()

In [13]:
# df_text.shape

In [14]:
# df_text.to_csv('up_to_9400.csv')

In [15]:
# %%time
# df_text_2 = collect_tweet_dfs(tweet_ids, 9400)

In [16]:
# last index number 39500
# df_text_2.to_csv('from_9400_to_39500.csv')

In [17]:
# df_text_2.shape

In [18]:
# %%time
# df_text_3 = collect_tweet_dfs(tweet_ids, 39500)

In [19]:
# df_text_3.shape

In [20]:
# df_text_3.to_csv('from_39500_to_53700.csv')

## Combine the text DataFrames

In [21]:
df1 = pd.read_csv('up_to_9400.csv')
df2 = pd.read_csv('from_9400_to_39500.csv')
df3 = pd.read_csv('from_39500_to_53700.csv')

In [22]:
df_all = pd.concat([df1, df2, df3])
df_all.shape

(37469, 5)

In [23]:
df_main = df_train.merge(df_all, on = 'tweet_id', how = 'outer')
df_main.shape

(53531, 6)

In [24]:
# full collection of labels and text
df_main = df_main.drop('Unnamed: 0', axis = 1)
df_main.head()

Unnamed: 0,tweet_id,class_label,text,author_id,created_at
0,721872405916856321,injured_or_dead_people,Powerful Ecuador quake kills at least 235: POR...,3334140000.0,2016-04-18 01:26:07+00:00
1,721920840800018432,rescue_volunteering_or_donation_effort,I'm at awe and saddened with the #EcuadorEarth...,25196050.0,2016-04-18 04:38:35+00:00
2,721851949344256000,sympathy_and_support,,,
3,721720535030304769,injured_or_dead_people,RT @noticias2000: Ecuador quake death toll has...,43356230.0,2016-04-17 15:22:39+00:00
4,721719197429035008,infrastructure_and_utility_damage,,,


In [25]:
df_main.to_csv('all_tweets_with_nan.csv')

In [26]:
# from merges of text df = 37469
# from original df = 53531

In [27]:
df_final = df_main.dropna()
df_final.shape

(37469, 5)

In [35]:
df_final.head()

Unnamed: 0,tweet_id,class_label,text,author_id,created_at
0,721872405916856321,injured_or_dead_people,Powerful Ecuador quake kills at least 235: POR...,3334140000.0,2016-04-18 01:26:07+00:00
1,721920840800018432,rescue_volunteering_or_donation_effort,I'm at awe and saddened with the #EcuadorEarth...,25196050.0,2016-04-18 04:38:35+00:00
3,721720535030304769,injured_or_dead_people,RT @noticias2000: Ecuador quake death toll has...,43356230.0,2016-04-17 15:22:39+00:00
6,722065419276787712,rescue_volunteering_or_donation_effort,RT @thejpc: Israeli relief org already in Japa...,292382600.0,2016-04-18 14:13:05+00:00
7,722204361682743296,rescue_volunteering_or_donation_effort,RT @SamaritansPurse: We are rushing aid to hel...,309963000.0,2016-04-18 23:25:12+00:00


In [28]:
df_final.to_csv('all_tweets_no_nan.csv')

In [34]:
# maximum and minimum of tweet text lengths
list_text = list(df_final['text'])
tweet_lengths = [ len(x) for x in list_text]
max(tweet_lengths), min(tweet_lengths)

(890, 27)