In [1]:
import pandas as pd
import numpy as np

In [2]:
import tweepy
import json
import datetime
import random

In [3]:
import config
import request_history

In [4]:
# Assumed start date of concern for Hurricane Ian: 1 Sept 2022 (planning factor).
# Start of the first request window, written in the '%Y-%m-%dT%H:%M:%S.%fZ'
# layout that generate_dates parses.
date_ = '2022-09-01T00:00:00.000Z'

# Last reporting of casualties from Hurricane Ian: 1 Nov 2022


In [5]:
# Hurricane search terms adopted from the Stowe paper.
# NOTE(review): multi-word terms ("Category 3", "Tropical Storm") are unquoted; if
# request_history passes this string straight to the Twitter search endpoint they are
# presumably matched as separate keywords rather than exact phrases -- confirm.
query_terms = "Category 3 OR Category 4 OR landfall OR eyewall OR cleanup OR debris OR garbage OR hurricane OR hurricaneian OR perfectstorm OR Ian OR stormporn OR superstorm OR Tropical Storm"

In [6]:
# The request window advances by a fixed number of days (3 by default) plus a
# random number of hours. A batch of 500 tweets filled up within minutes, so a
# random hour offset was used to add variability to the time selection.

def generate_dates(old_end, days=3):
    """
    Compute the next start/end window for pulling tweets from the Twitter API
    (Academic Permissions).

    arguments:
        - old_end: end of the previous window, as a string in the
          '%Y-%m-%dT%H:%M:%S.%fZ' layout (e.g. '2022-09-01T00:00:00.000Z')
        - days: whole days to advance the window by (default 3)

    return:
        - new_start: the previous end date, returned unchanged
        - new_end: `days` days after old_end plus a random 0-23 whole hours,
          formatted with the same layout (the fractional part is always
          written as 6 digits, e.g. '...T21:00:00.000000Z')

    raises:
        - ValueError: if old_end does not match the expected layout
    """
    date_format = '%Y-%m-%dT%H:%M:%S.%fZ'

    # randint is inclusive on both ends, so the offset is 0..23 whole hours
    hour_increment = random.randint(0, 23)
    new_start = old_end

    old_date = datetime.datetime.strptime(old_end, date_format)
    new_end_dtg = old_date + datetime.timedelta(days=days, hours=hour_increment)
    new_end = new_end_dtg.strftime(date_format)

    return new_start, new_end

In [7]:
def next_batch(previous_end, query_terms=query_terms, generate_dates=generate_dates):
    """
    Request the next batch of tweets (500 at a time) for the window that
    follows previous_end.

    arguments:
        - previous_end: end timestamp string of the previous window; it becomes
          the start of this request
        - query_terms: search query string handed to request_history.main
        - generate_dates: callable mapping previous_end -> (start, end) strings

    return:
        - end: end timestamp of the window just requested (pass it back in as
          previous_end for the following call)
        - tweet_batch: whatever request_history.main returns for the window
          (the collection loop reads its 'data' entry)
    """
    time_now = datetime.datetime.now()
    # Zero-padded HH:MM so that e.g. 22:05 is not rendered as "22:5"
    print(f"Time of most recent request: {time_now:%H:%M}")

    start, end = generate_dates(previous_end)
    tweet_batch = request_history.main(query_terms, start, end)

    return end, tweet_batch

In [8]:
def create_tweets_df(raw_list):
    """
    Build one DataFrame from a batch of tweet records.

    arguments:
        - raw_list: iterable of dict-like records, one per tweet

    return:
        - df: DataFrame with one row per record and a fresh 0..n-1 index; the
          columns are the union of the records' keys, and a key missing from a
          record is left as NaN in that row. An empty batch yields an empty
          DataFrame.
    """
    # Constructing the frame in a single call avoids building and concatenating
    # a one-row DataFrame per tweet, and does not raise on an empty batch.
    return pd.DataFrame(list(raw_list))

In [9]:
# Pull up to 18 consecutive windows and keep one DataFrame per window.
# `date_` is overwritten with the end of each requested window, so it acts as
# the cursor for the next request; the loop stops at the first failure.
all_dfs = []
for _ in range(18):
    try:
        date_, tweets = next_batch(date_)
        df = create_tweets_df(tweets['data'])
    except Exception as err:
        # Report where the pull stopped and why, then give up on further windows
        print("Final date: ", date_)
        print("API errors: ", err)
        break
    else:
        all_dfs.append(df)

Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200
Time of most recent request: 22:51
200


In [11]:
# Stack the per-window frames into one DataFrame with a fresh 0..n-1 index,
# then show (rows, columns) as a quick check on the size of the pull.
final_df = pd.concat(all_dfs, ignore_index=True)
final_df.shape

(8654, 16)

In [12]:
# Final value of date_ after the collection loop: the end timestamp of the
# last window that next_batch returned.
date_

'2022-10-30T21:00:00.000000Z'

In [17]:
# Column names present in the combined pull, shown for inspection.
cols = list(final_df.columns)
cols

['public_metrics',
 'text',
 'reply_settings',
 'entities',
 'possibly_sensitive',
 'edit_history_tweet_ids',
 'source',
 'lang',
 'referenced_tweets',
 'conversation_id',
 'created_at',
 'author_id',
 'id',
 'in_reply_to_user_id',
 'attachments',
 'withheld']

In [16]:
# Export column order: the identifying fields (id, text, author_id, created_at)
# come first, followed by the remaining fields of the pull.
order = [
    'id',
    'text',
    'author_id',
    'created_at',
    'public_metrics',
    'reply_settings',
    'entities',
    'possibly_sensitive',
    'edit_history_tweet_ids',
    'source',
    'lang',
    'referenced_tweets',
    'conversation_id',
    'in_reply_to_user_id',
    'attachments',
    'withheld'
]

# Should equal the column count of final_df so that no field is dropped on export
len(order)

16

In [18]:
# Reorder the columns for export. reindex is used instead of final_df[order] so
# the export schema stays stable: a listed field that no tweet in this pull
# carried becomes an all-NaN column instead of raising KeyError.
df = final_df.reindex(columns=order)

In [21]:
# Write the reordered pull to disk; index=False leaves the positional index out of the file.
# NOTE(review): columns such as public_metrics / entities presumably hold nested
# dicts or lists, which to_csv writes as their string form -- confirm that
# downstream readers parse them accordingly.
df.to_csv('hurricane_ian_3day_increment.csv', index=False)

In [23]:
# df.head()