In [None]:
import pandas as pd
import numpy as np

In [None]:
import json
import datetime
import random

In [None]:
import config
import request_history

In [None]:
# hurricane sandy: formed: 22 Oct 2012, dissipated: 2 Nov 2012
# date_ is the start of the first request window; the collection loop
# overwrites it with the end of each window it requests.
date_ = '2012-09-15T00:00:00.000Z'

In [None]:
# hurricane arthur: formed: 1 July 2014, dissipated: 9 July 2014
# date_ = '2014-06-01T00:00:00.000Z'

In [None]:
# hurricane florence: formed: 31 Aug 2018, dissipated: 18 Sept 2018
# date_ = '2018-08-01T00:00:00.000Z'

In [None]:
# hurricane barry: formed: 11 July 2019, dissipated: 19 July 2019
# date_ = '2019-06-15T00:00:00.000Z'

In [None]:
# hurricane Isaias: formed: 30 July 2020, dissipated: 5 Aug 2020
# date_ = '2020-07-01T00:00:00.000Z'

In [None]:
# hurricane nicholas: formed: 12 Sept 2021, dissipated: 20 Sept 2021
# date_ = '2021-08-15T00:00:00.000Z'

In [None]:
# Hurricane search terms adopted from the Stowe paper.
# Change the storm-specific names in the query to match the storm whose date
# range is active.
# NOTE(review): `hurricaneian` looks like a leftover storm-specific term, while
# the active date_ and output file target Sandy - confirm it is intended.
query_terms = "Category 3 OR Category 4 OR landfall OR eyewall OR cleanup OR debris OR garbage OR hurricane OR hurricaneian OR perfectstorm OR sandy OR stormporn OR superstorm OR Tropical Storm"

In [None]:
# Each call advances the request window by a fixed number of days (3 by default)
# plus a random number of hours. 500 tweets filled up in minutes, so a random
# hour selection is used to interject variability in the time selection.

def generate_dates(old_end, days=3):
    """
    updates the start and end dates to pull tweets from the Twitter API (Academic Permissions)
    arguments:
        - old_end: string formatted date ('%Y-%m-%dT%H:%M:%S.%fZ') that ended the
          previous window; it becomes the start of the next window
        - days: whole days to step forward before the random hour offset is
          added (default 3)

    return:
        - new_start: the previous end date, returned unchanged
        - new_end: `days` days plus a random 0-23 hours after old_end, written
          with the same pattern (strftime emits %f as six digits)

    raises:
        - ValueError: if old_end does not match the expected pattern
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    hour_increment = random.randint(0, 23)
    new_start = old_end

    old_date = datetime.datetime.strptime(old_end, timestamp_format)
    new_end_dtg = old_date + datetime.timedelta(days=days, hours=hour_increment)
    new_end = new_end_dtg.strftime(timestamp_format)

    return new_start, new_end

In [None]:
def next_batch(previous_end, query_terms=query_terms, generate_dates=generate_dates):
    """
    Request the next batch of tweets (500 at a time) for the window that
    starts at previous_end.

    The defaults for query_terms and generate_dates are bound when this cell
    runs; after editing either of them, re-run this cell or pass the new value
    explicitly.

    arguments:
        - previous_end: string formatted date that ended the previous window
        - query_terms: search query passed to request_history.main
        - generate_dates: callable that maps previous_end to (start, end)

    return:
        - end: end date of the window that was just requested
        - tweet_batch: the value returned by request_history.main for the window
    """
    time_now = datetime.datetime.now()
    # Zero-padded clock time, so five past two reads 14:05 rather than 14:5.
    print(f"Time of most recent request: {time_now:%H:%M}")

    start, end = generate_dates(previous_end)
    tweet_batch = request_history.main(query_terms, start, end)

    return end, tweet_batch

In [None]:
def create_tweets_df(raw_list):
    """
    Build a single DataFrame from a list of tweet records.

    arguments:
        - raw_list: iterable of dicts, one per tweet; records may carry
          different keys

    return:
        - DataFrame with one row per record and a 0..n-1 index; a key missing
          from a record is left as a missing value in that row. An empty
          raw_list gives an empty DataFrame.
    """
    # One constructor call replaces building a one-row frame per tweet and
    # concatenating them, which was slow and raised on an empty list.
    return pd.DataFrame(list(raw_list))

In [None]:
# Pull up to 25 consecutive windows; date_ is advanced to the end of each
# requested window, and the first failure stops the collection.
all_dfs = []
for _ in range(25):
    try:
        date_, tweets = next_batch(date_)
        df = create_tweets_df(tweets['data'])
    except Exception as err:
        # Report where the collection stopped, then stop requesting.
        print("Final date: ", date_)
        print("API errors: ", err)
        break
    else:
        all_dfs.append(df)

In [None]:
# Stack every collected batch into one frame with a fresh 0..n-1 index.
# NOTE(review): if no batch succeeded, all_dfs is empty and this concat will
# fail - confirm the loop above collected data before running.
final_df = pd.concat(all_dfs, ignore_index=True)
final_df.shape

In [None]:
# Show the window end that the collection loop last stored in date_.
date_

In [None]:
# Column names present in the combined frame, for comparison with `order`.
cols = [*final_df.columns]
cols

In [None]:
# Columns to keep, in the order they are selected from final_df for the export.
order = [
    'id',
    'text',
    'author_id',
    'created_at',
    'public_metrics',
    'reply_settings',
    'entities',
    'possibly_sensitive',
    'edit_history_tweet_ids',
    'source',
    'lang',
    'referenced_tweets',
    'conversation_id',
    'in_reply_to_user_id',
    'attachments'    
]

# Display the number of columns selected.
len(order)

In [None]:
# Keep only the export columns, in the order listed in `order`.
# A column exists in final_df only if at least one pulled tweet carried that
# key, so reindex (rather than final_df[order]) fills an absent column with
# NaN instead of raising KeyError; absent columns are reported first.
missing_cols = [col for col in order if col not in final_df.columns]
if missing_cols:
    print("Columns missing from pulled tweets: ", missing_cols)
df = final_df.reindex(columns=order)

In [None]:
# Write the selected columns to the CSV for the active storm, without the
# index column. Swap in the matching line below when date_ and query_terms
# target a different hurricane.
# NOTE(review): assumes the ./ADD_HUR_DATA/ directory already exists - confirm.
df.to_csv('./ADD_HUR_DATA/sandy_2012_3day_increment_v2.csv', index=False)
# df.to_csv('./ADD_HUR_DATA/arthur_2014_3day_increment_v2.csv', index=False)
# df.to_csv('./ADD_HUR_DATA/florence_2018_3day_increment.csv', index=False)
# df.to_csv('./ADD_HUR_DATA/barry_2019_3day_increment.csv', index=False)
# df.to_csv('./ADD_HUR_DATA/isaias_2020_3day_increment.csv', index=False)
# df.to_csv('./ADD_HUR_DATA/nicholas_2021_3day_increment.csv', index=False)

In [None]:
# Preview the first rows of the exported frame.
df.head()