## Import Required Packages

In [None]:
import pandas as pd
import time

from selenium import webdriver
from bs4 import BeautifulSoup as bs
from datetime import datetime

---

## Scrape Twitter's #tsunami page (written for infinite-scroll pages):


In [None]:
# Scrape the fully-scrolled #tsunami hashtag page and parse it into `bs_data`.
#
# A Chrome session is opened, the page is scrolled until its height stops growing,
# the rendered HTML is captured, and the browser is then shut down so that no
# driver/browser process is left running (even if scrolling raises an error).

# JavaScript that scrolls to the bottom of the page and returns the page height.
SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;"

# Assign browser to be used; Chrome is used here.
browser = webdriver.Chrome()

try:
    # Enter the URL to be scraped.
    browser.get("https://twitter.com/hashtag/tsunami?lang=en")

    # Scroll to the bottom of the page, wait 1 second for the next batch of data
    # to load, then scroll again.  Stop once two consecutive scrolls report the
    # same page height, i.e. the page has stopped loading new data.
    lenOfPage = browser.execute_script(SCROLL_TO_BOTTOM_JS)
    while True:
        lastCount = lenOfPage
        time.sleep(1)
        lenOfPage = browser.execute_script(SCROLL_TO_BOTTOM_JS)
        if lastCount == lenOfPage:
            break

    # The page has been fully scrolled; capture the rendered HTML.
    source_data = browser.page_source
finally:
    # Always release the browser and its driver process.
    browser.quit()

# Parse the HTML with BeautifulSoup.  The parser is named explicitly so the result
# does not depend on which optional parsers happen to be installed (and so that
# BeautifulSoup does not warn about having to guess one).
bs_data = bs(source_data, "html.parser")

---

## Get Number of Tweets for reference:

In [None]:
# The text of every tweet sits in a <p> tag whose class attribute is
# 'TweetTextSize js-tweet-text tweet-text'.  Counting the matching tags tells us
# how many tweets were scraped; the count is displayed below for reference.

num_tweets = len(
    bs_data.find_all(
        'p',
        attrs={'class': 'TweetTextSize js-tweet-text tweet-text'},
    )
)
num_tweets

---

## Transform scraped data into a list of dictionaries:

In [None]:
# Build `master_list`: one dictionary per scraped tweet holding its handle, its
# cleaned text, and the date/time at which it was sent.

TWEET_TEXT_CLASS = 'TweetTextSize js-tweet-text tweet-text'
TIMESTAMP_CLASS = 'tweet-timestamp js-permalink js-nav js-tooltip'


def _parse_timestamp_title(title):
    """Split a timestamp tooltip such as '7:51 AM - 5 Oct 2018' into its parts.

    Parameters
    ----------
    title : str
        Tooltip text laid out as '<time> - <day> <month> <year>'.

    Returns
    -------
    dict
        Keys 'day', 'month', 'year' and 'time' (all strings, as they appear in
        the tooltip) plus 'datetime' (a ``datetime`` built from those parts).

    Raises
    ------
    ValueError
        If the text does not follow the expected layout.
    """
    time_part, date_part = title.split(' - ', 1)
    day, month, year = date_part.split()[:3]
    # The time is written like '7:51 AM'; strip the space so it matches '%I:%M%p'.
    stamp = ' '.join([month, day, year, time_part.replace(' ', '')])
    return {
        'day': day,
        'month': month,
        'year': year,
        'time': time_part,
        'datetime': datetime.strptime(stamp, '%b %d %Y %I:%M%p'),
    }


# Query the parsed page once, instead of once (or more) per tweet inside the loop.
tweets = bs_data.find_all('p', {'class': TWEET_TEXT_CLASS})
timestamps = bs_data.find_all('a', {'class': TIMESTAMP_CLASS})

# Tweets and timestamps are paired by position.  If the counts differ the pairing
# may be off, so say so rather than failing part-way through with an IndexError.
if len(tweets) != len(timestamps):
    print('Warning: found {} tweet texts but {} timestamps; pairing by position '
          'and ignoring the extras.'.format(len(tweets), len(timestamps)))

# Master list to which each tweet's dictionary is appended:
master_list = []

for tweet_tag, timestamp_tag in zip(tweets, timestamps):

    # Temporary dictionary where this tweet's info is stored:
    temp_dict = {}

    # The permalink looks like '/<handle>/status/<id>'; keep only the handle.
    temp_dict['handle'] = timestamp_tag['href'].split('/status')[0].lstrip('/')

    # Cleaned tweet text, with newlines flattened to spaces:
    temp_dict['tweet'] = str(tweet_tag.get_text()).replace("\n", " ")

    # When (date and time) the tweet was sent out.  The tooltip text is read from
    # 'title'; 'data-original-title' is accepted as a fallback because the previous
    # substring search for 'title="' would have matched that attribute as well.
    tooltip = timestamp_tag.get('title') or timestamp_tag.get('data-original-title')
    temp_dict.update(_parse_timestamp_title(tooltip))

    # Append the tweet's information to the master_list:
    master_list.append(temp_dict)

# Report the total once instead of printing every loop index:
print('Parsed {} tweets.'.format(len(master_list)))
    

# Turn 'master_list' into a Pandas DataFrame with its columns in a logical order.
#
# reindex() is used instead of df[[...]] because the scraped rows carry no
# 'if_tsunami' value, so selecting that column directly raises a KeyError.
# reindex() creates the column filled with NaN and keeps the CSV layout intact
# (it also works when 'master_list' is empty).
# NOTE(review): 'if_tsunami' is presumably a label to be filled in afterwards -- confirm.

column_order = ['handle', 'tweet', 'if_tsunami', 'datetime', 'day', 'month', 'year', 'time']

df = pd.DataFrame(master_list).reindex(columns=column_order)

# Save the Pandas DataFrame to a .csv file:

df.to_csv('Addtnl_train_tweets.csv', index=False, sep=",")