In [None]:
## Author: Dejun Xiang
## ID: 349329
## Project: Donald Trump analytics
## Supervisor: Prof. Richard O. Sinnott
## Twitter crawler --> Selenium + Chrome webdriver

import datetime
import random
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [None]:
#------------build the url we need to crawl-------------------------
def build_url(y1, m1, d1, y2, m2, d2, query="Donald Trump", lang="en"):
    """Build the Twitter search URL for one date window.

    With the default arguments the result is identical to the historical
    hard-coded URL (search phrase "Donald Trump", English results).

    Input:  y1, m1, d1 --> the since date (year, month, day)
            y2, m2, d2 --> the until date (year, month, day)
            query      --> search phrase placed before the date operators
            lang       --> value of the ``l`` (language) URL parameter
    Output: the URL as a string
    """
    # Assemble the raw search expression first, then percent-encode it in one
    # go (space -> %20, ':' -> %3A) instead of hand-writing the escapes.
    search = (query
              + " since:" + since_date(y1, m1, d1)
              + " until:" + until_date(y2, m2, d2))
    return ("https://twitter.com/search?l=" + quote(lang, safe="")
            + "&q=" + quote(search, safe="")
            + "&src=typd")

def _format_date(y, m, d):
    """Return "y-mm-dd", left-padding month and day with a zero to two characters."""
    return str(y) + "-" + str(m).zfill(2) + "-" + str(d).zfill(2)


def since_date(y, m, d):
    """Format the lower bound of a search window.

    Input:  y, m, d --> year, month, day (ints or strings)
    Output: string such as "2018-01-05"
    """
    return _format_date(y, m, d)


def until_date(y, m, d):
    """Format the upper bound of a search window.

    Input:  y, m, d --> year, month, day (ints or strings)
    Output: string such as "2018-01-06"
    """
    return _format_date(y, m, d)
#-------------------------------------------------------------------


def scroll_bottom(driver, max_scrolls=500, pause=3):
    """Scroll down until the page stops growing.

    Repeatedly jumps to the bottom of the document, waits, and re-reads
    ``document.body.scrollHeight``; stops as soon as two consecutive reads
    return the same height.

    Input:  driver      --> webdriver; only ``execute_script`` is used
            max_scrolls --> safety cap so a page that never stops growing
                            cannot trap the loop forever
            pause       --> seconds to wait after each scroll so the browser
                            can load more content
    Output: None
    """
    last_height = 0
    # One scroll beyond max_scrolls is allowed, matching the historical
    # ``count > 500`` guard that was checked before the counter was bumped.
    for _ in range(max_scrolls + 1):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # give the browser time to process the scroll
        time.sleep(pause)

        new_height = driver.execute_script("return document.body.scrollHeight")
        # an unchanged height means we are at the bottom
        if new_height == last_height:
            break
        last_height = new_height


In [None]:
# Crawl one search-result page per calendar day, 01/01/2018 to 13/05/2018
# inclusive. Real date arithmetic is used so that month lengths and the
# "until = next day" bound are always valid dates (no 2018-01-32 / 2018-04-31).
y = 2018
first_day = datetime.date(y, 1, 1)
last_day = datetime.date(y, 5, 13)
one_day = datetime.timedelta(days=1)

# Output files (paths unchanged).
# NOTE(review): the header row is written to 16.csv while every day's rows are
# appended to data1.csv, so data1.csv itself never receives a header line.
# Confirm this is intended before relying on data1.csv column names.
header_csv = r"C:\Users\Derek\Desktop\twitter\16.csv"
data_csv = r"C:\Users\Derek\Desktop\twitter\data1.csv"

# stays 1 until the first day has been written successfully; used so that the
# file with a header row is produced exactly once
count_write = 1

for offset in range((last_day - first_day).days + 1):
    day = first_day + offset * one_day
    next_day = day + one_day

    # Bound before the try so the cleanup below never hits an unbound name
    # when the browser itself fails to start.
    driver = None
    try:
        # initialize web driver
        options = webdriver.ChromeOptions()
        # in order to turn the time language to English
        options.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
        driver = webdriver.Chrome(chrome_options=options)
        url = build_url(day.year, day.month, day.day,
                        next_day.year, next_day.month, next_day.day)
        driver.get(url)
        scroll_bottom(driver)
        print("finish scroll -->" + since_date(day.year, day.month, day.day))

        # get all the tweets
        tweets = driver.find_elements_by_class_name("js-tweet-text-container")
        # get all id
        ids = driver.find_elements_by_class_name("FullNameGroup")
        # number of retweets
        retreets = driver.find_elements_by_xpath('//button[@class="ProfileTweet-actionButtonUndo js-actionButton js-actionRetweet"]//span[@class="ProfileTweet-actionCountForPresentation"]')
        # number of likes
        likes = driver.find_elements_by_xpath('//button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]//span[@class="ProfileTweet-actionCountForPresentation"]')
        # number of replies
        replys = driver.find_elements_by_xpath('//button[@class="ProfileTweet-actionButton js-actionButton js-actionReply"]//span[@class="ProfileTweet-actionCountForPresentation"]')
        # posting dates
        dates = driver.find_elements_by_xpath('//span[@data-long-form="true"]')

        # One row per tweet. If the element lists differ in length the column
        # assignment raises and the whole day is skipped by the handler below.
        df = pd.DataFrame(data=[name.text for name in ids], columns=["id"])
        df['tweet'] = np.array([tweet.text for tweet in tweets])
        df['likes'] = np.array([like.text for like in likes])
        df['replys'] = np.array([reply.get_attribute('textContent') for reply in replys])
        # non-numeric / empty counters become 0
        df['replys'] = pd.to_numeric(df['replys'], errors='coerce').fillna(0)
        df['retreets'] = np.array([retreet.get_attribute('textContent') for retreet in retreets])
        df['retreets'] = pd.to_numeric(df['retreets'], errors='coerce').fillna(0)
        df['dates'] = np.array([stamp.text for stamp in dates])

        if count_write == 1:
            df.to_csv(header_csv, index=False)

        df.to_csv(data_csv, mode='a', index=False, header=False)
        count_write += 1

        print(until_date(day.year, day.month, day.day) + " finished")
    except Exception as e:
        # do not stop execution, just log and continue with the next day
        print(e)
    finally:
        # quit() ends the whole browser session (not just the window), so a
        # fresh Chrome per day does not leave driver processes behind.
        if driver is not None:
            driver.quit()
            

In [None]:
def sampler(k, dataset, seed=None):
    """Reservoir sampler: draw k rows uniformly at random without replacement.

    The row positions are chosen first (single pass over ``range(len(dataset))``),
    then the rows are fetched with ``dataset.iloc``.

    Input:  k (int)             --> the number of samples needed
            dataset (dataframe) --> the dataframe sampled from
            seed (optional)     --> if given, a private ``random.Random(seed)``
                                    is used so the draw is reproducible; if
                                    None, the module-level ``random`` state is
                                    used
    Output: sampled dataframe, or an explanatory message string when k is not
            in the range 1..len(dataset)
    """
    num_rows = len(dataset)
    if k > num_rows or k <= 0:
        return "sampling size 'k' is not valid, try other 'k'"

    rng = random if seed is None else random.Random(seed)

    # the first k positions fill the reservoir
    reservoir = list(range(k))
    # position i then replaces a random slot with probability k / (i + 1)
    for i in range(k, num_rows):
        slot = rng.randint(0, i)  # inclusive on both ends
        if slot < k:
            reservoir[slot] = i
    return dataset.iloc[reservoir]