In [2]:
import json
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from requests_cache import CachedSession, FileCache

# Route every HTTP request through a caching session so that pages fetched
# once are answered from the cache on later runs.
http_backend = FileCache()
session = CachedSession(cache_name='http_cache', backend=http_backend)

##################################################################################################
###########  Uncomment and run this code block if you are running for the first time  ############
###                                                                                            ###
### This code block generates a urls.json file listing the URLs of the transcripts of Donald   ###
### Trump's speeches. If you already have a urls.json file, you don't need to run this block.  ###


# BASE_URL = "https://factba.se/transcript/{title}"

# QUERY_URL = "https://factba.se/json/json-transcript.php?" 
# def extract_urls():
#     urls = []
#     done = False
#     page = 1
#     while not done:
#         params = {
#             "q": None, 
#             "p": page, 
#             "f": "w"
#         }
#         res = session.get(QUERY_URL, params)
#         res_json = res.json()
#         if len(res_json["data"]) == 0:
#             done = True
#         else:
#             items = list(map(lambda x: BASE_URL.format(title=x["slug"]), res_json["data"]))
#             urls += items
#             page += 1
#     return urls


# with open("urls.json", "w") as f:
#     urls = extract_urls()
#     f.write(json.dumps(urls))
##################################################################################################
##################################################################################################
##################################################################################################

In [3]:
import json
# Load the transcript URLs stored in urls.json into `urls`.
with open("urls.json", "r") as url_file:
    urls = json.loads(url_file.read())

In [4]:
def get_speech_data(url, speaker_name="Donald Trump"):
    """Download one transcript page and collect everything said by one speaker.

    Parameters
    ----------
    url : str
        Transcript page URL. It must contain the prefix
        ``https://factba.se/transcript/``; the slug is whatever follows it.
    speaker_name : str, optional
        Only transcript rows whose speaker label equals this exact string
        are kept. Defaults to ``"Donald Trump"``.

    Returns
    -------
    dict
        ``slug``, ``title``, ``url`` and ``transcript``, where ``transcript``
        is the text of all kept rows concatenated in page order.

    Raises
    ------
    IndexError
        If ``url`` does not contain the expected prefix.
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no transcript title header.
    """
    slug = url.split("https://factba.se/transcript/")[1]
    res = session.get(url)
    # Fail loudly on error responses instead of parsing an error page as a transcript.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    title_tag = soup.find("h1", class_="topic-page-header transcript-header")
    if title_tag is None:
        # find() returns None when nothing matches; report which page is malformed.
        raise ValueError(f"No transcript title found at {url}")

    items = []
    for block in soup.find_all("div", class_="media topic-media-row mediahover"):
        speaker_tag = block.find("div", class_="speaker-label")
        speech_tag = block.find("div", class_="transcript-text-block")
        # A row missing its speaker label or its text cannot be attributed; skip it.
        if speaker_tag is None or speech_tag is None:
            continue
        if speaker_tag.text == speaker_name:
            items.append(speech_tag.text)

    # NOTE(review): rows are joined with no separator, as before -- confirm that
    # each row's text already ends with whitespace, otherwise words will fuse.
    return {
        "slug": slug,
        "title": title_tag.text,
        "url": url,
        "transcript": "".join(items),
    }

In [5]:
from joblib import Memory
# Memoize get_speech_data with joblib so that URLs already processed are not
# parsed again on later runs.
location = './cachedir'
memory = Memory(location=location, verbose=0)
get_speech_data = memory.cache(get_speech_data)

# One record per URL, in the same order as `urls`. No delay is inserted
# between requests (the optional time.sleep(0.2) per URL stays disabled).
dataset = [get_speech_data(url) for url in urls]

In [6]:
import pandas as pd 

# Build one row per scraped record; the dictionary keys become the columns.
df = pd.DataFrame(data=dataset)

In [11]:
# Preview the first rows to sanity-check the scraped columns.
df.head()

Unnamed: 0,slug,title,url,transcript
0,donald-trump-press-gaggle-marine-one-departure...,Press Gaggle: Donald Trump Speaks to the Press...,https://factba.se/transcript/donald-trump-pres...,It has been a great honor. The honor of a life...
1,donald-trump-remarks-final-joint-base-andrews-...,Remarks: Donald Trump at Joint Base Andrews Be...,https://factba.se/transcript/donald-trump-rema...,Thank you. Thank you very much. Thank you. [Au...
2,donald-trump-speech-farewell-address-january-1...,Speech: Donald Trump Delivers His Farewell Add...,https://factba.se/transcript/donald-trump-spee...,"My fellow Americans: Four years ago, we launch..."
3,donald-trump-vlog-non-violence-peaceful-transi...,Donald Trump Vlog: January 6th Insurrection an...,https://factba.se/transcript/donald-trump-vlog...,My fellow Americans. I want to speak to you to...
4,donald-trump-press-gaggle-air-force-one-depart...,Press Gaggle: Donald Trump Speaks to the Press...,https://factba.se/transcript/donald-trump-pres...,I think that big tech is doing a horrible thin...


In [12]:
# (rows, columns) of the assembled dataset.
df.shape

(3501, 4)

In [13]:
# Create the output folder first so the export does not fail on a fresh
# checkout where ./data does not exist yet.
output_path = Path("./data/trump-speech-data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
# The DataFrame index is still written as the first (unnamed) column.
df.to_csv(output_path)