In [1]:
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import time
import random
import re
import openai
import os
import sqlite3

In [2]:
# Needed to pass Rev's server checks.

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

openai.api_key = os.getenv("OPENAI_API_KEY")

db = sqlite3.connect("../data/ConventionSpeeches.db")
cursor = db.cursor()

In [3]:
def get_links(soup) : 
    divs = soup.find_all('div', class_='fl-post-grid-post')

    # Iterate over each div and extract the link
    links = []
    for div in divs:
        meta_tag = div.find('meta', itemprop='mainEntityOfPage')
        if meta_tag and meta_tag.has_attr('itemid'):
            link = meta_tag['itemid']
            links.append(link)

    return links


def get_min_date(soup) :
    date_divs = soup.find_all('meta', itemprop='datePublished')
    dates = []

    for date_element in date_divs :
        if date_element.has_attr('content'):
            dates.append(date_element['content']) 

    return min(dates)


## Iterate over pages and get links

In [11]:
# Start with the first page
r = requests.get("https://www.rev.com/blog/transcripts/all-transcripts", headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
links = get_links(soup)

print(f"The current min date is {get_min_date(soup)}")

The current min date is 2024-09-19


In [13]:
links = []

for page in range(2,26) : 
    url = f"https://www.rev.com/blog/transcripts/all-transcripts/page/{page}"
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'html.parser')

    links.extend(get_links(soup))
    print(f"The current min date is {get_min_date(soup)}")

    time.sleep(2  + random.random())




The current min date is 2024-09-17
The current min date is 2024-09-16
The current min date is 2024-09-12
The current min date is 2024-09-10
The current min date is 2024-09-09
The current min date is 2024-09-03
The current min date is 2024-08-29
The current min date is 2024-08-27
The current min date is 2024-08-22
The current min date is 2024-08-22
The current min date is 2024-08-21
The current min date is 2024-08-20
The current min date is 2024-08-19
The current min date is 2024-08-14
The current min date is 2024-08-12
The current min date is 2024-08-07
The current min date is 2024-08-06
The current min date is 2024-07-23
The current min date is 2024-07-19
The current min date is 2024-07-18
The current min date is 2024-07-17
The current min date is 2024-07-17
The current min date is 2024-07-16
The current min date is 2024-07-15


Let's save the links at this point to be careful

In [14]:
with open('rev_conv_links.txt', 'w') as f:
    for link in links:
        f.write(f"{link}\n")

Things that still need to be done

1. Cut down links to just DNC/RNC
1. Write code to extract the speech
1. Do speech extraction
1. Store results

## Week 4 Continuation

In [4]:
with open('rev_conv_links.txt', 'r') as f:
  links = [line.strip() for line in f.readlines()]

pprint(links)

['https://www.rev.com/blog/transcripts/democrats-speak-to-press-on-budget-bill',
 'https://www.rev.com/blog/transcripts/republicans-speak-to-press-on-budget-bill',
 'https://www.rev.com/blog/transcripts/kamala-harris-speaks-with-nabj-reporters-in-philadelphia',
 'https://www.rev.com/blog/transcripts/pentagon-press-briefing-on-9-17-24',
 'https://www.rev.com/blog/transcripts/karine-jean-pierre-white-house-press-briefing-on-9-17-24',
 'https://www.rev.com/blog/transcripts/trump-holds-first-event-since-second-assasination-attempt',
 'https://www.rev.com/blog/transcripts/sean-combs-arrest-press-conference',
 'https://www.rev.com/blog/transcripts/biden-at-national-hbcu-week-conference',
 'https://www.rev.com/blog/transcripts/tiktok-hearing-on-potential-ban',
 'https://www.rev.com/blog/transcripts/desantis-gives-update-on-trump-assassination-attempt',
 'https://www.rev.com/blog/transcripts/state-department-press-briefing-on-9-16-24',
 'https://www.rev.com/blog/transcripts/titan-submersible-h

In [5]:
# Cut down to just RNC/DNC
conv_links = [link for link in links if "convention" in link or "dnc" in link or "rnc" in link]
non_conv_links = [link for link in links if link not in conv_links] # Used this to look through

In [6]:
time_pattern = re.compile(r'\((\d{2}:)?\d{2}:\d{2}\)\n')

def generate_file_name_from_url(url):
    candidate = url.split('/')[-1].replace('-', '_').replace('.html', '.txt')
    if not candidate.endswith('.txt'):
        candidate += '.txt'
    return candidate

def extract_speech(soup) : 
    for item in soup.find_all('div', class_='fl-callout-text') : 
        grafs = item.find_all('p')
        speech = [graf.text for graf in grafs]
        speech = [time_pattern.sub('',line) for line in speech] # Remove timestamps

    return '\n'.join(speech)

def get_party_from_link(url) : 
    if 'dnc' in url or "democratic" in url : 
        return 'Democratic'
    elif 'rnc' in url or "republican" in url : 
        return 'Republican'
    else : 
        return 'Other'

def get_speaker(page_text) : 
    system_prompt = "You are an expert in 2024 political speech. Please return just the person's name."
    user_prompt = "Who is speaking in this speech?\n\n"


    chat_response = openai.chat.completions.create(
            model="gpt-4o-mini", # did 4o and ran up an $18 bill in 5 minutes
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt + page_text}

            ],
            temperature=0,
            max_tokens=1024,
    )

    return chat_response.choices[0].message.content
    

In [7]:
for speech_link in conv_links:
    r = requests.get(speech_link, headers=headers)

    if r.status_code != 200:
        print(f"Failed to get {speech_link}")
        continue


    soup = BeautifulSoup(r.text, 'html.parser')

    year = 2024
    party = get_party_from_link(speech_link)
    day = None
    speaker = get_speaker(r.text)
    speaker_count = 1
    time = "00:00:00"

    text = extract_speech(soup)
    text_len = len(text.split())

    file = generate_file_name_from_url(speech_link)

    with open("/users/chandler/dropbox/research/convention-speeches/data/2024/" + file, 'w') as f:
        f.write(text)

    data_row = [
        year,
        party,
        day,
        speaker,
        speaker_count,
        time,
        text,
        text_len,
        file
    ]

    cursor.execute("INSERT INTO conventions VALUES (?,?,?,?,?,?,?,?,?)", data_row)
    
    db.commit()

