# Combine the text of the SOTU addresses with some meta information

The dataset includes a file with meta information for every president and every speech. To facilitate processing, we want to have both the full texts and the meta information for each speech neatly bundled in a single dataframe.

This notebook reads in all the speeches and the meta information and combines them into a single dataframe, which is then written to a CSV file so that it can be reused by other notebooks.

In [1]:
from ast import literal_eval
from pathlib import Path

import pandas as pd

dataset_folder = Path('~/shared/RR-workshop-data/state-of-the-union-dataset').expanduser()

full_text_folder = dataset_folder / 'txt'
meta_folder = dataset_folder / 'meta'

# Collect the full texts. Each file stem is expected to look like
# '<president>_<year>', where <year> can be parsed as an integer.
speeches = []
last_names = []
years = []

# glob() yields files in an arbitrary, file-system dependent order; sorting
# the paths makes the iteration order (and the resulting dataframe) reproducible.
for file in sorted(full_text_folder.glob('*.txt')):
    # NOTE(review): read_text() falls back to the platform default encoding --
    # confirm the encoding of the dataset and pass it explicitly.
    speeches.append(file.read_text())
    # Split on the last underscore only, so that a name which itself contains
    # an underscore does not break the two-way unpacking.
    president, year = file.stem.rsplit('_', 1)
    last_names.append(president)
    years.append(int(year))

# If several speeches share a year the index is not unique. A stable sort
# keeps such rows in the (sorted) file order instead of an arbitrary one.
df = pd.DataFrame(
    index=years,
    data={'President': last_names, 'Text': speeches},
).sort_index(kind='mergesort')

# Read metadata
# The 'Party' column is parsed with literal_eval (presumably it holds list
# literals -- verify against presidents.csv).
presidents = pd.read_csv(meta_folder / 'presidents.csv', converters={"Party": literal_eval})

# Pull the four-digit year that follows ', ' out of the term dates.
# expand=False returns a Series rather than a one-column dataframe.
year_pattern = r',\s([0-9]{4})'
presidents['First Year'] = presidents['Term Start'].str.extract(year_pattern, expand=False).astype("int")
# Float rather than int: a term end that does not match the pattern (or is
# missing) becomes NaN, which an integer column cannot hold.
presidents['Last Year'] = presidents['Term End'].str.extract(year_pattern, expand=False).astype("float")

In [26]:
def handle_special_party_case(year, president):
    """ Pick a single party for a president that is listed with several parties

    Only a hand-picked set of last names is supported; for those the first
    entry of ``president['Party']`` is returned. ``year`` is accepted but
    not used.

    Raises NotImplementedError for every other last name.
    """
    manually_resolved = ('Adams', 'Tyler', 'Johnson')
    if president['Last Name'] not in manually_resolved:
        raise NotImplementedError("[handle_special_party_case] Unhandled special case!")
    return president['Party'][0]

def add_meta(row):
    """ Adds meta information to a row in the dataframe

    Looks up the president of the speech in the module-level ``presidents``
    dataframe and adds the entries 'First Name' and 'Party'.

    Parameters
    ----------
    row : pd.Series
        One row of the speech dataframe. ``row.name`` is used as the year of
        the speech and ``row['President']`` as the president's last name.

    Returns
    -------
    pd.Series
        A copy of ``row`` extended by 'First Name' and 'Party'.

    Raises
    ------
    KeyError
        If no entry in ``presidents`` matches the last name and year.
    ValueError
        If more than one entry matches the last name and year.
    NotImplementedError
        Raised by ``handle_special_party_case`` when a president has several
        parties and is not one of the manually handled cases.
    """
    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so all changes are made on a copy.
    row = row.copy()
    year = row.name
    last_name = row['President']

    candidates = presidents[presidents['Last Name'] == last_name]
    if len(candidates) > 1:
        # Several presidents share this last name: keep the one whose term
        # covers the year. A missing 'Last Year' (NaN) is treated as a term
        # that has not ended; a plain '>=' comparison would always be False.
        started = candidates['First Year'] <= year
        not_ended = candidates['Last Year'].isna() | (candidates['Last Year'] >= year)
        candidates = candidates[started & not_ended]

    if candidates.empty:
        raise KeyError(f"[add_meta] No president found for {last_name!r} in {year}")
    if len(candidates) > 1:
        raise ValueError(f"[add_meta] Ambiguous president for {last_name!r} in {year}")
    president = candidates.iloc[0]

    row['First Name'] = president['First Name(s)']

    # 'Party' is a sequence; more than one entry needs manual disambiguation
    party = president['Party']
    if len(party) > 1:
        party = handle_special_party_case(year, president)
    else:
        party = party[0]
    row['Party'] = party

    return row

# Enrich every speech with first name and party (row by row), then derive
# the full name as '<First Name> <President>'.
df = df.apply(add_meta, axis=1).assign(
    Name=lambda frame: frame['First Name'] + ' ' + frame['President']
)

array(['George Washington', 'John Adams', 'Thomas Jefferson',
       'James Madison', 'James Monroe', 'John Quincy Adams',
       'Andrew Jackson', 'Martin Van Buren', 'John Tyler',
       'James K. Polk', 'Zachary Taylor', 'Millard Fillmore',
       'Franklin Pierce', 'James Buchanan', 'Abraham Lincoln',
       'Andrew Johnson', 'Ulysses S. Grant', 'Rutherford B. Hayes',
       'Chester A. Arthur', 'Grover Cleveland', 'Benjamin Harrison',
       'William McKinley', 'Theodore Roosevelt', 'William Howard Taft',
       'Woodrow Wilson', 'Warren G. Harding', 'Calvin Coolidge',
       'Herbert Hoover', 'Franklin D. Roosevelt', 'Harry S. Truman',
       'Dwight D. Eisenhower', 'John F. Kennedy', 'Lyndon B. Johnson',
       'Richard Nixon', 'Gerald Ford', 'Jimmy Carter', 'Ronald Reagan',
       'George H. W. Bush', 'Bill Clinton', 'George W. Bush',
       'Barack Obama', 'Donald Trump'], dtype=object)

In [None]:
# Move the name and party columns to the front, in this order, renaming
# 'President' to 'Last Name' on the way. Each entry is (new label, old label).
leading_columns = [
    ('Name', 'Name'),
    ('First Name', 'First Name'),
    ('Last Name', 'President'),
    ('Party', 'Party'),
]
for position, (new_label, old_label) in enumerate(leading_columns):
    # pop() removes the column before insert() puts it back at its new position
    df.insert(position, new_label, df.pop(old_label))

In [5]:
# Make sure the output folder exists (no error if it is already there)
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

# Write the combined dataframe; the index is stored under the header 'Year'
output_file = 'data/sotu-extended.csv'
df.to_csv(output_file, index_label='Year')