# Exploratory Data Analysis of the State of the Union dataset

In [41]:
""" Import the data into dataframe """
from ast import literal_eval
from pathlib import Path

import pandas as pd
pd.set_option('display.max_rows', 500)  # We want to see the whole dataframe

dataset_folder = Path('../state-of-the-union-dataset')
full_text_folder = dataset_folder / 'txt'
meta_folder = dataset_folder / 'meta'

speeches = []
presidents = []
years = []

for file in full_text_folder.glob('*.txt'):
    speeches.append(file.read_text())
    president, year = file.stem.split('_')
    presidents.append(president)
    years.append(int(year))

df = pd.DataFrame(index=years, data={'President': presidents, 'Text': speeches}).sort_index()

# Read metadata
presidents = pd.read_csv(meta_folder / 'presidents.csv', converters={"Party": literal_eval})
presidents['First Year'] = presidents['Term Start'].str.extract(r',\s([0-9]{4})').astype("int")
presidents['Last Year'] = presidents['Term End'].str.extract(r',\s([0-9]{4})').astype("float")
speeches_meta = pd.read_csv(meta_folder / 'speeches-meta.csv')

In [42]:
def handle_special_party_case(year, president):
    """ Manually handle some cases where the party affiliation is ambiguous """
    party = None
    if president['Last Name'] in ['Adams', 'Tyler', 'Johnson']:
        party = president['Party'][0]
    else:
        raise NotImplementedError("[handle_special_party_case] Unhandled special case!")   
    return party

def add_meta(row):
    """ Adds meta information to a row in the dataframe """
    year = row.name
    last_name = row['President']
    president = presidents[(presidents['First Year'] <= year) & (presidents['Last Year'] > year)].squeeze()
    first_name = president['First Name(s)']
    row['First Name'] = first_name
    party = president['Party']
    if len(party) > 1:
        party = handle_special_party_case(year, president)
    else:
        party = party[0]
        
    row['Party'] = party

    return row

df = df.apply(add_meta, axis='columns')
df.insert(0, 'First Name', df.pop('First Name'))
df.insert(1, 'Last Name', df.pop('President'))
df.insert(2, 'Party', df.pop('Party'))


In [59]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('stopwords')
stopwords_ = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/f006pfk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
def tokenize(text: str):
    """ Tokenize string first into sentences and then into words """


df['Tokens'] = df['Text'].apply(tokenize)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/f006pfk/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.10/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [None]:
df['Cleaned Tokens'] = df['Tokens'].apply(lambda tokens: [w for w in tokens if not w.lower() in stopwords_])

In [56]:
df

Unnamed: 0,First Name,Last Name,Party,Text,Tokens
1790,George,Washington,Unaffiliated,"Fellow Citizens of the Senate, and House of Re...","Fellow Citizens of the Senate, and House of Re..."
1791,George,Washington,Unaffiliated,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...
1792,George,Washington,Unaffiliated,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...
1793,George,Washington,Unaffiliated,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...
1794,George,Washington,Unaffiliated,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...
1795,George,Washington,Unaffiliated,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...
1796,George,Washington,Unaffiliated,Fellow-Citizens of the Senate and House of Rep...,Fellow-Citizens of the Senate and House of Rep...
1797,John,Adams,Federalist,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...
1798,John,Adams,Federalist,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...
1799,John,Adams,Federalist,Gentlemen of the Senate and Gentlemen of the H...,Gentlemen of the Senate and Gentlemen of the H...
