# Text Mining - State of the Union Addresses

In this notebook we will take a look at the text from recent US State of the Union addresses to demonstrate some of our text mining functions.

We will use the Python library `BeautifulSoup`, a powerful tool for scraping text from web sites.

[This web site](https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union) contains links to the text from all SOTU addresses back to George Washington!  For our analysis we will focus on the addresses from the past 50 years.


In [None]:
# Install the third-party packages imported by this notebook
# (bs4, nltk, afinn, spacy). Each line is a shell command run via `!`.
!pip install beautifulsoup4
!pip install nltk
!pip install afinn
!pip install spacy
# Download the small English spaCy pipeline that is loaded later with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import numpy as np
import requests
import spacy
from bs4 import BeautifulSoup

# Import State of the Union text

This data comes from transcripts of all SOTU speeches going back to George Washington.

In [None]:
## Import State of the Union text

# URL of the page containing the table of speeches
url = "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/annual-messages-congress-the-state-the-union"

# Step 1: Fetch the webpage.
# requests has no default timeout, so set one to avoid hanging forever on a
# stalled connection; raise_for_status() fails loudly on an HTTP error instead
# of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
webpage_content = response.content

# Step 2: Parse the webpage content
soup = BeautifulSoup(webpage_content, "html.parser")

# Step 3: Find the table containing speeches (the first <table> on the page)
table = soup.find("table")
if table is None:
    # Without this check the next step fails with an opaque AttributeError.
    raise RuntimeError(f"No <table> element found at {url}; the page layout may have changed.")

# Step 4: Extract all speech links from the table.
# The href values are used exactly as they appear in the page, in table order.
speech_links = [a['href'] for a in table.find_all("a", href=True)]

# Function to extract speech text from each link
def extract_speech_text(speech_url, timeout=30):
    """Download one speech page and return the text of its paragraphs.

    Parameters
    ----------
    speech_url : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The text of every <p> inside the page's ``div.field-docs-content``,
        each followed by a newline. An empty string is returned (after a
        message is printed) when the page cannot be fetched or parsed, or
        when it has no such div.
    """
    try:
        # A timeout keeps one stalled request from blocking the whole scrape.
        speech_response = requests.get(speech_url, timeout=timeout)
        speech_response.raise_for_status()
        speech_soup = BeautifulSoup(speech_response.content, "html.parser")
        speech_div = speech_soup.find("div", class_="field-docs-content")

        if speech_div is None:
            # Report the real cause instead of a generic AttributeError.
            print(f"Error fetching speech from {speech_url}: no 'field-docs-content' section found")
            return ""

        # Combine all paragraphs to form the full speech
        return "".join(p.get_text() + "\n" for p in speech_div.find_all("p"))
    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip this link.
        print(f"Error fetching speech from {speech_url}: {e}")
        return ""

# The table should hold 245 speeches, but it also lists radio interviews,
# which are picked up as links as well.

num_recent_speeches = 48 # This goes back to 1977 - Jimmy Carter

# Step 5: Download the text behind each of the most recent links.
# NOTE(review): links that return no text are skipped, so total_text can be
# shorter than num_recent_speeches. Later cells index it with row positions
# from SOTU_index -- confirm the two stay aligned.
total_text = []
count = 1
for link in speech_links[:num_recent_speeches]:
    speech_text = extract_speech_text(link)
    if not speech_text:
        continue
    # Progress counter: number of speeches successfully downloaded so far
    print(count)
    count += 1
    total_text.append(speech_text)

Want to know which speech is from which President?  [Load this file](https://drive.google.com/uc?download&id=1tavRJ1Y3gOwRpaCAZrcAtztAdZzy6qJM).  

In [None]:
# file upload
# Opens the Google Colab upload widget (this import only exists on Colab).
# Select the SOTU_index.csv file downloaded from the link above; the next cell
# reads it by that name.
from google.colab import files
uploaded = files.upload()


In [None]:

# Read the uploaded index file and display its Year column.
# Presumably one row per scraped speech, in the same order as total_text --
# TODO confirm against the CSV.
SOTU_index = pd.read_csv("SOTU_index.csv")
SOTU_index['Year']

# Text cleaning

This code cleans a list of text by removing punctuation (using regexp), converting to lowercase (`.lower()`), tokenizing, removing stopwords, and lemmatizing each word to its base form for more consistent and meaningful text processing.

In [None]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# Download the NLTK resources used in this notebook: WordNet data for the
# lemmatizer, the punkt tokenizer models, and the stopword lists.
for resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Lemmatizer used to reduce each word to its base form
lemmatizer = WordNetLemmatizer()

# English stopwords, stored as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text_list):
    """Normalize each text for bag-of-words style analysis.

    Every text is lowercased, stripped of punctuation and digits, tokenized,
    filtered against ``stop_words`` and lemmatized with ``lemmatizer``.

    Parameters
    ----------
    text_list : list of str
        The raw texts to clean.

    Returns
    -------
    list of str
        One cleaned, space-separated string per input text, in the same order.
    """
    cleaned_texts = []
    for text in text_list:
        # Replace punctuation and digits with a space rather than deleting
        # them. Deleting glues neighbouring words together ("well-being" ->
        # "wellbeing", "freedom—and" -> "freedomand") and turns contractions
        # into non-words ("can't" -> "cant") that slip past the stopword list.
        text = re.sub(r'[^\w\s]|\d', ' ', text.lower())

        # Tokenize the text - split into an array of words.
        words = word_tokenize(text)

        # Remove stopwords and lemmatize each remaining word
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        # Join the words back into a single string
        cleaned_texts.append(' '.join(words))

    return cleaned_texts


# Clean every scraped speech; cleaned_total_text[i] corresponds to total_text[i].
cleaned_total_text = clean_text(total_text)


In [None]:
# Pick a year and print the cleaned speech from that year.

# Change this to any year that appears in SOTU_index['Year'].
# (The original `????` placeholder was a syntax error, so the cell could not run.)
year = 2002

# Index label of the first row for that year. It is also used as the position
# of the speech in cleaned_total_text, which relies on SOTU_index keeping the
# default 0..n-1 index that read_csv assigns.
idx = SOTU_index[SOTU_index['Year'] == year].index[0]

# idx is an index label, so select the row with .loc rather than .iloc
print(SOTU_index.loc[[idx]])
cleaned_total_text[idx]



# Basic Analysis

The code analyzes each speech in `total_text` by calculating its length, word count, and average syllables per word, then stores these metrics in a DataFrame for easy visualization and comparison.

In [None]:
# Download the CMU Pronouncing Dictionary, used below to count syllables.
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


True

In [None]:
import pandas as pd
from nltk.corpus import cmudict
from nltk import word_tokenize

# CMU Pronouncing Dictionary: maps a lowercase word to a list of
# pronunciations, each of which is a list of phoneme strings.
d = cmudict.dict()

def syllable_count(word):
    """Return the number of syllables in ``word``.

    Counts the phonemes that end in a digit in the word's first listed
    pronunciation. Words missing from the dictionary count as 1 syllable.
    """
    key = word.lower()
    if key not in d:
        # Unknown word: fall back to a default of one syllable
        return 1
    first_pronunciation = d[key][0]
    return sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())

# One entry per speech, in the same order as total_text
speech_lengths = []
word_counts = []
average_syllables_per_word = []

# Perform analysis on each speech in total_text
for speech in total_text:
    # Length in characters
    length = len(speech)
    speech_lengths.append(length)

    # Word count. word_tokenize returns punctuation marks as separate tokens,
    # so keep only tokens containing at least one letter or digit. Otherwise
    # every comma and period is counted as a one-syllable "word".
    words = [token for token in word_tokenize(speech)
             if any(ch.isalnum() for ch in token)]
    word_count = len(words)
    word_counts.append(word_count)

    # Average syllables per word (0 for a speech with no words)
    syllables = [syllable_count(word) for word in words]
    avg_syllables = sum(syllables) / word_count if word_count else 0
    average_syllables_per_word.append(avg_syllables)

# Collect the per-speech metrics in a table, attach the year of each speech
# (matched to SOTU_index by row label) and order the rows chronologically,
# since the scraped speeches are not in year order.
# sort_values keeps the original row labels, so a row's label is still the
# position of that speech in total_text.
speech_numbers = range(1, len(total_text) + 1)
df = (
    pd.DataFrame({
        'Speech Number': speech_numbers,
        'Length (Characters)': speech_lengths,
        'Word Count': word_counts,
        'Average Syllables per Word': average_syllables_per_word,
    })
    .assign(Year=SOTU_index['Year'])
    .sort_values(by='Year')
)
df.head()

In [None]:
# prompt: make a plot of the Word Count by time

import matplotlib.pyplot as plt

# df must already hold the 'Year' and 'Word Count' columns and be sorted by
# Year, so the line is drawn in chronological order.

plt.figure(figsize=(12, 6))
plt.plot(df['Year'], df['Word Count'])
# The x-axis shows the year of each speech, not its speech number
plt.xlabel("Year")
plt.ylabel("Word Count")
plt.title("Word Count by Speech Year")
divisible_by_4_years = df['Year'][df['Year'] % 4 == 0].unique()

# Add vertical lines for each election year (years divisible by 4)
for year in divisible_by_4_years:
    plt.axvline(x=year, color='red', linestyle='--', linewidth=0.8)

plt.show()

# Finding themes in the SOTU speeches

## Using TFIDF for most "important" words

We will create the TF-IDF matrix of the SOTU to try and bring out the most relevant words from each speech.

In [None]:
# create a TF/IDF matrix for the speeches

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
# max_df=0.7 drops terms that appear in more than 70% of the speeches (words
#   like "state", "government", "tax" that show up almost everywhere);
# min_df=2 drops terms that appear in fewer than 2 speeches;
# ngram_range=(1, 2) uses single words and two-word phrases as terms.
# The keyword is `ngram_range`; `ngram` is not a TfidfVectorizer parameter and
# raises a TypeError.
vectorizer = TfidfVectorizer(max_df=0.7, min_df=2, ngram_range=(1, 2)) # these values are important!

# Fit the vectorizer to the cleaned text data
tfidf_matrix = vectorizer.fit_transform(cleaned_total_text)

# Get feature names (the words / two-word phrases behind each column)
feature_names = vectorizer.get_feature_names_out()

# Print the shape of the TF-IDF matrix: (number of speeches, number of terms)
print(tfidf_matrix.shape)

# You can access the TF-IDF values for each document and term
# For example, to get the TF-IDF values for the first document:
#print(tfidf_matrix[0])


In [None]:
# prompt: find the actual words that have the highest TF-IDF value for a given speech

year = 2002

# Row label of that year's speech in df. The label is used below as the row
# position in tfidf_matrix, which works because df was sorted without
# resetting its index.
idx = df[df['Year'] == year].index[0]

# TF-IDF row for that year's speech
first_document_tfidf = tfidf_matrix[idx]

topn = 20

# Unpack the sparse row once: .col holds the term indices and .data the
# matching TF-IDF values of its non-zero entries.
row_coo = first_document_tfidf.tocoo()

# Positions of the topn largest values, highest first
ranking = row_coo.data.argsort()[-topn:][::-1]
top_indices = row_coo.col[ranking]

# Look up the terms behind those column indices
top_words = feature_names[top_indices].tolist()

# Print the top words
print(top_words)


## Topic Modelling - NMF (Non-negative Matrix Factorization)

Helps to find common themes across speeches.  We will use NMF, a matrix decomposition method that works on similar ideas to Principal Components.

In [None]:
from sklearn.decomposition import NMF

# Factorize the TF-IDF matrix into 25 components (topics).
# random_state is fixed so repeated runs use the same seed.
nmf_model = NMF(n_components=25, init='nndsvd', random_state=0)
nmf_model.fit(tfidf_matrix)

In [None]:
# Extract the topics from NMF

# Topic-by-term weights learned by the fitted model
topic_word_matrix = nmf_model.components_
# Speech-by-topic weights obtained by projecting the TF-IDF matrix onto the topics
document_topic_matrix = nmf_model.transform(tfidf_matrix)

In [None]:
### Print out the topics

# Terms behind each column of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()

# How many of the highest-weighted terms to show per topic
num_top_words = 15

# For every topic, list its highest-weighted terms, strongest first
for topic_idx, topic in enumerate(nmf_model.components_):
    strongest_first = topic.argsort()[::-1][:num_top_words]
    top_terms = feature_names[strongest_first]
    print(f"Topic {topic_idx + 1}:")
    print(" ".join(top_terms))
    print("-" * 20)  # Separator between topics

In [None]:
## Print out the dominant topic for each speech

import numpy as np

# For every speech, the index of the topic with the largest weight
dominant_topics = document_topic_matrix.argmax(axis=1)

def get_top_words(topic_idx, n_words=15):
    """Return the n_words highest-weighted terms of topic ``topic_idx``, strongest first."""
    weights = nmf_model.components_[topic_idx]
    strongest_first = weights.argsort()[::-1][:n_words]
    return [feature_names[i] for i in strongest_first]

# Report each speech with its year, its dominant topic and that topic's top terms
for doc_idx, topic_idx in enumerate(dominant_topics):
    year = SOTU_index.loc[doc_idx, 'Year']
    top_words_text = ', '.join(get_top_words(topic_idx))
    print(f"SOTU Index {doc_idx + 1}: Year {year}")
    print(f"Dominant Topic: {topic_idx + 1}")  # Add 1 for human-readable indexing
    print(f"Top Words: {top_words_text}")
    print("-" * 20)

The methods that work with long texts like SOTU won't necessarily work so well with shorter ones like tweets... it will probably take a lot of trial and error!

# Named Entity Recognition

We can extract all Named Entities from the document, and then summarize the entities of a particular type:

In [None]:
# Extract all Named Entities - note the different types


# Load the small English spaCy pipeline
nlp = spacy.load("en_core_web_sm")

# Select the speech for a given year from the total_text list.
# idx is a row label of df, used here as the position of the speech in total_text.
year = 2021
idx = df[df['Year'] == year].index[0]
speech_to_analyze = total_text[idx]  # the raw (uncleaned) speech for `year`

# Apply NER to the speech
doc = nlp(speech_to_analyze)

# Print each named entity together with its entity type label
for ent in doc.ents:
    print(f"{ent.text} - {ent.label_}")

In [None]:
# Now create a table of all of the Persons (or any other type)

from collections import Counter

# spaCy entity type to tabulate. The table is described and labelled as
# persons, so filter on "PERSON" (the original filtered on "ORG", which filled
# the 'Person' column with organizations). Change this to "ORG", "GPE", ... to
# summarize another type.
entity_label = "PERSON"

# Text of every entity of the chosen type found in the analyzed speech
all_persons = [ent.text for ent in doc.ents if ent.label_ == entity_label]

# Number of times each distinct entity text occurs
person_counts = Counter(all_persons)

# One row per entity, most frequent first
person_df = (
    pd.DataFrame.from_dict(person_counts, orient='index', columns=['Frequency'])
    .reset_index()
    .sort_values(by=['Frequency'], ascending=False)
    .rename(columns={'index': 'Person'})
)

print(person_df)

# Sentiment analysis

using `AFINN`


In [None]:
from afinn import Afinn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# AFINN scorer
afinn = Afinn()

# Normalized sentiment score of each speech, keyed by the year of the speech.
# NOTE(review): if two speeches share a year, the later one overwrites the
# earlier entry -- confirm years are unique in SOTU_index.
sentiment_scores_by_year = {}

for i, speech in enumerate(cleaned_total_text):
    score = afinn.score(speech)
    num_words = len(speech.split())

    # Normalize by the number of words so long speeches are not favoured;
    # an empty speech gets a score of 0.
    if num_words:
        normalized_score = score / num_words
    else:
        normalized_score = 0

    # Year of this speech, looked up by row label in SOTU_index
    year = SOTU_index.loc[i, 'Year']
    sentiment_scores_by_year[year] = normalized_score


In [None]:

# Turn the {year: score} dictionary into a Series so years and scores can be
# plotted against each other.
sentiment_series = pd.Series(sentiment_scores_by_year)

# One marker per speech: normalized sentiment score against year
fig, ax = plt.subplots()
ax.plot(sentiment_series.index, sentiment_series.values, 'o')
ax.set_xlabel("Year")
ax.set_ylabel("Normalized Sentiment Score")
ax.set_title("Sentiment Analysis of State of the Union Addresses (AFINN)")

# Dashed red line at every fourth year from 1976 through 2024
elec_year = np.arange(1976, 2025, 4)
for year in elec_year:
    ax.axvline(x=year, color='red', linestyle='--', linewidth=0.8)
plt.show()

In [None]:
# for fun, plot the most positive (or negative words):
from afinn import Afinn
import nltk

afinn = Afinn()
year = 2002  # Change this to the desired year

# Row label of the speech for that year, used as its position in cleaned_total_text
idx = SOTU_index[SOTU_index['Year'] == year].index[0]

# Cleaned text of that speech, split into words
speech_text = cleaned_total_text[idx]
words = nltk.word_tokenize(speech_text)

# AFINN score of every distinct word in the speech
word_scores = {word: afinn.score(word) for word in words}

# The same (word, score) pairs ranked from most positive and from most negative
pos_words = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
neg_words = sorted(word_scores.items(), key=lambda item: item[1])

# Print the ten highest-scoring and the ten lowest-scoring words
print(f"Top 10 most positive words in the speech for {year}:")
for heading, ranked in (("POSITIVE", pos_words), ("\nNEGATIVE", neg_words)):
    print(heading)
    for word, score in ranked[:10]:
        print(f"{word}: {score}")
