In [None]:
import string
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

In [None]:
# One-time setup: uncomment the next line to download the 'inaugural' corpus.
# NOTE(review): it needs `import nltk`, which none of the cells shown here performs.
#nltk.download('inaugural')

from nltk.corpus import inaugural

In [None]:
# display the `inaugural` corpus object imported from nltk.corpus
inaugural

In [None]:
# list every file id available in the corpus
inaugural.fileids()

In [None]:
# number of file ids in the corpus
len(inaugural.fileids())

There are 58 inaugural speech text files in this corpus.

In [None]:
# extract file names (the corpus file ids); reused by the later cells
filenames = inaugural.fileids()

# preview the first file id
filenames[0]

In [None]:
# extract the raw (unprocessed) speech text of the first file as a preview

inaugural.raw(filenames[0])

In [None]:
# Build `main_speech`: {speech name -> cleaned text}, where the cleaned text is
# a single space-joined string of lowercase alphabetic words with stop words
# removed.

# ASCII punctuation characters, deleted outright from the text
all_punctuations = set(string.punctuation)

# anything that is not a lowercase ASCII letter (digits, non-ASCII quotes and
# dashes, ...); compiled once because it is applied to every token
non_alpha = re.compile("[^a-z]+")

main_speech = {}

for filename in filenames:

    # drop the extension: everything before the first '.' is the speech name
    name = filename.split('.')[0]

    print ('Processing:', name)

    # read the whole speech and convert it to lowercase
    text = inaugural.raw(filename).lower()

    # remove all ASCII punctuation
    text_clean = ''.join(ch for ch in text if ch not in all_punctuations)

    # strip the remaining non-alphabetic characters from every token FIRST ...
    cleaned_words = (non_alpha.sub('', word) for word in text_clean.split())

    # ... and only then filter, so that tokens which become a stop word (or
    # become empty, e.g. pure numbers) after the clean-up are dropped too.
    # 'applause' is presumably a transcript annotation rather than speech
    # content -- TODO confirm.
    speech_words = [word for word in cleaned_words
                    if word and word not in ENGLISH_STOP_WORDS and word != 'applause']

    # join the surviving words into one line of text again
    main_speech[name] = ' '.join(speech_words)

In [None]:
# length (number of words) of each speech
# Each value in main_speech is one space-joined string, so len(speech) would
# count characters; split on whitespace to count words instead.
for name, speech in main_speech.items():

    print (name, len(speech.split()))

In [None]:
# Horizontal bar chart of the number of words kept in each cleaned speech.
names = list(main_speech)

# count words, not characters: every value is one space-joined string
speech_lengths = [len(speech.split()) for speech in main_speech.values()]

sns.set(style='darkgrid')

# tall figure so that every speech gets its own readable bar
plt.figure(figsize=(9, 12))

sns.barplot(y=names, x=speech_lengths, color='seagreen')

plt.xlabel('Number of words', fontsize=14);

In [None]:
# Build a TF-IDF table: one row per speech, one column per vocabulary term.
tfidf = TfidfVectorizer()

tfs_matrix = tfidf.fit_transform(main_speech.values())

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() (added in 1.0); use whichever this release provides.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# densify the sparse result straight into an ndarray (no np.matrix / nested
# Python lists in between)
scores = tfs_matrix.toarray()

# rows follow the insertion order of main_speech, which is also the order in
# which the documents were passed to fit_transform above
df = pd.DataFrame(scores, columns=feature_names, index=list(main_speech))

df.head()

In [None]:
# (rows, columns) of the TF-IDF table: one row per speech, one column per term
df.shape

This TF-IDF table (one row per speech, one column per term) can be used for further exploratory analysis, such as trend or cluster analysis.

In [None]:
# most distinctive word of each speech: for every row (speech), the column
# label (term) holding that row's highest TF-IDF score

df.idxmax(axis=1)

In [None]:
# pairwise correlation between speeches: transposing turns each speech (row of
# df) into a column, and corr() correlates columns
df.T.corr()

In [None]:
# pairwise correlation among the last 10 rows of df (one row per speech),
# drawn as a heatmap
corr_matrix = df.tail(n=10).transpose().corr()

cmap = sns.diverging_palette(10, 220, n=20)

fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr_matrix, cmap=cmap, ax=ax)

# explicit y-limits running from the row count down to 0 (first row on top)
ax.set_ylim(corr_matrix.shape[0], 0);

In [None]:
# create a boolean mask for the upper triangle (diagonal included); masked
# cells are not drawn, so every pair of speeches appears only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(12, 9))

cmap = sns.diverging_palette(10, 220, n=20)

# vmax caps the colour scale at 0.4
ax = sns.heatmap(corr_matrix, cmap=cmap, mask=mask, vmax=.4)

# explicit y-limits running from the row count down to 0 (first row on top)
ax.set_ylim(len(corr_matrix), 0);