In [2]:
# Read up about glob.glob. Use it to read the text files in ./data/
# speeches into a corpus (i.e. a list of strings). The files represent 
# a non-random selection of speeches of central bankers, which have 
# already been stripped off meta information.

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import glob

data = glob.glob('01_Data/speeches/*',recursive=False)
data

['01_Data/speeches/R131127E',
 '01_Data/speeches/R130312E',
 '01_Data/speeches/R121113C',
 '01_Data/speeches/R061109D',
 '01_Data/speeches/R160412B',
 '01_Data/speeches/R130117B',
 '01_Data/speeches/R100519A',
 '01_Data/speeches/R021028A',
 '01_Data/speeches/R110309A',
 '01_Data/speeches/R120727A',
 '01_Data/speeches/R040216C',
 '01_Data/speeches/R061120D',
 '01_Data/speeches/R021126C',
 '01_Data/speeches/R090902D',
 '01_Data/speeches/R070829A',
 '01_Data/speeches/R050211A',
 '01_Data/speeches/R110124C',
 '01_Data/speeches/R140519A',
 '01_Data/speeches/R020115B',
 '01_Data/speeches/R161014D',
 '01_Data/speeches/R120727D',
 '01_Data/speeches/R090224E',
 '01_Data/speeches/R031211G',
 '01_Data/speeches/R090608C',
 '01_Data/speeches/R040806D']

In [8]:
# Vectorize the speeches using tfidf using up 1-grams, 2-grams and 
# 3-grams while removing English stop- words and proper tokenization 
# (i.e., you create a Count matrix).

text=[]
for name in data:
    try:
        with open(name, 'r') as inf:
            text.extend(inf.readlines())
        text
    except:
        print(name)
        
len(text)

01_Data/speeches/R021126C
01_Data/speeches/R090902D


23

In [9]:
import nltk

_stopwords = nltk.corpus.stopwords.words('english')
_stopwords

from string import digits, punctuation

remove = digits + punctuation
remove

_stemmer = nltk.snowball.SnowballStemmer('english')


In [11]:
def tokenize_and_stem(text):
    """Return tokens of document deprived of numbers and interpunctuation."""
    text = text.translate(str.maketrans({p: "" for p in remove}))
    return [_stemmer.stem(t) for t in nltk.word_tokenize(text.lower())]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words=_stopwords, tokenizer=tokenize_and_stem)
count.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_and_stem at 0x10d6788c8>,
        vocabulary=None)

In [13]:
count_matrix = count.transform(text)
count_matrix   

<23x6579 sparse matrix of type '<class 'numpy.int64'>'
	with 28500 stored elements in Compressed Sparse Row format>

In [16]:
terms = count.get_feature_names()
print(terms[:10])

['ab', 'abandon', 'abat', 'abhiyan', 'abid', 'abil', 'abl', 'abn', 'abnorm', 'abolish']


In [17]:
df = pd.DataFrame(count_matrix.toarray().T, index=terms)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
ab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
abandon,0,0,2,0,1,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
abat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,1
abhiyan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words=_stopwords, tokenizer=tokenize_and_stem, ngram_range=(1,3))
tfidf_matrix = tfidf.fit_transform(text)
tfidf_terms = tfidf.get_feature_names()

df_tfidf = pd.DataFrame(tfidf_matrix.toarray().T, index=tfidf_terms)
df_tfidf.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
 z,0.0,0.0,0.0,0.0,0.0,0.0,0.012537,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
 z would,0.0,0.0,0.0,0.0,0.0,0.0,0.006269,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
 z z,0.0,0.0,0.0,0.0,0.0,0.0,0.006269,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
 zt,0.0,0.0,0.0,0.0,0.0,0.0,0.006269,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
 zt interest,0.0,0.0,0.0,0.0,0.0,0.0,0.006269,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Pickle the resulting sparse matrix using pickle.dump() as ./out/
# speech_matrix.pk.

import pickle

file = open("02_output/speech_matrix.pk", 'wb')
pickle.dump(tfidf_matrix, file)
file.close()

fl = open("02_output/speech_matrix.pk", 'rb')
speech_matrix = pickle.load(fl)