In [20]:
import nltk
import string
import numpy as np
import itertools as itt
import gensim

In [21]:
# Tokenisation

text = "The Governing Council decided to raise the three key ECB interest rates by 75 basis points."

bigrams = ["Governing Council", "interest rates", "basis points"]

for bigram in bigrams:
    text = text.replace(bigram, bigram.replace (" ", "_"))
bigrams = [bigram.replace(" ", "_") for bigram in bigrams]

tkns = nltk.word_tokenize(text)
tkns

['The',
 'Governing_Council',
 'decided',
 'to',
 'raise',
 'the',
 'three',
 'key',
 'ECB',
 'interest_rates',
 'by',
 '75',
 'basis_points',
 '.']

In [23]:
# Shift to lower case:
[word.lower() for word in tkns]

['the',
 'governing_council',
 'decided',
 'to',
 'raise',
 'the',
 'three',
 'key',
 'ecb',
 'interest_rates',
 'by',
 '75',
 'basis_points',
 '.']

In [4]:
tkns = [word.lower() for word in tkns if word not in string.punctuation]
tkns

['the',
 'governing_council',
 'decided',
 'to',
 'raise',
 'the',
 'three',
 'key',
 'ecb',
 'interest_rates',
 'by',
 '75',
 'basis_points']

In [5]:
tkns = [word for word in tkns if word not in nltk.corpus.stopwords.words("english")]
tkns

['governing_council',
 'decided',
 'raise',
 'three',
 'key',
 'ecb',
 'interest_rates',
 '75',
 'basis_points']

In [6]:
stemmer = nltk.stem.PorterStemmer()
tkns = [stemmer.stem(word) if word not in bigrams else word for word in tkns]
tkns

['governing_council',
 'decid',
 'rais',
 'three',
 'key',
 'ecb',
 'interest_rates',
 '75',
 'basis_points']

In [7]:
# Example: Create document feature matrix

texts = [
    "The Governing Council decided to raise the three key ECB interest rates by 75 basis points. Following the raising of the deposit facility rate to above zero, the two-tier system for the remuneration of excess reserves is no longer necessary.",
    "The Governing Council decided to raise the three key ECB interest rates by 50 basis points. At the Governing Council’s upcoming meetings, further normalisation of interest rates will be appropriate. The Governing Council’s future policy rate path will continue to be data-dependent and will help to deliver on its 2% inflation target over the medium term"
    ]


In [8]:
# Function to do our pre-processing steps:

def pre_process_text(text, bigrams):
    stemmer = nltk.PorterStemmer()

    for bigram in bigrams:
        text = text.replace(bigram, bigram.replace (" ", "_"))
    bigrams = [bigram.replace(" ", "_") for bigram in bigrams]

    tkns = [word.lower() for word in nltk.word_tokenize(text) if word not in string.punctuation]
    tkns = [word for word in tkns if word not in nltk.corpus.stopwords.words("english")]
    tkns = [stemmer.stem(word) if word not in bigrams else word for word in tkns]
    return tkns



pre_process_text(text, ["Governing Council", "interest rates", "basis points"])   

['governing_council',
 'decid',
 'rais',
 'three',
 'key',
 'ecb',
 'interest_rates',
 '75',
 'basis_points']

In [24]:
# We will use the following bigrams:

bigrams = ["Governing Council", "interest rates", "basis points", " deposit facility", " excess reserves", " policy rate", " inflation target"]

# We also have to replace "Governing Council's" with Governing Council (because the latter is a bigram):
texts = [text.replace ("Governing Council’s", " Governing Council") for text in texts]

# Pre-process
tkns = [pre_process_text(text, bigrams) for text in texts]
tkns


[['governing_council',
  'decid',
  'rais',
  'three',
  'key',
  'ecb',
  'interest_rates',
  '75',
  'basis_points',
  'follow',
  'rais',
  'the_deposit_facil',
  'rate',
  'zero',
  'two-tier',
  'system',
  'remuner',
  'of_excess_reserv',
  'longer',
  'necessari'],
 ['governing_council',
  'decid',
  'rais',
  'three',
  'key',
  'ecb',
  'interest_rates',
  '50',
  'basis_points',
  'governing_council',
  'upcom',
  'meet',
  'normalis',
  'interest_rates',
  'appropri',
  'governing_council',
  'future_policy_r',
  'path',
  'continu',
  'data-depend',
  'help',
  'deliv',
  '2',
  '_inflation_target',
  'medium',
  'term']]

In [25]:
# Get unique tokens and count
all_tkns = set(list(itt.chain.from_iterable(tkns)))

X = np.array([[tkn.count(t) for t in all_tkns] for tkn in tkns])
X

array([[1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 2, 1, 1, 0,
        1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0],
       [1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 3, 1, 1, 1, 1, 0, 1,
        0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 2, 1]])

In [11]:
tfidf = np.log(X.shape[0]) - np.array([np.log(np.sum(x > 0)) for x in X.T]) # iterate over columns
Xstar = X.copy().astype(float)
for (j, w) in enumerate(tfidf):
    Xstar[:, j] *= w
Xstar

# In practice, we don't do this by hand each time -- there are functions in e.g. scikit learn that can construct tfidf matrices for us

array([[0.        , 0.69314718, 0.69314718, 0.        , 0.        ,
        0.69314718, 0.        , 0.69314718, 0.69314718, 0.        ,
        0.        , 0.        , 0.69314718, 0.        , 0.69314718,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.69314718, 0.        , 0.69314718, 0.        , 0.        ,
        0.        , 0.69314718, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.69314718, 0.69314718,
        0.        , 0.69314718, 0.        , 0.        , 0.69314718,
        0.69314718, 0.69314718, 0.        , 0.        , 0.        ,
        0.69314718, 0.69314718, 0.69314718, 0.        , 0.        ,
        0.        , 0.69314718, 0.        , 0.69314718, 0.69314718,
        0.69314718, 0.        , 0.        , 0.        , 0.69314718,
        0.        , 0.        , 0.69314718]])

In [26]:
# Similarity metrics

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine_similarity(X[0,:], X[1, :]), " ", cosine_similarity(Xstar[0,:], Xstar[1, :]))      

0.43876345447627835   0.0


In [27]:
# Word embeddings
# nltk.download('word2vec_sample')
word2vec_sample = str(nltk.data.find("models/word2vec_sample/pruned.word2vec.txt"))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary =False)

In [28]:
len(model["university"])

300

In [15]:
model["university"]

array([-6.04623e-02, -4.63157e-02,  6.78263e-02,  1.14723e-01,
        1.06972e-01, -3.41069e-02,  6.74387e-02, -4.88349e-02,
       -1.23541e-02, -7.44151e-02,  4.84473e-02, -5.15480e-02,
        1.26932e-02,  3.08125e-02, -8.64301e-02,  4.38448e-03,
       -8.33294e-02, -3.04249e-02,  7.44151e-02, -5.30983e-02,
       -3.10063e-03,  6.78263e-02,  1.91851e-02, -1.34441e-03,
        1.92578e-03, -8.52673e-02, -5.89120e-02,  1.26932e-02,
       -4.68970e-02,  2.67429e-02,  5.27107e-02, -3.54635e-02,
       -2.28671e-02, -2.25765e-02,  3.21690e-02, -8.29419e-02,
        2.30609e-02,  3.72076e-02, -2.53864e-02,  5.73617e-02,
       -3.93392e-02,  2.06386e-02, -3.27504e-02,  1.53094e-02,
       -2.51926e-02, -3.99206e-02,  1.24025e-02, -4.57343e-02,
       -1.08522e-01,  1.27780e-03,  3.87579e-02, -7.79033e-02,
        6.82139e-02,  2.55802e-02,  9.37941e-02,  1.09491e-02,
       -1.08522e-01,  2.37392e-03,  2.83417e-03, -4.74784e-02,
       -7.90661e-02, -5.27107e-02,  2.36423e-02, -2.829

In [16]:
model.most_similar(positive=["university"], topn=5)

[('universities', 0.7003918290138245),
 ('faculty', 0.6780906915664673),
 ('undergraduate', 0.6587095856666565),
 ('campus', 0.6434988379478455),
 ('college', 0.638526976108551)]

In [17]:
model.most_similar(positive = ["Oslo", "Germany"], negative = ["Berlin"], topn = 1) # Germany − Berlin + Oslo

[('Norway', 0.7291736602783203)]

In [18]:
cosine_similarity(model["Germany"] - model["Berlin"] + model["Oslo"], model["Norway"])

0.72917366

In [19]:
model.most_similar(positive = ["King", "Woman"], negative = ["Man"], topn = 1) # King − Man + Woman

[('Queen', 0.4929390251636505)]