In [31]:
# Import libraries
import pandas as pd
import numpy as np
import gzip
import gensim 
import logging
import os
from gensim.models import Word2Vec
import spacy
import re
from gensim.models import KeyedVectors
import time
import pickle
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# load model package "en_core_web_sm"
nlp = spacy.load("en_core_web_sm")
nlp.max_length = 2797059

In [24]:
file = open("combined.txt",encoding="utf8")


In [140]:
#Load txt file containing all Andy Haldane speeches
a_file = open("combined.txt", encoding="utf8")

string_without_line_breaks = " "
for line in a_file:
    stripped_line = line.replace('\n', ' ')
    string_without_line_breaks += stripped_line
a_file.close()

#print(string_without_line_breaks)

## Pre Processing Data

In [3]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [25]:
#Convert speech to list and remove punctuation
data_corpus = list(sent_to_words(file))

In [26]:
#Creating bigram and trigram models
bigram = gensim.models.Phrases(data_corpus, min_count=3, threshold=5) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_corpus], threshold=3)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)
bigram_list=bigram_mod[data_corpus]
trigram_list=trigram_mod[bigram_mod[data_corpus]]

In [27]:
def remove_stopwords(texts):
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [32]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_corpus)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Form Trigrams
data_words_trigrams = make_trigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [34]:
#Save data_lemmatized
with open('data_lemmatized.pickle', 'wb') as f:
    pickle.dump(data_lemmatized, f)

In [35]:
# Load data_lemmatized
with open('data_lemmatized.pickle', 'rb') as f:
    data_lemmatized = pickle.load(f)

## Word2vec Model

In [36]:
# Time calculation
def cal_elapsed_time(s):
    return print("Elapsed time:\t", round((time.time() - s),2))
s_time = time.time()
cal_elapsed_time(s=s_time)

Elapsed time:	 0.0


In [37]:
s_time = time.time()
print("Model Training Started...")
w2v_model = Word2Vec(min_count=5,
                                 window=100,
                                 vector_size=150,
                                 sample=0,
                                 workers=4,
                                 batch_words=100)

w2v_model.build_vocab(data_lemmatized)
w2v_model.train(data_lemmatized, total_examples=w2v_model.corpus_count, total_words=w2v_model.corpus_total_words, epochs=100, compute_loss=True)

cal_elapsed_time(s_time)

Model Training Started...
Elapsed time:	 52.09


In [38]:
# Save and load word2vec model
w2v_model.save("Speech2vec.w2v_model")
speech2vec = KeyedVectors.load("Speech2vec.w2v_model", mmap='r')

In [39]:
print("Total number of unique words loaded in Model : ", len(speech2vec.wv))


Total number of unique words loaded in Model :  3641


In [175]:
risky_list=speech2vec.wv.most_similar(positive="risky", topn=10)
risky_list

[('risk_illusion', 0.46636998653411865),
 ('obtain', 0.4055613875389099),
 ('macro_state', 0.39641353487968445),
 ('internalise', 0.35571256279945374),
 ('bad', 0.3484610915184021),
 ('risky_policie', 0.3369133174419403),
 ('tradable', 0.3286990523338318),
 ('amount', 0.3216891288757324),
 ('positive', 0.31588131189346313),
 ('spillover', 0.30914542078971863)]

In [157]:
Risk_word=[]
for a in risky_list:
        Risk_word.append(a[0])

In [158]:
Risk_word

['risk_illusion',
 'obtain',
 'macro_state',
 'internalise',
 'bad',
 'risky_policie',
 'tradable',
 'amount',
 'positive',
 'spillover']

In [44]:
#print bigram list
bigramlist=re.findall(r"\b\w+_\w+\b",str(data_lemmatized))
set(bigramlist);

In [195]:
#Find bigrams which contains the word "risk" 
letters = 'risk'
bigramtxt=[]
for word in bigramlist:
    if letters in word:
        bigramtxt.append(word)
bigramtxt=set(bigramtxt)
bigramtxt

{'bearing_risk',
 'counterparty_risk',
 'downside_risk',
 'liquidity_risk',
 'risk_adjuste',
 'risk_appetite',
 'risk_aversion',
 'risk_bucket',
 'risk_free',
 'risk_illusion',
 'risk_management',
 'risk_manager',
 'risk_model',
 'risk_premium',
 'risk_sensitivity',
 'risk_share',
 'risk_shifte',
 'risk_shifting',
 'risk_take',
 'risk_taker',
 'risk_taking',
 'risk_weight',
 'risk_weighte',
 'risks_face',
 'risky_asset',
 'risky_policie',
 'risky_policy',
 'risky_strategie',
 'risky_tight',
 'set_risky',
 'specific_risk',
 'systemic_risk',
 'tail_risk'}

In [46]:
#Remove underscore in list
bitxt = [item.replace("_", " ") for item in bitxt]
bitxt

['risky policy',
 'risk take',
 'tail risk',
 'risk shifte',
 'risk bucket',
 'risk sensitivity',
 'set risky',
 'risk model',
 'specific risk',
 'risk appetite',
 'risky strategie',
 'risky tight',
 'risk weight',
 'risks face',
 'liquidity risk',
 'risk taker',
 'risk illusion',
 'systemic risk',
 'risk adjuste',
 'risk aversion',
 'risk taking',
 'downside risk',
 'counterparty risk',
 'risk manager',
 'risk premium',
 'bearing risk',
 'risky asset',
 'risk management',
 'risk weighte',
 'risk shifting',
 'risky policie',
 'risk free',
 'risk share']

In [198]:
list(bigramtxt)

['risky_policy',
 'risk_take',
 'tail_risk',
 'risk_shifte',
 'risk_bucket',
 'risk_sensitivity',
 'set_risky',
 'risk_model',
 'specific_risk',
 'risk_appetite',
 'risky_strategie',
 'risky_tight',
 'risk_weight',
 'risks_face',
 'liquidity_risk',
 'risk_taker',
 'risk_illusion',
 'systemic_risk',
 'risk_adjuste',
 'risk_aversion',
 'risk_taking',
 'downside_risk',
 'counterparty_risk',
 'risk_manager',
 'risk_premium',
 'bearing_risk',
 'risky_asset',
 'risk_management',
 'risk_weighte',
 'risk_shifting',
 'risky_policie',
 'risk_free',
 'risk_share']

In [201]:
Risk_word_tuple =[]

for word in list(bigramtxt):
    try:
        topN=speech2vec.wv.most_similar(word, topn=10)
        Risk_word_tuple.append(topN)
    except:
        pass
        

In [204]:
Risk_word_list=[]
for a in Risk_word_tuple:
    for b in a:
        Risk_word_list.append(b[0])

In [208]:
sentences = [sentence for sentence in string_without_line_breaks.split(".") 
             if any(w.lower() in sentence.lower() for w in bitxt)]

print(sentences)



In [207]:
with open('Risklist_Insight.txt', 'w',encoding='utf-8') as f:
    for line in sentences:
        f.write(line)
        f.write('\n')

In [55]:
import numpy as np  
import pandas as pd 
import re           
from bs4 import BeautifulSoup 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.python.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.callbacks import EarlyStopping
import warnings
pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

Using TensorFlow backend.


## TextRank

In [161]:
with open('Risky_Insight.txt',encoding="utf8") as f:
    lines = f.readlines()
    Insight_text = [item.replace("\n", ".") for item in lines]


In [162]:
from nltk.tokenize import sent_tokenize
sentence = []
for s in str(Insight_text):
    sentence.append(sent_tokenize(s))

#sentence = [y for x in sentence for y in x] # flatten list

In [163]:
Insight_text[1]

'  The Effects on Productivity and Output Given these shifts, some of which seem likely to prove durable, what impact might they have had on workers’ and businesses’ economic contribution – that is to say, their productivity (the amount done per hour worked) and their overall output (productivity multiplied by working hours)? In short, how has this shift in working practices affected working capacity of the economy? A number of empirical studies have looked at the effects of home working on productivity.'

In [164]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt')
for line in f:
    values = line.strip().split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [165]:
len(word_embeddings)

399881

In [166]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(Insight_text).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

In [167]:
# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

In [168]:
# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

In [169]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt')
for line in f:
    values = line.strip().split(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [170]:
sentence_vectors = []
for i in clean_sentences:
    if len(i) != 0:
        v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
    else:
        v = np.zeros((100,))
        sentence_vectors.append(v)

In [171]:
# similarity matrix

similarity_matrix = np.zeros([len(sentences), len(sentences)])
for i,row_embedding in enumerate(sentence_vectors):
    for j,column_embedding in enumerate(sentence_vectors):
        similarity_matrix[i][j]=1-spatial.distance.cosine(row_embedding,column_embedding)

In [172]:
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [173]:
#top 10 sentences
top_sentence={sentence:scores[index] for index,sentence in enumerate(sentences)}
top=dict(sorted(top_sentence.items(), key=lambda x: x[1], reverse=True)[:10])

In [174]:
for sent in sentences:
    if sent in top.keys():
        print(sent)

 There are both positives and negatives from the shift in working practices that has taken place this year and the balance of these is likely itself to shift over time
  The Effects on Productivity and Output Given these shifts, some of which seem likely to prove durable, what impact might they have had on workers’ and businesses’ economic contribution – that is to say, their productivity (the amount done per hour worked) and their overall output (productivity multiplied by working hours)? In short, how has this shift in working practices affected working capacity of the economy? A number of empirical studies have looked at the effects of home working on productivity
 Even if the amount workers produce each hour has fallen, as evidence and anecdote tentatively suggests, this need not imply workers’ overall economic contribution has fallen
 For example, studies have found that working from home is positively associated with perceived autonomy and this, in turn, has positive spillover ef