In [16]:
import sys
import json
from nltk.data import find
import gensim
import numpy as np
from scipy import spatial
from sqlitedict import SqliteDict
# Open the SqliteDict store kept in "Hansard.sqlite"; records are read from it by key.
db = SqliteDict("Hansard.sqlite")
from wordcloud import STOPWORDS
# The enumerator tokens are added to wordcloud's shared STOPWORDS set in place,
# so anything else that uses that set sees them as well.
STOPWORDS.update(('(b)', '(a)', '(c)'))

# Extra words excluded from the analysis on top of wordcloud's defaults.
extra_stopwords = [
    '', 'b', 'will', 'ask', 'make', 'use', 'state', 'secretary', 'hon',
    'members', 'minister', 'take', 'say', 'many', 'point', 'statement', 'go',
    '(a)', 'may', 'great', 'give', 'one', 'years', 'people', 'right', 'member',
    'government', 'house', 'need', 'much', 'friend', 'come', '()', 'now',
    'mean', 'reduce', 'agree', 'us', '(c)', 'mr', 'word', 'want', 'tell',
    'end', 'whether', 'two', '£', '%', 'see', 'gentleman', 'put', 'long', 'yet',
]

# Working stopword set: an independent copy of STOPWORDS plus the extras.
# Both names refer to the same set object.
new_stopwords = set(STOPWORDS).union(extra_stopwords)
stopwords = new_stopwords

In [17]:
# Fetch the record stored under key 10505 and pull out its 'speeches' entry.
# NOTE(review): presumably 10505 identifies one member and 'speeches' is a list
# of speech strings (it is passed to preprocess_speeches) — confirm.
data = db[10505]
speeches = data['speeches']


In [18]:
import string
from nltk.stem import WordNetLemmatizer
import re
def preprocess_speeches(speeches):
    """Normalise raw speech strings before word-vector lookup.

    For every speech, in this order:
      1. delete each character listed in ``string.punctuation``;
      2. delete every run of digits;
      3. lowercase the result.

    The text is NOT tokenised here: each speech stays a single string.
    Deleted characters are removed outright rather than replaced by a space,
    and characters outside ``string.punctuation`` (for example non-ASCII
    dashes or currency signs) are left untouched.

    Args:
        speeches: iterable of speech strings.

    Returns:
        list[str]: the cleaned speeches, in the same order as the input.
    """
    # Build the translation table and the digit pattern once, not per speech.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    digit_runs = re.compile(r'\d+')
    return [
        digit_runs.sub('', speech.translate(strip_punctuation)).lower()
        for speech in speeches
    ]

In [19]:
# Cleaned copy of every speech; `text` keeps the same order as `speeches`.
text = preprocess_speeches(speeches)

In [20]:
# Read in the word-vector database: resolve the pruned word2vec sample through
# nltk.data.find, then load it with gensim as KeyedVectors.
# binary=False because the file is the text (not binary) word2vec format.
word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))
model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)


In [21]:
def norm(vec):
    """Return the magnitude of `vec` as computed by ``np.linalg.norm``."""
    magnitude = np.linalg.norm(vec)
    return magnitude

def similarity(vec1, vec2):
    """Return the cosine similarity of `vec1` and `vec2`.

    If either vector has a magnitude below 0.1 the result is 0.0, which avoids
    evaluating the cosine distance for (near-)zero vectors.
    """
    # Guard clause: checked in argument order, stopping at the first short vector.
    if any(norm(candidate) < 0.1 for candidate in (vec1, vec2)):
        return 0.0
    cosine_distance = spatial.distance.cosine(vec1, vec2)
    return 1.0 - cosine_distance


def sentence2vec(sentence, model, stopwords):
    """Embed a sentence as the normalised sum of its word vectors.

    The sentence is split on whitespace. Every token that the model has a
    vector for and that is not in `stopwords` contributes its vector to a
    running sum, which is then scaled to unit length.

    Args:
        sentence: text to embed (expected to be preprocessed already).
        model: word-vector model exposing ``has_index_for``, ``get_vector``,
            ``vector_size`` and ``vectors`` (gensim ``KeyedVectors``).
        stopwords: collection of tokens to ignore.

    Returns:
        numpy.ndarray of length ``model.vector_size``: a unit vector, or the
        all-zero vector when no token contributed or the contributions cancel.
    """
    # Start from an explicit zero vector with the model's dtype, rather than
    # subtracting a looked-up word vector from itself (fails if it is missing).
    total = np.zeros(model.vector_size, dtype=model.vectors.dtype)
    # split() with no argument splits on any whitespace run, so tabs, newlines
    # and repeated spaces do not glue tokens together or yield empty tokens.
    for token in sentence.split():
        if token not in stopwords and model.has_index_for(token):
            total += model.get_vector(token)
    magnitude = np.linalg.norm(total)
    # Only normalise a non-zero sum; dividing by zero would fill it with NaN.
    if magnitude > 0:
        total /= magnitude
    return total

def text2vectors(text,model,stopwords):
    """Turn a list of strings into a list of sentence vectors.

    Each string is embedded with ``sentence2vec`` using the given model and
    stopwords; the result has one vector per input string, in input order.
    """
    vectors = []
    for sentence in text:
        vectors.append(sentence2vec(sentence, model, stopwords))
    return vectors
  

In [22]:
# One sentence vector per preprocessed speech, in the same order as `text`.
vectors = text2vectors(text,model,stopwords)

In [23]:
def search(words, model, vectors, stopwords, top_k=16):
    """Rank precomputed vectors by similarity to a free-text query.

    Args:
        words: query text; embedded with ``sentence2vec``.
        model: word-vector model passed through to ``sentence2vec``.
        vectors: sequence of vectors to search.
        stopwords: tokens ignored when embedding the query.
        top_k: maximum number of results to return (default 16, the value
            that was previously hard-coded).

    Returns:
        numpy.ndarray of indices into `vectors`, best match first, containing
        at most `top_k` entries.
    """
    query = sentence2vec(words, model, stopwords)
    scores = [similarity(query, candidate) for candidate in vectors]
    # argsort is ascending, so reverse it before taking the leading top_k.
    ranked = np.argsort(scores)[::-1]
    return ranked[:top_k]

In [75]:
# Query the speech vectors with a free-text phrase. `idxs` holds positions into
# `vectors` (and therefore `speeches`) of the best-scoring speeches, best first.
idxs = search("big huge lemons", model, vectors, stopwords)
print(idxs)

[ 647  657 1902 1056  429 3788  400 3530  839 4905 1180  577 2508 1575
 4763  740]


In [76]:
# Show the original (unprocessed) text of the top-ranked speech.
speeches[idxs[0]]

'My hon. Friend is absolutely right. For a prosperous country—we are supposed to be the fifth largest economy in the world—that is a small amount to be asked to pay, but it has an enormous impact across the world.'

In [80]:
from bertopic import BERTopic
from umap import UMAP

In [96]:
# UMAP instance handed to BERTopic below; random_state is pinned to a fixed seed.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=100,
)

# BERTopic model built around the UMAP instance above.
topic_model = BERTopic(
    umap_model=umap_model,
    language="english",
    calculate_probabilities=True,
    nr_topics=11,
)

In [101]:
# Strip stopwords from every preprocessed speech, then keep only the speeches
# that still contain more than 25 words.
text0 = [
    trimmed
    for trimmed in (
        ' '.join(word for word in speech.split() if word not in stopwords)
        for speech in text
    )
    if len(trimmed.split()) > 25
]
print(len(text0))

1240


In [102]:
# Run BERTopic model on the stopword-filtered speeches in `text0`; the two
# values returned by fit_transform are kept as `topics` and `probabilities`.
topics, probabilities = topic_model.fit_transform(text0)

In [103]:
# Display the fitted model's topic summary table (Topic, Count and Name columns).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,145,-1_tewkesbury_time_constituency_site
1,0,615,0_northern_ireland_bill_amendment
2,1,146,1_houses_flood_tewkesbury_water
3,2,83,2_racing_tote_race_sport
4,3,71,3_ethiopia_aid_country_countries
5,4,63,4_schools_school_special_education
6,5,50,5_energy_nuclear_electricity_industry
7,6,32,6_health_hospital_nhs_gloucestershire
8,7,23,7_pubs_drugs_pub_pubcos
9,8,12,8_slaughter_animal_animals_meat


In [99]:
# Render BERTopic's built-in topic visualisation for the fitted model.
topic_model.visualize_topics()