# Load Libraries

In [1]:
import pickle
import pandas as pd
import numpy as np
import sys
import os

In [2]:
!{sys.executable} -m pip install contextualized-topic-models
!{sys.executable} -m pip install pyldavis

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessingStopwords
from contextualized_topic_models.evaluation.measures import CoherenceNPMI,TopicDiversity,CoherenceUMASS,InvertedRBO
import pyLDAvis as vis

In [4]:
SEED = 42
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

# Load Models

They are already trained, here they are just used for visualization

In [5]:
training_sets = []
tp_models = []
test_sets = []
for i in range(8):
    with open('./Training_sets/training_set'+str(i)+'.pickle', 'rb') as dataset:
        dataseti = pickle.load(dataset)
        training_sets.append(dataseti)
    with open('./Test_sets/test_set'+str(i)+'.pickle', 'rb') as dataset:
        dataseti = pickle.load(dataset)
        test_sets.append(dataseti)
    with open('./tp_models/tp_model'+str(i)+'.pickle', 'rb') as tpmodel:
        tpmodeli = pickle.load(tpmodel)
        tp_models.append(tpmodeli)


In [6]:
models = []
for i in range(8):
    ctm = ZeroShotTM(bow_size=len(tp_models[i].vocab), contextual_size=768, n_components=10, num_epochs=20)
    ctm.load("./ctm_models"+str(i)+"/contextualized_topic_model_nc_10_tpm_0.0_tpv_0.9_hs_prodLDA_ac_(100, 100)_do_softplus_lr_0.2_mo_0.002_rp_0.99",
                                                                                                          epoch=19)
    models.append(ctm)



In [7]:
dict_dec = {}
count = 0
for i in range(1720,1800,10):
    dict_dec[i] = count
    count +=1

In [8]:
dict_dec

{1720: 0, 1730: 1, 1740: 2, 1750: 3, 1760: 4, 1770: 5, 1780: 6, 1790: 7}

In [9]:
#Insert a decade from 1720 to 1790 to see the topic visualization and distribution. There are just 10 components per decade
def visualize_decade(decade):
    index = dict_dec[decade]
    lda_vis_data = models[index].get_ldavis_data_format(tp_models[index].vocab, training_sets[index],500)
    decade_pd = vis.prepare(**lda_vis_data)
    return vis.display(decade_pd)

In [10]:
visualize_decade(1780)

  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid 

In [11]:
#This is my analysis, (sebastian), about what I think it means each topic, in each model.
# NA - > The topic only contains weird tokens, doesn't represent anything
topics_names = {1920:{1:'Christianity',2:'Goverment',3:'Romantic?/Theatrical?',4:'?',5:'Big Concepts',6:'Law',7:'',8:'Kings/England',9:'Verbs?',10:''},
                1930:{1:'Religion',2:'Goverment/Kingdom',3:'',4:'Books/Ideas',5:'Law/State',6:'Romantic?/Theatrical?',7:'Earth/Nature',8:'Locations/People',9:'Kings/England',10:''},
                1940:{1:'Religion',2:'',3:'Religion/Bible',4:'Man/Ideas',5:'War/Government',6:'',7:'',8:'Law',9:'Kings/England',10:'NA?'},
                1950:{1:'Bible/Israel',2:'Romantic?/Theatrical?/Art?',3:'Religion',4:'Earth/Nature',5:'Kingdom',6:'Man/Ideas',7:'Kings/England',8:'Locations/People',9:'NA',10:'NA'},
                1960:{1:'Bible/Religion',2:'People/Titles',3:'Religion',4:'Kings/England',5:'Royalty/Family',6:'Government/Law',7:'',8:'Earth/Nature',9:'Ideas/Knowledge',10:''},
                1970:{1:'Government/War',2:'Law/Economy',3:'Religion',4:'',5:'',6:'Man/Ideas',7:'Kings/Royalty/England',8:'People/Titles',9:'Church/Religion',10:'NA'},
                1980:{1:'',2:'Government/War',3:'',4:'Ideas/Moral/Knowledge',5:'Religion',6:'Law',7:'Economy',8:'Government/Titles',9:'',10:'NA'},
                1990:{1:'',2:'',3:'Law',4:'Religion',5:'Government/War',6:'Ideas/Moral/Knowledge',7:'Family/People',8:'',9:'Countries/Land/Economy',10:'NA'}}

# Metrics Evaluation

In [12]:
for decade in range(1720,1800,10):
    i = dict_dec[decade]
    tpd = TopicDiversity(topics=models[i].get_topic_lists(10))
    inv_rbo = InvertedRBO(topics=models[i].get_topic_lists(10))
    scorei = tpd.score(topk=10)
    invscore = inv_rbo.score()
    print("For the decade {} the topic diversity score is {} and the inverse rbo is {}".format(decade,scorei,invscore))

For the decade 1720 the topic diversity score is 0.89 and the inverse rbo is 0.9768115433577778
For the decade 1730 the topic diversity score is 0.91 and the inverse rbo is 0.98406192262
For the decade 1740 the topic diversity score is 0.89 and the inverse rbo is 0.9784766743814286
For the decade 1750 the topic diversity score is 0.89 and the inverse rbo is 0.97787313116
For the decade 1760 the topic diversity score is 0.93 and the inverse rbo is 0.9879985513385714
For the decade 1770 the topic diversity score is 0.93 and the inverse rbo is 0.9816403667463492
For the decade 1780 the topic diversity score is 0.91 and the inverse rbo is 0.9806437121977778
For the decade 1790 the topic diversity score is 0.95 and the inverse rbo is 0.9892895285071428


In [13]:
training_sets[0].__getitem__(1)

{'X_bow': tensor([[0., 0., 0.,  ..., 0., 0., 0.]]),
 'X_contextual': tensor([-6.1380e-04, -1.0519e-01, -5.9324e-03,  1.4345e-01,  1.1806e-02,
          8.7712e-03, -2.1415e-02,  2.3226e-02,  2.0015e-01,  1.2212e-01,
          3.7409e-02, -9.0113e-02,  6.1101e-03, -2.6023e-01,  2.0544e-02,
         -7.8548e-02,  2.6290e-02, -7.1604e-02, -1.5225e-02,  2.5416e-02,
          1.1830e-01, -2.2116e-02,  4.0804e-02, -2.4785e-02, -1.2023e-01,
          1.0570e-01,  7.4286e-02,  7.2395e-02,  3.2210e-02,  1.2866e-02,
          7.1789e-03,  4.8495e-02,  1.4543e-01, -1.4934e-02,  2.1313e-02,
         -5.5876e-02, -8.6309e-02, -9.0029e-03, -7.9649e-02,  2.7410e-02,
         -9.9994e-02, -1.5306e-01,  3.1305e-02,  8.9551e-03, -8.7678e-02,
          6.1794e-02, -2.2437e-02,  1.4301e-01,  7.9974e-02,  7.7013e-02,
          3.1987e-02,  4.5722e-02, -1.4484e-01,  4.1722e-02,  1.3018e-01,
         -2.9368e-01, -1.7529e-02,  3.3766e-02, -9.9670e-03,  1.3986e-01,
         -2.2812e-02, -1.0265e-01, -4.0806e-

In [14]:
[str("The king and the queen").split()]

[['The', 'king', 'and', 'the', 'queen']]

In [15]:
test_sets

[<contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c5c60>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c7700>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c6860>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c4a00>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c6d70>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c5450>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c5600>,
 <contextualized_topic_models.datasets.dataset.CTMDataset at 0x7f81806c57b0>]

In [16]:
models[0].get_topic_lists()

[['king',
  'robert',
  'earl',
  'efq',
  'john',
  'henry',
  'thomas',
  'william',
  'george',
  'sir'],
 ['ham',
  'depart',
  'answered',
  'darkness',
  'accord',
  'worship',
  'flock',
  'accept',
  'repent',
  'condemn'],
 ['king',
  'great',
  'time',
  'people',
  'france',
  'prince',
  'war',
  'parliament',
  'army',
  'england'],
 ['love',
  'sir',
  'fee',
  'madam',
  'thy',
  'heav',
  'heart',
  'dear',
  'fate',
  'thou'],
 ['court',
  'cafe',
  'money',
  'pay',
  'paid',
  'defendant',
  'plaintiff',
  'law',
  'goods',
  'estate'],
 ['thy',
  'thou',
  'lord',
  'thee',
  'god',
  'thall',
  'hath',
  'art',
  'thine',
  'haft'],
 ['men',
  'religion',
  'man',
  'force',
  'true',
  'good',
  'thing',
  'reason',
  'things',
  'power'],
 ['depart',
  'answered',
  'supper',
  'maker',
  'ham',
  'accord',
  'beloved',
  'destroy',
  'wives',
  'sick'],
 ['god',
  'faith',
  'law',
  'christ',
  'gentiles',
  'gospel',
  'things',
  'life',
  'jews',
  'paul'],


In [17]:
test_sets[0].__getitem__(1)

{'X_bow': tensor([[0.]]),
 'X_contextual': tensor([-1.0904e-01, -2.9339e-01, -4.1732e-03,  5.7644e-02,  1.7722e-02,
          4.3258e-02, -1.0997e-01,  5.1512e-02,  1.9279e-01,  8.5702e-03,
         -1.3188e-01, -1.1227e-01,  1.4261e-01, -3.9253e-02, -8.3954e-02,
         -4.4456e-02, -2.2035e-02,  6.5177e-03, -2.9760e-02, -5.4112e-02,
          4.5642e-02,  2.1994e-02,  8.8462e-02,  9.1474e-02,  4.8693e-02,
          3.2454e-02, -1.1171e-01,  2.0034e-01,  8.7845e-02,  1.1436e-01,
          2.2805e-02,  4.6724e-02,  2.0878e-03, -6.5472e-02,  3.0954e-02,
         -1.9171e-02, -1.1101e-01,  3.4602e-03, -1.0040e-01, -7.7204e-02,
         -2.7136e-03, -2.0609e-01, -1.1923e-02,  4.8829e-02, -2.1711e-02,
         -2.4770e-01, -1.5633e-02, -2.3917e-02,  6.8454e-02,  1.1511e-02,
         -6.0904e-02,  6.8350e-02, -1.4830e-01,  8.8643e-02,  2.9541e-01,
         -1.1435e-01, -1.6239e-02, -2.3581e-02, -3.3960e-02, -1.5386e-01,
         -1.3477e-01, -3.0683e-02, -4.2210e-03,  6.1339e-02, -1.5319e-

In [18]:
pred_model0 = models[0].get_thetas(test_sets[0], n_samples=20) 

  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid 

In [19]:
texts = [["THE Muses are the mofl Coqtettihf of their Sex, fond of being admir\'d, and always putting on their best Airs to the finest Gentleman: But alas, Sir! Their Addrefes are jfale,- andtheir fine Things but Repetition ;for there is nothing newin tit, but what isfoundinjour.own Converfatien.CGO\'D I write bythe Help of Study, as you talk with-eNt it, I wou\'d-venture tofayfomething in the usual Strain of Dedication ;bui as you have too much Wit tofafisr it, awi Itoo little to tunertakeil, !hopetheWorld will excuf my Dfficiency, andyou ziwil pardon the Presumption of,"]]

In [20]:
npmi = CoherenceNPMI(texts=texts, topics=models[0].get_topic_lists())
npmi.score()

ValueError: unable to interpret topic as either a list of tokens or a list of ids