<a href="https://colab.research.google.com/github/DivyaRustagi10/contextualized-topic-models-ssl/blob/main/ZeroshotTM_For_Same_Script_Languages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Zero-shot Cross-Lingual Topic Modeling For Same Script Languages
> Do conxtextualized TM tackle zero-shot cross-lingual topic modeling better on same script languages?

We use 4000 documents as training and consider randomly sampled 800 documents as the test set. We collect the 800 respective instances in [LANGUAGES].

First, we use IndicBERT to generate multilingual embeddings as the input of the model. Then we evaluate multilingual topic predictions on the multilingual abstracts in test set.

In [1]:
# Install the contextualized topic model library
%%capture
!pip install -U contextualized_topic_models

In [2]:
%%capture
!pip install pyldavis
!pip install wget
!pip install head
!nvidia-smi

In [3]:
# Imports
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import  WhiteSpacePreprocessingStopwords
import pickle

## Data

###**Building PMIndia Corpus**

Below contains the code for creating a parallel corpus from the website of the Indian Prime Minister (www.pmindia.gov.in). 

We combine each speech document into one, for every language. Datasets are downloaded from [Statistical Machine Translation](https://data.statmt.org/pmindia/v1/monolingual/).

*Downloading Dataset*

In [4]:
# Imports
import urllib
import wget
import pandas as pd
import os

# Download PMIndia Datasets
FILES_DIR = os.getcwd() # REPLACE WITH YOUR DIRECTORY IF YOU PREFER DOWNLOADING IN SPECIFIC DIRECTORY
LINK = "https://drive.google.com/u/0/uc?id=1IqH2XQFw1XHPT2Sh_Yz3LnrMEVqS8oef&export=download"  # LINK TO PMINDIA FILE LOCATION
FILE_PATH = os.getcwd() + "/" + wget.download(LINK) # PATH TO DOWNLOADS

pmindia_list = pd.read_csv(FILE_PATH, sep = ",",  names = ["lang", "link"], engine = "python" )['link']
pmindia_list = [FILES_DIR + "/" + wget.download(link.strip(" ")) for link in pmindia_list]

In [5]:
# Build directory to store speech files
%%bash
mkdir parallel_speeches
cd parallel_speeches

In [6]:
# Files stored in content directory
# Data will need to be re-downloaded if a session closes
''' Following script will download parallel corpus into new directory named parallel_speeches.
Each folder in parallel_speeches contains pmindia speeches in indic language identified by their ISO code.

For e.g.
parallel_speeches/as/ contains speeches in Assamese.
parallel_speeches/hi contains speeches in Hindi.
'''
import tarfile
import os

FILES_DIR = os.getcwd() # REPLACE WITH DIRECTORY OF FILES
STORE_FILES_HERE = os.getcwd() + "/parallel_speeches" # STORE EXTRACTED FILES HERE

for fname in pmindia_list:
    tar = tarfile.open(fname, "r:gz") # unzip file
    tar.extractall(STORE_FILES_HERE)    
    foldername = fname[fname.rfind('.tgz')-2: fname.rfind('.tgz')] # get folder name
    os.rename(STORE_FILES_HERE + "/split", STORE_FILES_HERE + "/" + foldername) # rename default split folder to language name
    tar.close()

*Get Parallel Speeches*

In [7]:
import glob # SEEK FILES FROM ABOVE FOR DOWNLOAD
FOLDERS_DIR = os.getcwd() + "/parallel_speeches"  # DIRECTORY FOR SAVING PARALLEL SPEECHES

# Stores list of speeches in dictionary keyed by ISO language name
SPEECHES_IN_LANGS = {languagefolder[-2:] : sorted(list(glob.glob(languagefolder + "/*.txt")))[3:] for languagefolder in glob.glob(FOLDERS_DIR + "/*")}

In [8]:
# Get file names to find common test corpus (set of files common between all languages)
STRIP_INDEX = SPEECHES_IN_LANGS['as'][0].index('as') + len('as/')   # GET LANGUAGE ISO CODE

filenames = {}
for lang in SPEECHES_IN_LANGS.keys():
  temp = [item[STRIP_INDEX:] for item in SPEECHES_IN_LANGS[lang]]
  filenames[lang] = temp

*Combine Parallel Speeches Into Corpus*

In [9]:
# Setting seed for reproducibility
import random
random.seed(210)

# Select 800 random corpus
sample_corpus = sorted(random.sample(list(set.intersection(*map(set, filenames.values()))), 800))
sample_corpus

['15th-edition-of-pravasi-bharatiya-diwas-inaugurated-by-pm-in-varanasi.txt',
 'ambassador-ms-nikki-haley-united-states-permanent-representative-to-the-united-nations-calls-on-pm.txt',
 'anganwadi-workers-from-across-the-country-call-on-pm.txt',
 'asha-representatives-from-across-the-country-call-on-pm.txt',
 'beneficiaries-of-pradhan-mantri-mudra-yojana-share-their-success-stories-during-interaction-with-pm.txt',
 'beneficiaries-of-ujjwala-yojana-interact-with-pm.txt',
 'boost-to-indias-space-program.txt',
 'cabinet-apprised-of-agreement-between-india-and-uzbekistan-on-cooperation-in-the-field-of-science-technology-and-innovation.txt',
 'cabinet-apprised-of-an-mou-signed-between-india-and-sri-lanka-for-promoting-cooperation-in-the-field-of-information-technology-and-electronics.txt',
 'cabinet-apprised-of-framework-agreement-between-india-and-sao-tome-principe-for-cooperation-in-the-exploration-and-uses-of-outer-space-for-peaceful-purposes.txt',
 'cabinet-apprised-of-implementing-arra

In [10]:
# Combine selected 800 sample speeches into one document for each lang 
parallel_speeches = {}    # STORE TEST SPEECHES
train_speeches = {}       # STORE TRAIN SPEECHES

for lang, speeches in SPEECHES_IN_LANGS.items(): 
  each_lang = []         # list of speeches for one language at a time
  just_train = []
  
  for speech_file in speeches:                          # access list of files in speeches for one language at a time
    with open(speech_file, 'r') as speech:              # read file
      speech = " ".join([str(line) for line in speech]) # each speech file becomes one string

      if speech_file[STRIP_INDEX:] in sample_corpus:
        each_lang.append(speech)                        # append string version of speech file
      else:
        just_train.append(speech)                       # add to train set

  parallel_speeches[lang] = each_lang                   # add list of speeches for every language
  train_speeches[lang] = just_train                     # add to train set

*Split Parallel Corpus Into Test and Train*

In [11]:
# Imports
import pandas as pd
from pprint import pprint
import re

# Selecting Train speeches
hindi_unprep = pd.DataFrame(list for list in train_speeches['hi'])
english_unprep = pd.DataFrame(list for list in train_speeches['en'])

# We select speeches with at least 500 tokens
TOKENS_LIMIT = 500
remove = []
for speech in hindi_unprep[:5].itertuples():
  if len(speech[1].split(" "))  < TOKENS_LIMIT:
    remove.append(speech[0])
    print(speech[0], " removed!")

file_name = 'pmindia_hindi_unprep.txt' # SAVE FILE AS

# Save trainset - not used
with open(file_name, 'w', encoding = "utf-8") as f:     
  f.writelines("%s" % str(line)+"\t" for line in train_speeches['hi'])

x = open(file_name, "r", encoding = "utf-8")

# View each row as a speech
hindi_unprep[:5]

0  removed!
1  removed!
2  removed!
3  removed!


Unnamed: 0,0
0,नागरिकों से स्‍वच्‍छाग्रही बनने और स्‍वच्‍छ भा...
1,श्री सोमनाथ न्‍यास के न्‍यासियों की 116वीं बैठ...
2,प्रधानमंत्री श्री नरेन्द्र मोदी की अध्यक्षता म...
3,प्रधानमंत्री श्री नरेन्द्र मोदी ने आज ही के दि...
4,· खूंटी की जिला अदालत में छत पर लगने वाले सौर ...


## Training Models

###**Hindi-Based ZeroshotTM**

*Preprocessing*

Why do we use the preprocessed text here? We need text without punctuation to build the bag of word. Also, we might want only to have the most frequent words inside the BoW. Too many words might not help.

In [None]:
### HINDI ###
LANG_SELECTED = 'hi'

# We select 500 tokens per speech
NUM_TOKENS = 200 

# Download Hindi Stopwords
!pip install stopwordsiso
import stopwordsiso as stopwords

# Run preprocessing script
documents = [line[:NUM_TOKENS].strip() for line in train_speeches[LANG_SELECTED]]
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list = stopwords.stopwords(LANG_SELECTED))
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

# Ensure same length for preprocessed and unpreprocessed
print(len(preprocessed_documents), len(unpreprocessed_corpus))

We don't discard the non-preprocessed hindi texts, because we are going to use them as input for obtaining the **contextualized** document representations. We will now pass our files with preprocess and unpreprocessed data to our TopicModelDataPreparation object. This object takes care of creating the bag of words and obtains the contextualized BERT representations of documents. This operation allows us to create our training dataset.

Note: Here we use the contextualized model "ai4bharat/indic-bert", because we need a multilingual model for indic languages for performing cross-lingual predictions later.



*Training ZeroshotTM*

In [13]:
# Imports
from contextualized_topic_models.models.ctm import ZeroShotTM
import contextualized_topic_models.utils.data_preparation
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk
import pickle

In [62]:
from sentence_transformers import SentenceTransformer
import numpy as np

def bert_embeddings_from_file(text_file, sbert_model_to_load, custom=False, tokenizer_args=None, model_args=None, batch_size=200):
    """
    Creates SBERT Embeddings from an input file
    """
    from sentence_transformers import SentenceTransformer, models

    # For indicBERT
    if sbert_model_to_load == 'ai4bharat/indic-bert':
      custom = True
      tokenizer_args = {"keep_accents": True}

    # If build custom sentence transformer model
    if custom:
      if tokenizer_args and model_args:   # both set of arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, 
                                                  tokenizer_args=tokenizer_args, 
                                                  model_args = model_args)
      
      elif tokenizer_args:   # tokenizer arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, tokenizer_args = tokenizer_args)

      elif model_args:   # model arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, model_args = model_args)
      
      else:
        word_embedding_model = models.Transformer(sbert_model_to_load)

      # Pass modules to build sentence transformer model
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
    
    # Else retrieve model from hugging face
    else:
        model = SentenceTransformer(sbert_model_to_load)

    with open(text_file, encoding="utf-8") as filino:
        train_text = list(map(lambda x: x, filino.readlines()))

    return np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size))


def bert_embeddings_from_list(texts, sbert_model_to_load, custom=False, tokenizer_args=None, model_args=None, batch_size=200):
    """
    Creates SBERT Embeddings from a list
    """
    from sentence_transformers import SentenceTransformer, models

    # For indicBERT
    if sbert_model_to_load == 'ai4bharat/indic-bert':
      custom = True
      tokenizer_args = {"keep_accents": True}

    # If build custom sentence transformer model
    if custom:
      if tokenizer_args and model_args:   # both set of arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, 
                                                  tokenizer_args=tokenizer_args, 
                                                  model_args = model_args)
      
      elif tokenizer_args:   # tokenizer arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, tokenizer_args = tokenizer_args)

      elif model_args:   # model arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, model_args = model_args)
      
      else:
        word_embedding_model = models.Transformer(sbert_model_to_load)

      # Pass modules to build sentence transformer model
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
    
    # Else retrieve model from hugging face
    else:
        model = SentenceTransformer(sbert_model_to_load)

    return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size))

contextualized_topic_models.utils.data_preparation.bert_embeddings_from_file = bert_embeddings_from_file
contextualized_topic_models.utils.data_preparation.bert_embeddings_from_list = bert_embeddings_from_list


In [68]:
sbert_model_to_load = 'ai4bharat/indic-bert'
tokenizer_args = {"keep_accents": True}
model_args = {}

word_embedding_model = models.Transformer(sbert_model_to_load, 
                                                  tokenizer_args=tokenizer_args, 
                                                  model_args = model_args)

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.dense.bias', 'sop_classifier.classifier.bias', 'sop_classifier.classifier.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [69]:
print(model.tokenizer.tokenize("यहाँ") == model.tokenizer.tokenize("यह"))

False


In [None]:
# Load Indic Multilingual embeddings 
tp = TopicModelDataPreparation('ai4bharat/indic-bert')

# Building training dataset
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

In [None]:
# Train over 100 epochs

### HINDI : 25 TOPICS ###
z_ctm_25_HI = ZeroShotTM(bow_size=len(tp.vocab), n_components = 25, contextual_size=768, num_epochs=100)

z_ctm_25_HI.fit(training_dataset, n_samples = 100) # run the model
z_ctm_25_HI.save("./") # save the model

# ### HINDI : 50 TOPICS ###
z_ctm_50_HI = ZeroShotTM(bow_size=len(tp.vocab), n_components = 50, contextual_size=768, num_epochs=100)

z_ctm_50_HI.fit(training_dataset, n_samples = 100) # run the model
z_ctm_50_HI.save("./") # save the model

In [66]:
# See topic predictions per speech doc
z_ctm_25_HI.get_topic_lists(5)

[['दश', 'बजट', 'करन', 'रलव', 'सतर'],
 ['उठ', 'उच', 'तपस', 'नह', 'बढग'],
 ['मर', 'मच', 'कदर', 'जय', 'बहत'],
 ['बहत', 'मन', 'हम', 'मझ', 'पहल'],
 ['नरनदर', 'हए', 'आज', 'नई', 'करत'],
 ['एव', 'परबधन', 'सरकषण', 'तहत', 'जल'],
 ['कषतर', 'एमओय', 'एव', 'पतर', 'परयटन'],
 ['करग', 'उच', 'उठ', 'तपस', 'बढग'],
 ['हमल', 'हए', 'हम', 'करत', 'वयकत'],
 ['रपय', 'गई', 'रपए', 'अनगरह', 'दन'],
 ['बहत', 'मझ', 'हम', 'दश', 'आपक'],
 ['in', 'http', 'अपन', 'of', 'to'],
 ['तहत', 'वरष', 'करन', 'इसस', 'पद'],
 ['आज', 'हए', 'नरनदर', 'करत', 'अवसर'],
 ['कषतर', 'एव', 'एमओय', 'पतर', 'परयटन'],
 ['दश', 'रह', 'आज', 'अमर', 'हम'],
 ['मझ', 'बहत', 'मन', 'पहल', 'आपक'],
 ['रल', 'परदश', 'चरण', 'एकड', 'आधर'],
 ['उनक', 'उठ', 'उच', 'तपस', 'टस'],
 ['अवसर', 'सदश', 'नमन', 'अपन', 'गर'],
 ['सफल', 'परकषपण', 'उठ', 'उच', 'बढग'],
 ['आज', 'हम', 'करत', 'अवसर', 'अपन'],
 ['बठक', 'सयकत', 'सममलन', 'नरनदर', 'हई'],
 ['करन', 'मदद', 'पश', 'दन', 'इसक'],
 ['मझ', 'आपक', 'मर', 'excellency', 'मरकल']]

In [67]:
# See topic predictions per speech doc
z_ctm_50_HI.get_topic_lists(5)

[['पतर', 'एमओय', 'कषतर', 'उपकरण', 'एव'],
 ['दश', 'रह', 'मर', 'आज', 'जय'],
 ['दश', 'बहत', 'आज', 'हम', 'कछ'],
 ['तहत', 'वरष', 'करन', 'वतन', 'इसक'],
 ['अपन', 'फसबक', 'सममलन', 'mygov', 'http'],
 ['मझ', 'मर', 'media', 'economic', 'excellency'],
 ['हए', 'नरनदर', 'करत', 'अवसर', 'रप'],
 ['बहत', 'मझ', 'आज', 'मर', 'दश'],
 ['नरनदर', 'शख', 'उनह', 'अबदल', 'अल'],
 ['रपय', 'गई', 'तहत', 'एकड', 'इसक'],
 ['करत', 'हम', 'नमन', 'शत', 'अवसर'],
 ['बहत', 'मझ', 'हम', 'दश', 'रप'],
 ['बठक', 'गई', 'करन', 'सख', 'हई'],
 ['हए', 'करत', 'आज', 'नरनदर', 'अवसर'],
 ['अवसर', 'हम', 'खल', 'अपन', 'सदश'],
 ['मर', 'कदर', 'ससद', 'एन', 'मझ'],
 ['बहत', 'आज', 'दश', 'हम', 'हए'],
 ['कषतर', 'मदद', 'इसस', 'एव', 'पतर'],
 ['of', 'the', 'and', 'to', 'in'],
 ['आज', 'हए', 'करत', 'अवसर', 'नई'],
 ['करन', 'मदद', 'इसस', 'आय', 'घरल'],
 ['करन', 'गई', 'तहत', 'रपय', 'दन'],
 ['सदश', 'सफल', 'परकषपण', 'अपन', 'गरव'],
 ['मन', 'मझ', 'कछ', 'हआ', 'पहल'],
 ['मध', 'उजबक', 'तपस', 'जलम', 'अधर'],
 ['कषतर', 'एव', 'एमओय', 'जल', 'परबधन'],
 ['हई', 'बस', 'गए', 'दख',

###**English-Based ZeroshotTM**

*Preprocessing*

Why do we use the preprocessed text here? We need text without punctuation to build the bag of word. Also, we might want only to have the most frequent words inside the BoW. Too many words might not help.

In [73]:
### ENGLISH ###
LANG_SELECTED = 'en'

# We select 500 tokens per speech
NUM_TOKENS = 500 

# Download English Stopwords
nltk.download('stopwords')

# Run preprocessing script
documents = [line[:NUM_TOKENS].strip() for line in train_speeches[LANG_SELECTED]]
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list = stopwords.stopwords(LANG_SELECTED))
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

# Ensure same length for preprocessed and unpreprocessed
print(len(preprocessed_documents), len(unpreprocessed_corpus))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
4489 4489




We don't discard the non-preprocessed english texts, because we are going to use them as input for obtaining the **contextualized** document representations. We will now pass our files with preprocess and unpreprocessed data to our TopicModelDataPreparation object. This object takes care of creating the bag of words and obtains the contextualized BERT representations of documents. This operation allows us to create our training dataset.

Note: Here we use the contextualized model "ai4bharat/indic-bert", because we need a multilingual model for indic languages for performing cross-lingual predictions later.



In [74]:
# Building training dataset
tp = TopicModelDataPreparation("ai4bharat/indic-bert")
en_training = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/723 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/402 [00:00<?, ?B/s]

Batches:   0%|          | 0/23 [00:00<?, ?it/s]



*Training ZeroshotTM*

In [None]:
# Train over 100 epochs

### ENGLISH : 25 TOPICS ###
z_ctm_25_EN = ZeroShotTM(bow_size=len(tp.vocab), n_components = 25, contextual_size=768, num_epochs=100)
z_ctm_25_EN.fit(en_training) # run the model
z_ctm_25_EN.save("./") # save the model

### ENGLISH : 50 TOPICS ###
z_ctm_50_EN = ZeroShotTM(bow_size=len(tp.vocab), n_components = 50, contextual_size=768, num_epochs=100)
z_ctm_50_EN.fit(en_training) # run the model
z_ctm_50_EN.save("./") # save the model

In [76]:
# See topic predictions per speech doc
z_ctm_25_EN.get_topic_lists(5)

[['excellency', 'media', 'delighted', 'distinguished', 'friends'],
 ['day', 'indian', 'congratulated', 'salute', 'winning'],
 ['prime', 'minister', 'modi', 'called', 'india'],
 ['मर', 'पर', 'और', 'मच', 'जय'],
 ['scientists', 'satellite', 'bharat', 'isro', 'launch'],
 ['हम', 'एक', 'और', 'आज', 'बहत'],
 ['और', 'आज', 'एक', 'बहत', 'पर'],
 ['people', 'day', 'greeted', 'wishes', 'occasion'],
 ['और', 'पर', 'मर', 'हम', 'आप'],
 ['mou', 'cooperation', 'understanding', 'memorandum', 'approval'],
 ['budget', 'time', 'namaskar', 'countrymen', 'baat'],
 ['mou', 'cooperation', 'understanding', 'union', 'memorandum'],
 ['cabinet', 'land', 'union', 'chaired', 'approval'],
 ['shri', 'meeting', 'chief', 'minister', 'government'],
 ['पर', 'हम', 'और', 'एक', 'इस'],
 ['cabinet', 'central', 'union', 'chaired', 'approval'],
 ['governance', 'interaction', 'based', 'progress', 'implementation'],
 ['yojana', 'development', 'scheme', 'crore', 'pradhan'],
 ['india', 'summit', 'visit', 'president', 'japan'],
 ['lives

In [77]:
# See topic predictions per speech doc
z_ctm_50_EN.get_topic_lists(5)

[['land', 'cabinet', 'union', 'acres', 'institutes'],
 ['minister', 'prime', 'modi', 'nepal', 'narendra'],
 ['आप', 'आज', 'पर', 'इस', 'रह'],
 ['radio', 'share', 'programme', 'invited', 'mygov'],
 ['day', 'salute', 'saluted', 'personnel', 'armed'],
 ['condoled', 'life', 'shri', 'passing', 'demise'],
 ['called', 'interacted', 'secretaries', 'development', 'delegation'],
 ['people', 'greeted', 'day', 'occasion', 'auspicious'],
 ['और', 'आज', 'पर', 'एक', 'रप'],
 ['crore', 'rs', 'project', 'cost', 'approved'],
 ['progress', 'infrastructure', 'projects', 'sectors', 'reviewed'],
 ['cabinet', 'union', 'chaired', 'approved', 'crore'],
 ['minister', 'prime', 'modi', 'called', 'bilateral'],
 ['central', 'pay', 'cabinet', 'approved', 'chaired'],
 ['ससद', 'और', 'कदर', 'सदन', 'परदश'],
 ['lives', 'loss', 'expressed', 'families', 'injured'],
 ['occasion', 'book', 'speaking', 'shri', 'released'],
 ['union', 'agreement', 'chaired', 'cabinet', 'approved'],
 ['demise', 'condoled', 'passing', 'family', 'cond

## Predictions and Evaluation
###**Unseen Multilingual  Corpora Predictions**

*Languages*

* Assamese - as
* Bengali - bn
* English - en
* Gujarati - gu
* Hindi - hi
* Kannada - kn
* Malayalam - ml
* Marathi - mr
* Oriya - or
* Punjabi - pa
* Tamil - ta
* Telugu - te

In [None]:
# Convert test files into test datasets
as_testset = tp.transform(parallel_speeches['as'])
bn_testset = tp.transform(parallel_speeches['bn'])
en_testset = tp.transform(parallel_speeches['en'])
gu_testset = tp.transform(parallel_speeches['gu'])
hi_testset = tp.transform(parallel_speeches['hi'])
kn_testset = tp.transform(parallel_speeches['kn'])
ml_testset = tp.transform(parallel_speeches['ml'])
mr_testset = tp.transform(parallel_speeches['mr'])
or_testset = tp.transform(parallel_speeches['or'])
pa_testset = tp.transform(parallel_speeches['pa'])
ta_testset = tp.transform(parallel_speeches['ta'])
te_testset = tp.transform(parallel_speeches['te'])

###**Topic Predictions**

*Hindi*

In [None]:
### HINDI : 25 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_25_HI.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_25_HI.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_25_HI.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_25_HI.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_25_HI.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_25_HI.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_25_HI.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_25_HI.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_25_HI.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_25_HI.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_25_HI.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_25_HI.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_25_HI = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

In [None]:
### HINDI : 50 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_50_HI.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_50_HI.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_50_HI.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_50_HI.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_50_HI.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_50_HI.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_50_HI.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_50_HI.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_50_HI.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_50_HI.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_50_HI.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_50_HI.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_50_HI = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

*English*

In [None]:
### ENGLISH : 25 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_25_EN.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_25_EN.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_25_EN.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_25_EN.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_25_EN.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_25_EN.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_25_EN.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_25_EN.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_25_EN.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_25_EN.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_25_EN.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_25_EN.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_25_EN = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

In [None]:
### ENGLISH : 50 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_50_EN.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_50_EN.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_50_EN.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_50_EN.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_50_EN.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_50_EN.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_50_EN.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_50_EN.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_50_EN.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_50_EN.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_50_EN.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_50_EN.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_50_EN = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

**Quantitative Evaluation**

In [None]:
# Import metrics
from contextualized_topic_models.evaluation.measures import Matches, KLDivergence, CentroidDistance
import warnings
warnings.filterwarnings('ignore')

1. **Matches**

> Matches is the % of times the predicted topic for the non-English test document is the same as for the respective test document in English. The higher the scores, the better.

*Hindi*

In [84]:
# HINDI : Matches for 25 topics
hi_as_matches = Matches(topics_25_HI['hi'], topics_25_HI['as'])
hi_bn_matches = Matches(topics_25_HI['hi'], topics_25_HI['bn'])
hi_en_matches = Matches(topics_25_HI['hi'], topics_25_HI['en'])
hi_gu_matches = Matches(topics_25_HI['hi'], topics_25_HI['gu'])
hi_kn_matches = Matches(topics_25_HI['hi'], topics_25_HI['kn'])
hi_ml_matches = Matches(topics_25_HI['hi'], topics_25_HI['ml'])
hi_mr_matches = Matches(topics_25_HI['hi'], topics_25_HI['mr'])
hi_or_matches = Matches(topics_25_HI['hi'], topics_25_HI['or'])
hi_pa_matches = Matches(topics_25_HI['hi'], topics_25_HI['pa'])
hi_ta_matches = Matches(topics_25_HI['hi'], topics_25_HI['ta'])
hi_te_matches = Matches(topics_25_HI['hi'], topics_25_HI['te'])


matches_25_HI = {'as': hi_as_matches.score(), 'bn': hi_bn_matches.score(), 
             'en': hi_en_matches.score(), 'gu': hi_gu_matches.score(),
             'kn': hi_kn_matches.score(),
             'ml': hi_ml_matches.score(), 'mr': hi_mr_matches.score(),
             'or': hi_or_matches.score(), 'pa': hi_pa_matches.score(),
             'ta': hi_ta_matches.score(), 'te': hi_te_matches.score()}
matches_25_HI

{'as': 0.7875,
 'bn': 0.7725,
 'en': 0.78625,
 'gu': 0.79125,
 'kn': 0.72125,
 'ml': 0.655,
 'mr': 0.8425,
 'or': 0.58375,
 'pa': 0.77125,
 'ta': 0.70125,
 'te': 0.7925}

In [85]:
# HINDI : Matches for 50 topics
hi_as_matches = Matches(topics_50_HI['hi'], topics_50_HI['as'])
hi_bn_matches = Matches(topics_50_HI['hi'], topics_50_HI['bn'])
hi_en_matches = Matches(topics_50_HI['hi'], topics_50_HI['en'])
hi_gu_matches = Matches(topics_50_HI['hi'], topics_50_HI['gu'])
hi_kn_matches = Matches(topics_50_HI['hi'], topics_50_HI['kn'])
hi_ml_matches = Matches(topics_50_HI['hi'], topics_50_HI['ml'])
hi_mr_matches = Matches(topics_50_HI['hi'], topics_50_HI['mr'])
hi_or_matches = Matches(topics_50_HI['hi'], topics_50_HI['or'])
hi_pa_matches = Matches(topics_50_HI['hi'], topics_50_HI['pa'])
hi_ta_matches = Matches(topics_50_HI['hi'], topics_50_HI['ta'])
hi_te_matches = Matches(topics_50_HI['hi'], topics_50_HI['te'])


matches_50_HI = {'as': hi_as_matches.score(), 'bn': hi_bn_matches.score(), 
             'en': hi_en_matches.score(), 'gu': hi_gu_matches.score(),
             'kn': hi_kn_matches.score(),
             'ml': hi_ml_matches.score(), 'mr': hi_mr_matches.score(),
             'or': hi_or_matches.score(), 'pa': hi_pa_matches.score(),
             'ta': hi_ta_matches.score(), 'te': hi_te_matches.score()}
matches_50_HI

{'as': 0.33625,
 'bn': 0.56875,
 'en': 0.61,
 'gu': 0.6625,
 'kn': 0.59125,
 'ml': 0.58625,
 'mr': 0.67,
 'or': 0.6075,
 'pa': 0.61375,
 'ta': 0.55125,
 'te': 0.4125}

*English*

In [86]:
# ENGLISH : Matches for 25 topics
en_as_matches = Matches(topics_25_EN['en'], topics_25_EN['as'])
en_bn_matches = Matches(topics_25_EN['en'], topics_25_EN['bn'])
en_hi_matches = Matches(topics_25_EN['en'], topics_25_EN['hi'])
en_gu_matches = Matches(topics_25_EN['en'], topics_25_EN['gu'])
en_kn_matches = Matches(topics_25_EN['en'], topics_25_EN['kn'])
en_ml_matches = Matches(topics_25_EN['en'], topics_25_EN['ml'])
en_mr_matches = Matches(topics_25_EN['en'], topics_25_EN['mr'])
en_or_matches = Matches(topics_25_EN['en'], topics_25_EN['or'])
en_pa_matches = Matches(topics_25_EN['en'], topics_25_EN['pa'])
en_ta_matches = Matches(topics_25_EN['en'], topics_25_EN['ta'])
en_te_matches = Matches(topics_25_EN['en'], topics_25_EN['te'])


matches_25_EN = {'as': en_as_matches.score(), 'bn': en_bn_matches.score(), 
             'en': en_hi_matches.score(), 'gu': en_gu_matches.score(),
             'kn': en_kn_matches.score(),
             'ml': en_ml_matches.score(), 'mr': en_mr_matches.score(),
             'or': en_or_matches.score(), 'pa': en_pa_matches.score(),
             'ta': en_ta_matches.score(), 'te': en_te_matches.score()}
matches_25_EN

{'as': 0.0825,
 'bn': 0.30875,
 'en': 0.66625,
 'gu': 0.5725,
 'kn': 0.365,
 'ml': 0.33875,
 'mr': 0.61125,
 'or': 0.20625,
 'pa': 0.4275,
 'ta': 0.41375,
 'te': 0.2475}

In [87]:
# ENGLISH : Matches for 50 topics
en_as_matches = Matches(topics_50_EN['en'], topics_50_EN['as'])
en_bn_matches = Matches(topics_50_EN['en'], topics_50_EN['bn'])
en_en_matches = Matches(topics_50_EN['en'], topics_50_EN['hi'])
en_gu_matches = Matches(topics_50_EN['en'], topics_50_EN['gu'])
en_kn_matches = Matches(topics_50_EN['en'], topics_50_EN['kn'])
en_ml_matches = Matches(topics_50_EN['en'], topics_50_EN['ml'])
en_mr_matches = Matches(topics_50_EN['en'], topics_50_EN['mr'])
en_or_matches = Matches(topics_50_EN['en'], topics_50_EN['or'])
en_pa_matches = Matches(topics_50_EN['en'], topics_50_EN['pa'])
en_ta_matches = Matches(topics_50_EN['en'], topics_50_EN['ta'])
en_te_matches = Matches(topics_50_EN['en'], topics_50_EN['te'])


matches_50_EN = {'as': en_as_matches.score(), 'bn': en_bn_matches.score(), 
             'en': en_hi_matches.score(), 'gu': en_gu_matches.score(),
             'kn': en_kn_matches.score(),
             'ml': en_ml_matches.score(), 'mr': en_mr_matches.score(),
             'or': en_or_matches.score(), 'pa': en_pa_matches.score(),
             'ta': en_ta_matches.score(), 'te': en_te_matches.score()}
matches_50_EN

{'as': 0.03625,
 'bn': 0.2225,
 'en': 0.66625,
 'gu': 0.4875,
 'kn': 0.31375,
 'ml': 0.29625,
 'mr': 0.53125,
 'or': 0.10625,
 'pa': 0.3825,
 'ta': 0.36375,
 'te': 0.21625}

2. **Distributional Similarity**
> Compute the KL divergence between the predicted topic distribution on the test document and the same test document in English. Lower scores are better, indicating that the distributions do not differ by much.

*Hindi*

In [88]:
# HINDI : KL Divergence for 25 topics
hi_as_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['as'])
hi_bn_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['bn'])
hi_en_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['en'])
hi_gu_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['gu'])
hi_kn_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['kn'])
hi_ml_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['ml'])
hi_mr_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['mr'])
hi_or_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['or'])
hi_pa_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['pa'])
hi_ta_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['ta'])
hi_te_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['te'])

kl_divergence_25 = {'as': hi_as_kl.score(), 'bn': hi_bn_kl.score(), 
             'en': hi_en_kl.score(), 'gu': hi_gu_kl.score(),
             'kn': hi_kn_kl.score(),
             'ml': hi_ml_kl.score(), 'mr': hi_mr_kl.score(),
             'or': hi_or_kl.score(), 'pa': hi_pa_kl.score(),
             'ta': hi_ta_kl.score(), 'te': hi_te_kl.score()}

kl_divergence_25

{'as': 0.21418631882747846,
 'bn': 0.12554456053699053,
 'en': 0.13213215643888646,
 'gu': 0.10491287285979663,
 'kn': 0.2057592232509147,
 'ml': 0.25233703837194676,
 'mr': 0.07890429517642374,
 'or': 0.2163793272930214,
 'pa': 0.14377184856990705,
 'ta': 0.18361980804018363,
 'te': 0.15718247530481724}

In [89]:
# HINDI : KL Divergence for 50 topics
hi_as_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['as'])
hi_bn_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['bn'])
hi_en_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['en'])
hi_gu_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['gu'])
hi_kn_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['kn'])
hi_ml_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['ml'])
hi_mr_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['mr'])
hi_or_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['or'])
hi_pa_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['pa'])
hi_ta_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['ta'])
hi_te_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['te'])

kl_divergence_50 = {'as': hi_as_kl.score(), 'bn': hi_bn_kl.score(), 
             'en': hi_en_kl.score(), 'gu': hi_gu_kl.score(),
             'kn': hi_kn_kl.score(),
             'ml': hi_ml_kl.score(), 'mr': hi_mr_kl.score(),
             'or': hi_or_kl.score(), 'pa': hi_pa_kl.score(),
             'ta': hi_ta_kl.score(), 'te': hi_te_kl.score()}

kl_divergence_50

{'as': 0.33664856132071214,
 'bn': 0.16565056286805657,
 'en': 0.14263255339547315,
 'gu': 0.12946210087417628,
 'kn': 0.1891453908052101,
 'ml': 0.21693590273388097,
 'mr': 0.09794668646263932,
 'or': 0.40150010351315507,
 'pa': 0.2065258857447768,
 'ta': 0.20984083258929032,
 'te': 0.25220701287524033}

*English*

In [90]:
# ENGLISH : KL Divergence for 25 topics
en_as_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['as'])
en_bn_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['bn'])
en_hi_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['hi'])
en_gu_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['gu'])
en_kn_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['kn'])
en_ml_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['ml'])
en_mr_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['mr'])
en_or_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['or'])
en_pa_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['pa'])
en_ta_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['ta'])
en_te_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['te'])

kl_divergence_25_EN = {'as': en_as_kl.score(), 'bn': en_bn_kl.score(), 
             'hi': en_hi_kl.score(), 'gu': en_gu_kl.score(),
             'kn': en_kn_kl.score(),
             'ml': en_ml_kl.score(), 'mr': en_mr_kl.score(),
             'or': en_or_kl.score(), 'pa': en_pa_kl.score(),
             'ta': en_ta_kl.score(), 'te': en_te_kl.score()}

kl_divergence_25_EN

{'as': 1.0313274518450293,
 'bn': 0.6706951475556343,
 'gu': 0.37721976672663243,
 'hi': 0.233475236024007,
 'kn': 0.607528446829308,
 'ml': 0.6483922216671271,
 'mr': 0.2910209284263246,
 'or': 0.8899447242015774,
 'pa': 0.5282167309451878,
 'ta': 0.5242024025075205,
 'te': 0.7776165519251892}

In [91]:
# ENGLISH : KL Divergence for 50 topics
en_as_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['as'])
en_bn_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['bn'])
en_hi_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['hi'])
en_gu_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['gu'])
en_kn_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['kn'])
en_ml_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['ml'])
en_mr_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['mr'])
en_or_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['or'])
en_pa_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['pa'])
en_ta_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['ta'])
en_te_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['te'])

kl_divergence_50_EN = {'as': en_as_kl.score(), 'bn': en_bn_kl.score(), 
             'hi': en_hi_kl.score(), 'gu': en_gu_kl.score(),
             'kn': en_kn_kl.score(),
             'ml': en_ml_kl.score(), 'mr': en_mr_kl.score(),
             'or': en_or_kl.score(), 'pa': en_pa_kl.score(),
             'ta': en_ta_kl.score(), 'te': en_te_kl.score()}

kl_divergence_50_EN

{'as': 1.073360061410766,
 'bn': 0.7502772156923169,
 'gu': 0.4397040125318557,
 'hi': 0.2801557912564922,
 'kn': 0.6816490982442756,
 'ml': 0.7169776503501152,
 'mr': 0.3532931141486461,
 'or': 0.9813963448325722,
 'pa': 0.5775319888935541,
 'ta': 0.5880631427640578,
 'te': 0.8074590934014203}

3. **Centroid Embeddings**
> To also account for similar but not exactly equal topic predictions, we compute the centroid embeddings of the 5 words describing the predicted topic for both English and non-English documents. Then we compute the cosine similarity between those two centroids (CD).

In [92]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import KeyedVectors
import gensim.downloader as api
from scipy.spatial.distance import cosine
import abc

class CD(CentroidDistance):
    """Override author's function to upgrade compatibility with Gensim 4.0.0.
    See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4."""

    def get_centroid(self, word_list):
        vector_list = []
        for word in word_list:
            if word in self.wv:   # changed from self.wv.vocab to self.wv as in Gensim 4.0.0
                vector_list.append(self.wv.get_vector(word))
        vec = sum(vector_list)
        return vec / np.linalg.norm(vec)

*Hindi*

In [94]:
# HINDI : Centroid Embeddings for 25 topics
cd_25_HI = {}

for key in topics_25_HI.keys():
  if key == 'hi':
    continue
  topic = topics_25_HI[key]
  cd = CD(doc_distribution_original_language = topics_25_HI['hi'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_25_HI.get_topic_lists(25),
          topk = 5)
  
  cd_25_HI[key] = cd.score()

cd_25_HI



{'as': nan,
 'bn': nan,
 'en': nan,
 'gu': nan,
 'kn': nan,
 'ml': nan,
 'mr': nan,
 'or': nan,
 'pa': nan,
 'ta': nan,
 'te': nan}

In [None]:
# HINDI : Centroid Embeddings for 50 topics
cd_50_HI = {}

for key in topics_50_HI.keys():
  if key == 'hi':
    continue
  topic = topics_50_HI[key]
  cd = CD(doc_distribution_original_language = topics_50_HI['hi'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_50_HI.get_topic_lists(50),
          topk = 5)
  
  cd_50_HI[key] = cd.score()
  cd = None

cd_50_HI

*English*

In [None]:
# ENGLISH : Centroid Embeddings for 25 topics
cd_25_EN = {}

for key in topics_25_EN.keys():
  if key == 'en':
    continue
  topic = topics_25_EN[key]
  cd = CD(doc_distribution_original_language = topics_25_EN['en'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_25_EN.get_topic_lists(25),
          topk = 5)
  
  cd_25_EN[key] = cd.score()

cd_25_EN

In [None]:
# ENGLISH : Centroid Embeddings for 50 topics
cd_50_EN = {}

for key in topics_50_EN.keys():
  if key == 'hi':
    continue
  topic = topics_50_EN[key]
  cd = CD(doc_distribution_original_language = topics_50_EN['en'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_50_EN.get_topic_lists(50),
          topk = 5)
  
  cd_50_EN[key] = cd.score()
  cd = None

cd_50_EN

In [None]:
# Store Metrics
metrics = {
          "Hindi" : [{
                    "Mat25": matches_25_HI,
                    "KL25": kl_divergence_25_HI, 
                    "CD25": cd_25_HI, 
                    "Mat50": matches_50_HI, 
                    "KL50": kl_divergence_50_HI,
                    "CD50": cd_50_HI
                    }],

          "English": [{
                    "Mat25": matches_25_EN,
                    "KL25": kl_divergence_25_EN, 
                    "CD25": cd_25_EN, 
                    "Mat50": matches_50_EN, 
                    "KL50": kl_divergence_50_EN,
                    "CD50": cd_50_EN
                    }]
          }
with open("metrics_samescript.txt", 'wb') as F:
  pickle.dump(metrics, F)

metrics