<a href="https://colab.research.google.com/github/DivyaRustagi10/contextualized-topic-models-ssl/blob/main/ZeroshotTM_For_Same_Script_Languages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Zero-shot Cross-Lingual Topic Modeling For Same Script Languages
> Do conxtextualized TM tackle zero-shot cross-lingual topic modeling better on same script languages?

We use 4000 documents as training and consider randomly sampled 800 documents as the test set. We collect the 800 respective instances in [LANGUAGES].

First, we use IndicBERT to generate multilingual embeddings as the input of the model. Then we evaluate multilingual topic predictions on the multilingual abstracts in test set.

In [1]:
# Install the contextualized topic model library
%%capture
!pip install -U contextualized_topic_models

In [2]:
%%capture
!pip install pyldavis
!pip install wget
!pip install head
!nvidia-smi

# Setup Hindi for analysis
!pip install indic-nlp-library==0.81
!pip install stopwordsiso
!pip install torch==1.3.1+cpu -f https://download.pytorch.org/whl/torch_stable.html
!pip install inltk

In [3]:
# Imports
from contextualized_topic_models.models.ctm import ZeroShotTM
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import  WhiteSpacePreprocessingStopwords
import pickle

In [4]:
from inltk.inltk import setup

try:
  setup('hi')
except:
  pass

## Data

###**Building PMIndia Corpus**

Below contains the code for creating a parallel corpus from the website of the Indian Prime Minister (www.pmindia.gov.in). 

We combine each speech document into one, for every language. Datasets are downloaded from [Statistical Machine Translation](https://data.statmt.org/pmindia/v1/monolingual/).

*Downloading Dataset*

In [5]:
# Imports
import urllib
import wget
import pandas as pd
import os

# Download PMIndia Datasets
FILES_DIR = os.getcwd() # REPLACE WITH YOUR DIRECTORY IF YOU PREFER DOWNLOADING IN SPECIFIC DIRECTORY
LINK = "https://drive.google.com/u/0/uc?id=1IqH2XQFw1XHPT2Sh_Yz3LnrMEVqS8oef&export=download"  # LINK TO PMINDIA FILE LOCATION
FILE_PATH = os.getcwd() + "/" + wget.download(LINK) # PATH TO DOWNLOADS

pmindia_list = pd.read_csv(FILE_PATH, sep = ",",  names = ["lang", "link"], engine = "python" )['link']
pmindia_list = [FILES_DIR + "/" + wget.download(link.strip(" ")) for link in pmindia_list]

In [6]:
# Build directory to store speech files
%%bash
mkdir parallel_speeches
cd parallel_speeches

In [7]:
# Files stored in content directory
# Data will need to be re-downloaded if a session closes
''' Following script will download parallel corpus into new directory named parallel_speeches.
Each folder in parallel_speeches contains pmindia speeches in indic language identified by their ISO code.

For e.g.
parallel_speeches/as/ contains speeches in Assamese.
parallel_speeches/hi contains speeches in Hindi.
'''
import tarfile
import os

FILES_DIR = os.getcwd() # REPLACE WITH DIRECTORY OF FILES
STORE_FILES_HERE = os.getcwd() + "/parallel_speeches" # STORE EXTRACTED FILES HERE

for fname in pmindia_list:
    tar = tarfile.open(fname, "r:gz") # unzip file
    tar.extractall(STORE_FILES_HERE)    
    foldername = fname[fname.rfind('.tgz')-2: fname.rfind('.tgz')] # get folder name
    os.rename(STORE_FILES_HERE + "/split", STORE_FILES_HERE + "/" + foldername) # rename default split folder to language name
    tar.close()

*Get Parallel Speeches*

In [8]:
import glob # SEEK FILES FROM ABOVE FOR DOWNLOAD
FOLDERS_DIR = os.getcwd() + "/parallel_speeches"  # DIRECTORY FOR SAVING PARALLEL SPEECHES

# Stores list of speeches in dictionary keyed by ISO language name
SPEECHES_IN_LANGS = {languagefolder[-2:] : sorted(list(glob.glob(languagefolder + "/*.txt")))[3:] for languagefolder in glob.glob(FOLDERS_DIR + "/*")}

Downloading Model. This might take time, depending on your internet connection. Please be patient.
We'll only do this for the first time.


In [9]:
# Get file names to find common test corpus (set of files common between all languages)
STRIP_INDEX = SPEECHES_IN_LANGS['as'][0].index('as') + len('as/')   # GET LANGUAGE ISO CODE

filenames = {}
for lang in SPEECHES_IN_LANGS.keys():
  temp = [item[STRIP_INDEX:] for item in SPEECHES_IN_LANGS[lang]]
  filenames[lang] = temp

*Combine Parallel Speeches Into Corpus*

In [10]:
# Setting seed for reproducibility
import random
random.seed(210)

# Select 800 random corpus
sample_corpus = sorted(random.sample(list(set.intersection(*map(set, filenames.values()))), 800))
pd.DataFrame(sample_corpus, columns = ["Speech File Name"])

Unnamed: 0,Speech File Name
0,15th-edition-of-pravasi-bharatiya-diwas-inaugu...
1,ambassador-ms-nikki-haley-united-states-perman...
2,anganwadi-workers-from-across-the-country-call...
3,asha-representatives-from-across-the-country-c...
4,beneficiaries-of-pradhan-mantri-mudra-yojana-s...
...,...
795,the-budget-for-new-india-will-energise-the-nat...
796,uk-india-joint-statement-during-pms-visit-to-u...
797,up-cm-donates-rs-five-crore-towards-pmnrf.txt
798,us-secretary-of-state-rex-w-tillerson-calls-on...


In [11]:
# Combine selected 800 sample speeches into one document for each lang 
parallel_speeches = {}    # STORE TEST SPEECHES
train_speeches = {}       # STORE TRAIN SPEECHES

for lang, speeches in SPEECHES_IN_LANGS.items(): 
  each_lang = []         # list of speeches for one language at a time
  just_train = []
  
  for speech_file in speeches:                          # access list of files in speeches for one language at a time
    with open(speech_file, 'r') as speech:              # read file
      speech = " ".join([str(line) for line in speech]) # each speech file becomes one string

      if speech_file[STRIP_INDEX:] in sample_corpus:
        each_lang.append(speech)                        # append string version of speech file
      else:
        just_train.append(speech)                       # add to train set

  parallel_speeches[lang] = each_lang                   # add list of speeches for every language
  train_speeches[lang] = just_train                     # add to train set

*Split Parallel Corpus Into Test and Train*

In [12]:
# Imports
import pandas as pd
from pprint import pprint
import re

# Selecting Train speeches
hindi_unprep = pd.DataFrame(list for list in train_speeches['hi'])
english_unprep = pd.DataFrame(list for list in train_speeches['en'])

# We select speeches with at least 500 tokens
# TOKENS_LIMIT = 500
# remove = []
# for speech in hindi_unprep[:5].itertuples():
#   if len(speech[1].split(" "))  < TOKENS_LIMIT:
#     remove.append(speech[0])
#     print(speech[0], " removed!")

# View each row as a speech
print(hindi_unprep[:5])
print(english_unprep[:5])

                                                   0
0  नागरिकों से स्‍वच्‍छाग्रही बनने और स्‍वच्‍छ भा...
1  श्री सोमनाथ न्‍यास के न्‍यासियों की 116वीं बैठ...
2  प्रधानमंत्री श्री नरेन्द्र मोदी की अध्यक्षता म...
3  प्रधानमंत्री श्री नरेन्द्र मोदी ने आज ही के दि...
4  · खूंटी की जिला अदालत में छत पर लगने वाले सौर ...
                                                   0
0  PM Calls upon citizens to become Swachhagrahis...
1  The first 10 months of Prime Minister Narendra...
2  BRICS in Africa: Collaboration for Inclusive G...
3  The 116th meeting of the trustees of Shri Somn...
4  Deendayal Upadhyaya Gram Jyoti Yojana\n The Un...


## Training Models

In [13]:
# Imports
from contextualized_topic_models.models.ctm import ZeroShotTM
import contextualized_topic_models.utils.data_preparation
from contextualized_topic_models.utils.data_preparation import TopicModelDataPreparation
from contextualized_topic_models.utils.preprocessing import WhiteSpacePreprocessing
import nltk
import pickle

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
import string
from nltk.corpus import stopwords as stop_words
from gensim.utils import deaccent
import warnings
from inltk.inltk import remove_foreign_languages


class WhiteSpacePreprocessingStopwords(WhiteSpacePreprocessingStopwords):
    """
    Overriden author's original code to keep accents during preprocessing.
    Provides a very simple preprocessing script that filters infrequent tokens from text
    """
    def preprocess(self):
        """
        Note that if after filtering some documents do not contain words we remove them. That is why we return also the
        list of unpreprocessed documents.
        :return: preprocessed documents, unpreprocessed documents and the vocabulary list
        """
        from indicnlp.tokenize import indic_tokenize 

        preprocessed_docs_tmp = self.documents

        # REMOVE DEACCENT
        preprocessed_docs_tmp = [doc.lower() for doc in preprocessed_docs_tmp]
        preprocessed_docs_tmp = [doc.translate(
            str.maketrans(string.punctuation, ' ' * len(string.punctuation))) for doc in preprocessed_docs_tmp]

        if self.remove_numbers:
            preprocessed_docs_tmp = [doc.translate(str.maketrans("0123456789", ' ' * len("0123456789")))
                                     for doc in preprocessed_docs_tmp]

        # REMOVE PUNCTUATION CHARACTERS NOT COVERED BY string.punctuation                             
        preprocessed_docs_tmp = [doc.translate(
            str.maketrans("_–‘’\'':.,*।\n", ' ' * len("_–‘’\'':.,*।\n"))) for doc in preprocessed_docs_tmp]
                                               
        # REPLACED COUNTVECTORIZER WITH INDIC TOKENIZE'S TRIVIAL TOKENIZATION 
        preprocessed_docs_tmp = [' '.join([w for w in indic_tokenize.trivial_tokenize(doc) if len(w) > 0 and w not in self.stopwords]) for doc in preprocessed_docs_tmp]

        preprocessed_docs, unpreprocessed_docs, retained_indices = [], [], []
        for i, doc in enumerate(preprocessed_docs_tmp):
            if len(doc) > 0 and len(doc) >= self.min_words:
                preprocessed_docs.append(doc)
                unpreprocessed_docs.append(self.documents[i])
                retained_indices.append(i)

        vocabulary = list(set([item for doc in preprocessed_docs for item in doc.split()]))

        return preprocessed_docs, unpreprocessed_docs, vocabulary, retained_indices

###**Hindi-Based ZeroshotTM**

*Preprocessing*

Why do we use the preprocessed text here? We need text without punctuation to build the bag of word. Also, we might want only to have the most frequent words inside the BoW. Too many words might not help.

In [15]:
### HINDI ###
LANG_SELECTED = 'hi'

# We select 500 tokens per speech
NUM_TOKENS = 500

# Import Hindi Stopwords
import stopwordsiso as stopwords

# Run preprocessing script
documents = [line[:NUM_TOKENS].strip() for line in train_speeches[LANG_SELECTED]]
sp = WhiteSpacePreprocessingStopwords(documents, stopwords_list = stopwords.stopwords(LANG_SELECTED))
preprocessed_documents, unpreprocessed_corpus, vocab, __= sp.preprocess()

# Ensure same length for preprocessed and unpreprocessed
print(len(preprocessed_documents), len(unpreprocessed_corpus))

4003 4003


In [16]:
from pprint import pprint
pprint(preprocessed_documents[:5])

['नागरिकों स्\u200dवच्\u200dछाग्रही बनने स्\u200dवच्\u200dछ भारत बनाने आह्वान '
 'प्रधानमंत्री श्री नरेन्\u200dद्र मोदी चंपारण महात्\u200dमा गांधी '
 'सत्\u200dयाग्रह प्रयोग साल पूरे अवसर राष्\u200dट्रीय राजधानी कल '
 'स्\u200dवच्\u200dछाग्रह बापू करयां\u200dजलि अभियान प्रदर्शनी उद्घाटन करेंगे '
 'दौरान भारतीय राष्\u200dट्रीय अभिलेखागार आयोजित ऑनलाइन इंटरएक्टिव क्विज '
 'शुभारंभ करेंगे प्रधानमंत्री ट्वीट्स श्रृंखला बारे बताते चंपारण '
 'सत्\u200dयाग्रह ऐ',
 'श्री सोमनाथ न्\u200dयास न्\u200dयासियों वीं बैठक आज सोमनाथ '
 'सम्\u200dपन्\u200dन बैठक न्\u200dयासियों प्रधानमंत्री श्री नरेंद्र मोदी श्री '
 'लालकृष्\u200dण आडवाणी श्री अमित शाह श्री केशू भाई पटेल श्री पी लाहडी श्री जे '
 'डी परमार श्री हर्ष नेवतिया भाग लिया बैठक श्री नरेन्\u200dद्र मोदी सुझाव श्री '
 'सोमनाथ मंदिर सम्\u200dपूर्ण परिसर जल हरियाली अन्\u200dय सुविधाओं '
 'उन्\u200dनयन जाए उन्\u200dहोंने सिफारिश वेरावल प्रभास पाटन नकदी रहित बनाने '
 'न्\u200dयास सक्र',
 'प्रधानमंत्री श्री नरेन्द्र मोदी अध्यक्षता मंत्रीमंडल बैठक दीनद

We don't discard the non-preprocessed hindi texts, because we are going to use them as input for obtaining the **contextualized** document representations. We will now pass our files with preprocess and unpreprocessed data to our TopicModelDataPreparation object. This object takes care of creating the bag of words and obtains the contextualized BERT representations of documents. This operation allows us to create our training dataset.

Note: Here we use the contextualized model "ai4bharat/indic-bert", because we need a multilingual model for indic languages for performing cross-lingual predictions later.



*Training ZeroshotTM*

In [17]:
# Override utility functions for TopicModelDataPreparation to support SentenceTransformers built from scratch
from sentence_transformers import SentenceTransformer, models
import numpy as np

def bert_embeddings_from_file(text_file, sbert_model_to_load, custom=False, tokenizer_args=None, model_args=None, batch_size=200):
    """
    Creates SBERT Embeddings from an input file
    """
    from sentence_transformers import SentenceTransformer, models

    # For indicBERT
    if sbert_model_to_load == 'ai4bharat/indic-bert':
      custom = True
      tokenizer_args = {"keep_accents": True}

    # If build custom sentence transformer model
    if custom is True:
      if tokenizer_args and model_args:   # both set of arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, 
                                                  tokenizer_args=tokenizer_args, 
                                                  model_args = model_args)
      
      elif tokenizer_args:   # tokenizer arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, tokenizer_args = tokenizer_args)

      elif model_args:   # model arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, model_args = model_args)
      
      else:
        word_embedding_model = models.Transformer(sbert_model_to_load)

      # Pass modules to build sentence transformer model
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
    
    # Else retrieve model from hugging face
    else:
        model = SentenceTransformer(sbert_model_to_load)

    with open(text_file, encoding="utf-8") as filino:
        train_text = list(map(lambda x: x, filino.readlines()))

    return np.array(model.encode(train_text, show_progress_bar=True, batch_size=batch_size))


def bert_embeddings_from_list(texts, sbert_model_to_load, custom=False, tokenizer_args=None, model_args=None, batch_size=200):
    """
    Creates SBERT Embeddings from a list
    """
    from sentence_transformers import SentenceTransformer, models

    # For indicBERT
    if sbert_model_to_load == 'ai4bharat/indic-bert':
      custom = True
      tokenizer_args = {"keep_accents": True}

    # If build custom sentence transformer model
    if custom is True:
      if tokenizer_args and model_args:   # both set of arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, 
                                                  tokenizer_args=tokenizer_args, 
                                                  model_args = model_args)
      
      elif tokenizer_args:   # tokenizer arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, tokenizer_args = tokenizer_args)

      elif model_args:   # model arguments given in dict to pass into hugging face model
        word_embedding_model = models.Transformer(sbert_model_to_load, model_args = model_args)
      
      else:
        word_embedding_model = models.Transformer(sbert_model_to_load)

      # Pass modules to build sentence transformer model
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   

    # Else retrieve model from hugging face
    else:
        model = SentenceTransformer(sbert_model_to_load)

    return np.array(model.encode(texts, show_progress_bar=True, batch_size=batch_size))

contextualized_topic_models.utils.data_preparation.bert_embeddings_from_file = bert_embeddings_from_file
contextualized_topic_models.utils.data_preparation.bert_embeddings_from_list = bert_embeddings_from_list


In [18]:
# Load Indic Multilingual embeddings 
tp = TopicModelDataPreparation('ai4bharat/indic-bert')

# Building training dataset
training_dataset = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Downloading:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/129M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.dense.weight', 'predictions.decoder.weight', 'predictions.dense.bias', 'sop_classifier.classifier.bias', 'predictions.bias', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]



In [19]:
# Train over 100 epochs

### HINDI : 25 TOPICS ###
z_ctm_25_HI = ZeroShotTM(bow_size=len(tp.vocab), n_components = 25, contextual_size=768, num_epochs=100)

z_ctm_25_HI.fit(training_dataset, n_samples = 30) # run the model
z_ctm_25_HI.save("./") # save the model

### HINDI : 50 TOPICS ###
z_ctm_50_HI = ZeroShotTM(bow_size=len(tp.vocab), n_components = 50, contextual_size=768, num_epochs=100)

z_ctm_50_HI.fit(training_dataset, n_samples = 30) # run the model
z_ctm_50_HI.save("./") # save the model

Epoch: [100/100]	 Seen Samples: [400300/400300]	Train Loss: 296.6654189045716	Time: 0:00:00.837908: : 100it [01:22,  1.21it/s]
Sampling: [30/30]: : 30it [00:22,  1.32it/s]
Epoch: [100/100]	 Seen Samples: [400300/400300]	Train Loss: 303.732471642362	Time: 0:00:00.860216: : 100it [01:24,  1.18it/s]
Sampling: [30/30]: : 30it [00:23,  1.28it/s]


In [20]:
# See topic predictions per speech doc
z_ctm_25_HI.get_topic_lists(5)[:4]

[['ठक', 'सम', 'रगत', 'सच', 'तर'],
 ['मह', 'रपत', 'सदस', 'गत', 'आपक'],
 ['उपसम', 'एफडब', 'एएनआर', 'गइ', 'आधर'],
 ['पन', 'एमओय', 'सहय', 'हस', 'षत']]

In [21]:
# See topic predictions per speech doc
z_ctm_50_HI.get_topic_lists(5)[:4]

[['लय', 'शन', 'हस', 'ऑफ', 'एमओय'],
 ['उन', 'अवसर', 'अभ', 'कल', 'जय'],
 ['यद', 'अगर', 'बत', 'इतन', 'ऐस'],
 ['इसर', 'उपग', 'पण', 'सफल', 'ईएमय']]

In [22]:
# Get NPMI Coherence
from contextualized_topic_models.evaluation.measures import CoherenceNPMI
texts = [doc.split() for doc in preprocessed_documents] # load text for NPMI

### 25 TOPICS ###
npmi_HI = CoherenceNPMI(texts=texts, topics=z_ctm_25_HI.get_topic_lists(25))
print(npmi_HI.score())

### 50 TOPICS ###
npmi_50_HI = CoherenceNPMI(texts=texts, topics=z_ctm_50_HI.get_topic_lists(50))
print(npmi_50_HI.score())

# Store NPMI scores
zeroshotNPMI_HI = [npmi_HI.score(), npmi_50_HI.score()]

-0.3133134617646065
-0.31060041527417853


###**English-Based ZeroshotTM**

*Preprocessing*

Why do we use the preprocessed text here? We need text without punctuation to build the bag of word. Also, we might want only to have the most frequent words inside the BoW. Too many words might not help.

In [23]:
### ENGLISH ###
LANG_SELECTED = 'en'

# We select 500 tokens per speech
NUM_TOKENS = 500

# Download English Stopwords
nltk.download('stopwords')

# Run preprocessing script
documents = [line[:NUM_TOKENS].strip() for line in train_speeches[LANG_SELECTED]]

import nltk
nltk.download('words')
# preprocessed_documents
words = set(nltk.corpus.words.words())

documents = ["".join([word for word in doc if word.lower() in words or not word.isalpha()]) for doc in documents]

sp = WhiteSpacePreprocessing(documents, stopwords_language='english')
preprocessed_documents, unpreprocessed_corpus, vocab = sp.preprocess()

# Ensure same length for preprocessed and unpreprocessed
print(len(preprocessed_documents), len(unpreprocessed_corpus))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


  return concat([self.open(f).read() for f in fileids])
  return concat([self.open(f).read() for f in fileids])
  return concat([self.open(f).read() for f in fileids])


4219 4219




We don't discard the non-preprocessed english texts, because we are going to use them as input for obtaining the **contextualized** document representations. We will now pass our files with preprocess and unpreprocessed data to our TopicModelDataPreparation object. This object takes care of creating the bag of words and obtains the contextualized BERT representations of documents. This operation allows us to create our training dataset.

Note: Here we use the contextualized model "ai4bharat/indic-bert", because we need a multilingual model for indic languages for performing cross-lingual predictions later.



In [24]:
# Building training dataset
tp = TopicModelDataPreparation("ai4bharat/indic-bert")
en_training = tp.fit(text_for_contextual=unpreprocessed_corpus, text_for_bow=preprocessed_documents)

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.dense.weight', 'predictions.decoder.weight', 'predictions.dense.bias', 'sop_classifier.classifier.bias', 'predictions.bias', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/22 [00:00<?, ?it/s]



*Training ZeroshotTM*

In [25]:
# Train over 100 epochs

### ENGLISH : 25 TOPICS ###
z_ctm_25_EN = ZeroShotTM(bow_size=len(tp.vocab), n_components = 25, contextual_size=768, num_epochs=100)
z_ctm_25_EN.fit(en_training, n_samples=30) # run the model
z_ctm_25_EN.save("./") # save the model

### ENGLISH : 50 TOPICS ###
z_ctm_50_EN = ZeroShotTM(bow_size=len(tp.vocab), n_components = 50, contextual_size=768, num_epochs=100)
z_ctm_50_EN.fit(en_training, n_samples=30) # run the model
z_ctm_50_EN.save("./") # save the model

Epoch: [100/100]	 Seen Samples: [421900/421900]	Train Loss: 236.97698724334853	Time: 0:00:00.867364: : 100it [01:28,  1.13it/s]
Sampling: [30/30]: : 30it [00:24,  1.25it/s]
Epoch: [100/100]	 Seen Samples: [421900/421900]	Train Loss: 243.87289642095283	Time: 0:00:00.869962: : 100it [01:28,  1.13it/s]
Sampling: [30/30]: : 30it [00:24,  1.20it/s]


In [26]:
# See topic predictions per speech doc
z_ctm_25_EN.get_topic_lists(5)[:4]

[['nuclear', 'ble', 'cancer', '20th', 'certificates'],
 ['said', 'today', 'delhi', 'new', 'addressed'],
 ['ble', 'cancer', '20th', 'bimstec', 'undertaken'],
 ['day', 'congratulated', 'proud', 'scientists', 'winning']]

In [27]:
# See topic predictions per speech doc
z_ctm_50_EN.get_topic_lists(5)[:4]

[['lives', 'families', 'loss', 'bus', 'thoughts'],
 ['cabinet', 'chaired', 'union', 'approval', 'approved'],
 ['minister', 'assured', 'situation', 'arising', 'stock'],
 ['visit', 'india', 'president', 'bilateral', 'mr']]

In [28]:
# Get NPMI Coherence
from contextualized_topic_models.evaluation.measures import CoherenceNPMI
texts = [doc.split() for doc in preprocessed_documents] # load text for NPMI

### 25 TOPICS ###-
npmi_EN = CoherenceNPMI(texts=texts, topics=z_ctm_25_EN.get_topic_lists(25))
print(npmi_EN.score())

### 50 TOPICS ###
npmi_50_EN = CoherenceNPMI(texts=texts, topics=z_ctm_50_EN.get_topic_lists(50))
print(npmi_50_EN.score())

# Store NPMI scores
zeroshotNPMI_EN = [npmi_EN.score(), npmi_50_EN.score()]

0.07190399877114767
0.09796218295846393


# Coherence Results

In [29]:
# SHOW RESULTS
NPMI = {"ZeroShotTM for Hindi" : zeroshotNPMI_HI,
        "ZeroShotTM for English" : zeroshotNPMI_EN}

npmi = pd.DataFrame.from_dict(NPMI, orient='index')
print("NPMI Coherences")
npmi.set_axis(["t(25)", "t(50)"], axis = 1)

NPMI Coherences


Unnamed: 0,t(25),t(50)
ZeroShotTM for Hindi,-0.313313,-0.3106
ZeroShotTM for English,0.071904,0.097962


# Predictions and Evaluation
###**Unseen Multilingual  Corpora Predictions**

*Languages*

* Assamese - as
* Bengali - bn
* English - en
* Gujarati - gu
* Hindi - hi
* Kannada - kn
* Malayalam - ml
* Marathi - mr
* Oriya - or
* Punjabi - pa
* Tamil - ta
* Telugu - te

In [None]:
# Convert test files into test datasets
as_testset = tp.transform(parallel_speeches['as'])
bn_testset = tp.transform(parallel_speeches['bn'])
en_testset = tp.transform(parallel_speeches['en'])
gu_testset = tp.transform(parallel_speeches['gu'])
hi_testset = tp.transform(parallel_speeches['hi'])
kn_testset = tp.transform(parallel_speeches['kn'])
ml_testset = tp.transform(parallel_speeches['ml'])
mr_testset = tp.transform(parallel_speeches['mr'])
or_testset = tp.transform(parallel_speeches['or'])
pa_testset = tp.transform(parallel_speeches['pa'])
ta_testset = tp.transform(parallel_speeches['ta'])
te_testset = tp.transform(parallel_speeches['te'])

###**Topic Predictions**

*Hindi*

In [31]:
### HINDI : 25 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_25_HI.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_25_HI.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_25_HI.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_25_HI.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_25_HI.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_25_HI.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_25_HI.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_25_HI.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_25_HI.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_25_HI.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_25_HI.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_25_HI.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_25_HI = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

Sampling: [100/100]: : 100it [00:40,  2.46it/s]
Sampling: [100/100]: : 100it [00:40,  2.44it/s]
Sampling: [100/100]: : 100it [00:41,  2.42it/s]
Sampling: [100/100]: : 100it [00:41,  2.39it/s]
Sampling: [100/100]: : 100it [00:42,  2.36it/s]
Sampling: [100/100]: : 100it [00:43,  2.30it/s]
Sampling: [100/100]: : 100it [00:43,  2.28it/s]
Sampling: [100/100]: : 100it [00:44,  2.26it/s]
Sampling: [100/100]: : 100it [00:45,  2.18it/s]
Sampling: [100/100]: : 100it [00:45,  2.18it/s]
Sampling: [100/100]: : 100it [00:45,  2.19it/s]
Sampling: [100/100]: : 100it [00:46,  2.16it/s]


In [32]:
### HINDI : 50 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_50_HI.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_50_HI.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_50_HI.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_50_HI.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_50_HI.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_50_HI.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_50_HI.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_50_HI.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_50_HI.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_50_HI.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_50_HI.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_50_HI.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_50_HI = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

Sampling: [100/100]: : 100it [00:47,  2.11it/s]
Sampling: [100/100]: : 100it [00:47,  2.09it/s]
Sampling: [100/100]: : 100it [00:48,  2.07it/s]
Sampling: [100/100]: : 100it [00:49,  2.03it/s]
Sampling: [100/100]: : 100it [00:49,  2.03it/s]
Sampling: [100/100]: : 100it [00:49,  2.01it/s]
Sampling: [100/100]: : 100it [00:50,  1.97it/s]
Sampling: [100/100]: : 100it [00:51,  1.95it/s]
Sampling: [100/100]: : 100it [00:51,  1.92it/s]
Sampling: [100/100]: : 100it [00:53,  1.88it/s]
Sampling: [100/100]: : 100it [00:53,  1.86it/s]
Sampling: [100/100]: : 100it [00:54,  1.83it/s]


*English*

In [33]:
### ENGLISH : 25 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_25_EN.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_25_EN.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_25_EN.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_25_EN.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_25_EN.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_25_EN.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_25_EN.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_25_EN.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_25_EN.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_25_EN.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_25_EN.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_25_EN.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_25_EN = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

Sampling: [100/100]: : 100it [00:54,  1.83it/s]
Sampling: [100/100]: : 100it [00:55,  1.79it/s]
Sampling: [100/100]: : 100it [00:56,  1.77it/s]
Sampling: [100/100]: : 100it [00:57,  1.74it/s]
Sampling: [100/100]: : 100it [00:57,  1.73it/s]
Sampling: [100/100]: : 100it [00:57,  1.73it/s]
Sampling: [100/100]: : 100it [00:58,  1.70it/s]
Sampling: [100/100]: : 100it [00:59,  1.68it/s]
Sampling: [100/100]: : 100it [01:00,  1.66it/s]
Sampling: [100/100]: : 100it [01:00,  1.65it/s]
Sampling: [100/100]: : 100it [01:00,  1.64it/s]
Sampling: [100/100]: : 100it [01:01,  1.62it/s]


In [34]:
### ENGLISH : 50 TOPIC PREDICTIONS ### 
as_topics_predictions = z_ctm_50_EN.get_thetas(as_testset, n_samples=100) # get all the topic predictions
bn_topics_predictions = z_ctm_50_EN.get_thetas(bn_testset, n_samples=100) # get all the topic predictions
en_topics_predictions = z_ctm_50_EN.get_thetas(en_testset, n_samples=100) # get all the topic predictions
gu_topics_predictions = z_ctm_50_EN.get_thetas(gu_testset, n_samples=100) # get all the topic predictions
hi_topics_predictions = z_ctm_50_EN.get_thetas(hi_testset, n_samples=100) # get all the topic predictions
kn_topics_predictions = z_ctm_50_EN.get_thetas(kn_testset, n_samples=100) # get all the topic predictions
ml_topics_predictions = z_ctm_50_EN.get_thetas(ml_testset, n_samples=100) # get all the topic predictions
mr_topics_predictions = z_ctm_50_EN.get_thetas(mr_testset, n_samples=100) # get all the topic predictions
or_topics_predictions = z_ctm_50_EN.get_thetas(or_testset, n_samples=100) # get all the topic predictions
pa_topics_predictions = z_ctm_50_EN.get_thetas(pa_testset, n_samples=100) # get all the topic predictions
ta_topics_predictions = z_ctm_50_EN.get_thetas(ta_testset, n_samples=100) # get all the topic predictions
te_topics_predictions = z_ctm_50_EN.get_thetas(te_testset, n_samples=100) # get all the topic predictions

topics_50_EN = {'as': as_topics_predictions, 'bn': bn_topics_predictions, 
             'en': en_topics_predictions, 'gu': gu_topics_predictions,
             'hi': hi_topics_predictions, 'kn': kn_topics_predictions,
             'ml': ml_topics_predictions, 'mr': mr_topics_predictions,
             'or': or_topics_predictions, 'pa': pa_topics_predictions,
             'ta': ta_topics_predictions, 'te': te_topics_predictions}

Sampling: [100/100]: : 100it [01:03,  1.57it/s]
Sampling: [100/100]: : 100it [01:05,  1.53it/s]
Sampling: [100/100]: : 100it [01:05,  1.54it/s]
Sampling: [100/100]: : 100it [01:04,  1.56it/s]
Sampling: [100/100]: : 100it [01:05,  1.53it/s]
Sampling: [100/100]: : 100it [01:05,  1.52it/s]
Sampling: [100/100]: : 100it [01:06,  1.50it/s]
Sampling: [100/100]: : 100it [01:07,  1.49it/s]
Sampling: [100/100]: : 100it [01:07,  1.47it/s]
Sampling: [100/100]: : 100it [01:08,  1.46it/s]
Sampling: [100/100]: : 100it [01:09,  1.45it/s]
Sampling: [100/100]: : 100it [01:10,  1.43it/s]


### **Quantitative Evaluation**

In [35]:
# Import metrics
from contextualized_topic_models.evaluation.measures import Matches, KLDivergence, CentroidDistance
import warnings
warnings.filterwarnings('ignore')

1. **Matches**

> Matches is the % of times the predicted topic for the non-English test document is the same as for the respective test document in English. The higher the scores, the better.

*Hindi*

In [36]:
# HINDI : Matches for 25 topics
hi_as_matches = Matches(topics_25_HI['hi'], topics_25_HI['as'])
hi_bn_matches = Matches(topics_25_HI['hi'], topics_25_HI['bn'])
hi_en_matches = Matches(topics_25_HI['hi'], topics_25_HI['en'])
hi_gu_matches = Matches(topics_25_HI['hi'], topics_25_HI['gu'])
hi_kn_matches = Matches(topics_25_HI['hi'], topics_25_HI['kn'])
hi_ml_matches = Matches(topics_25_HI['hi'], topics_25_HI['ml'])
hi_mr_matches = Matches(topics_25_HI['hi'], topics_25_HI['mr'])
hi_or_matches = Matches(topics_25_HI['hi'], topics_25_HI['or'])
hi_pa_matches = Matches(topics_25_HI['hi'], topics_25_HI['pa'])
hi_ta_matches = Matches(topics_25_HI['hi'], topics_25_HI['ta'])
hi_te_matches = Matches(topics_25_HI['hi'], topics_25_HI['te'])


matches_25_HI = {'as': hi_as_matches.score(), 'bn': hi_bn_matches.score(), 
             'en': hi_en_matches.score(), 'gu': hi_gu_matches.score(),
             'kn': hi_kn_matches.score(),
             'ml': hi_ml_matches.score(), 'mr': hi_mr_matches.score(),
             'or': hi_or_matches.score(), 'pa': hi_pa_matches.score(),
             'ta': hi_ta_matches.score(), 'te': hi_te_matches.score()}
matches_25_HI

{'as': 0.2525,
 'bn': 0.30875,
 'en': 0.0375,
 'gu': 0.4775,
 'kn': 0.24125,
 'ml': 0.09125,
 'mr': 0.38125,
 'or': 0.31875,
 'pa': 0.47125,
 'ta': 0.09625,
 'te': 0.14375}

In [37]:
# HINDI : Matches for 50 topics
hi_as_matches = Matches(topics_50_HI['hi'], topics_50_HI['as'])
hi_bn_matches = Matches(topics_50_HI['hi'], topics_50_HI['bn'])
hi_en_matches = Matches(topics_50_HI['hi'], topics_50_HI['en'])
hi_gu_matches = Matches(topics_50_HI['hi'], topics_50_HI['gu'])
hi_kn_matches = Matches(topics_50_HI['hi'], topics_50_HI['kn'])
hi_ml_matches = Matches(topics_50_HI['hi'], topics_50_HI['ml'])
hi_mr_matches = Matches(topics_50_HI['hi'], topics_50_HI['mr'])
hi_or_matches = Matches(topics_50_HI['hi'], topics_50_HI['or'])
hi_pa_matches = Matches(topics_50_HI['hi'], topics_50_HI['pa'])
hi_ta_matches = Matches(topics_50_HI['hi'], topics_50_HI['ta'])
hi_te_matches = Matches(topics_50_HI['hi'], topics_50_HI['te'])


matches_50_HI = {'as': hi_as_matches.score(), 'bn': hi_bn_matches.score(), 
             'en': hi_en_matches.score(), 'gu': hi_gu_matches.score(),
             'kn': hi_kn_matches.score(),
             'ml': hi_ml_matches.score(), 'mr': hi_mr_matches.score(),
             'or': hi_or_matches.score(), 'pa': hi_pa_matches.score(),
             'ta': hi_ta_matches.score(), 'te': hi_te_matches.score()}
matches_50_HI

{'as': 0.10625,
 'bn': 0.15,
 'en': 0.04,
 'gu': 0.28875,
 'kn': 0.12875,
 'ml': 0.05625,
 'mr': 0.23,
 'or': 0.23375,
 'pa': 0.34625,
 'ta': 0.06125,
 'te': 0.06625}

*English*

In [38]:
# ENGLISH : Matches for 25 topics
en_as_matches = Matches(topics_25_EN['en'], topics_25_EN['as'])
en_bn_matches = Matches(topics_25_EN['en'], topics_25_EN['bn'])
en_hi_matches = Matches(topics_25_EN['en'], topics_25_EN['hi'])
en_gu_matches = Matches(topics_25_EN['en'], topics_25_EN['gu'])
en_kn_matches = Matches(topics_25_EN['en'], topics_25_EN['kn'])
en_ml_matches = Matches(topics_25_EN['en'], topics_25_EN['ml'])
en_mr_matches = Matches(topics_25_EN['en'], topics_25_EN['mr'])
en_or_matches = Matches(topics_25_EN['en'], topics_25_EN['or'])
en_pa_matches = Matches(topics_25_EN['en'], topics_25_EN['pa'])
en_ta_matches = Matches(topics_25_EN['en'], topics_25_EN['ta'])
en_te_matches = Matches(topics_25_EN['en'], topics_25_EN['te'])


matches_25_EN = {'as': en_as_matches.score(), 'bn': en_bn_matches.score(), 
             'en': en_hi_matches.score(), 'gu': en_gu_matches.score(),
             'kn': en_kn_matches.score(),
             'ml': en_ml_matches.score(), 'mr': en_mr_matches.score(),
             'or': en_or_matches.score(), 'pa': en_pa_matches.score(),
             'ta': en_ta_matches.score(), 'te': en_te_matches.score()}
matches_25_EN

{'as': 0.24,
 'bn': 0.255,
 'en': 0.37875,
 'gu': 0.435,
 'kn': 0.3025,
 'ml': 0.1975,
 'mr': 0.27125,
 'or': 0.275,
 'pa': 0.4375,
 'ta': 0.1575,
 'te': 0.17875}

In [39]:
# ENGLISH : Matches for 50 topics
en_as_matches = Matches(topics_50_EN['en'], topics_50_EN['as'])
en_bn_matches = Matches(topics_50_EN['en'], topics_50_EN['bn'])
en_en_matches = Matches(topics_50_EN['en'], topics_50_EN['hi'])
en_gu_matches = Matches(topics_50_EN['en'], topics_50_EN['gu'])
en_kn_matches = Matches(topics_50_EN['en'], topics_50_EN['kn'])
en_ml_matches = Matches(topics_50_EN['en'], topics_50_EN['ml'])
en_mr_matches = Matches(topics_50_EN['en'], topics_50_EN['mr'])
en_or_matches = Matches(topics_50_EN['en'], topics_50_EN['or'])
en_pa_matches = Matches(topics_50_EN['en'], topics_50_EN['pa'])
en_ta_matches = Matches(topics_50_EN['en'], topics_50_EN['ta'])
en_te_matches = Matches(topics_50_EN['en'], topics_50_EN['te'])


matches_50_EN = {'as': en_as_matches.score(), 'bn': en_bn_matches.score(), 
             'en': en_hi_matches.score(), 'gu': en_gu_matches.score(),
             'kn': en_kn_matches.score(),
             'ml': en_ml_matches.score(), 'mr': en_mr_matches.score(),
             'or': en_or_matches.score(), 'pa': en_pa_matches.score(),
             'ta': en_ta_matches.score(), 'te': en_te_matches.score()}
matches_50_EN

{'as': 0.1475,
 'bn': 0.1625,
 'en': 0.37875,
 'gu': 0.28875,
 'kn': 0.1925,
 'ml': 0.09125,
 'mr': 0.1875,
 'or': 0.19,
 'pa': 0.34,
 'ta': 0.0675,
 'te': 0.115}

2. **Distributional Similarity**
> Compute the KL divergence between the predicted topic distribution on the test document and the same test document in English. Lower scores are better, indicating that the distributions do not differ by much.

*Hindi*

In [40]:
# HINDI : KL Divergence for 25 topics
hi_as_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['as'])
hi_bn_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['bn'])
hi_en_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['en'])
hi_gu_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['gu'])
hi_kn_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['kn'])
hi_ml_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['ml'])
hi_mr_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['mr'])
hi_or_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['or'])
hi_pa_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['pa'])
hi_ta_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['ta'])
hi_te_kl = KLDivergence(topics_25_HI['hi'], topics_25_HI['te'])

kl_divergence_25_HI = {'as': hi_as_kl.score(), 'bn': hi_bn_kl.score(), 
             'en': hi_en_kl.score(), 'gu': hi_gu_kl.score(),
             'kn': hi_kn_kl.score(),
             'ml': hi_ml_kl.score(), 'mr': hi_mr_kl.score(),
             'or': hi_or_kl.score(), 'pa': hi_pa_kl.score(),
             'ta': hi_ta_kl.score(), 'te': hi_te_kl.score()}

kl_divergence_25_HI

{'as': 0.6725276656788808,
 'bn': 0.8627264330569175,
 'en': 7.858056221060294,
 'gu': 0.3541341356827201,
 'kn': 0.8892348424435308,
 'ml': 1.5146623473210428,
 'mr': 0.6220876620603729,
 'or': 0.5582947561514637,
 'pa': 0.36843128152525806,
 'ta': 1.4094752864237194,
 'te': 0.8195502205718874}

In [41]:
# HINDI : KL Divergence for 50 topics
hi_as_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['as'])
hi_bn_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['bn'])
hi_en_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['en'])
hi_gu_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['gu'])
hi_kn_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['kn'])
hi_ml_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['ml'])
hi_mr_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['mr'])
hi_or_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['or'])
hi_pa_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['pa'])
hi_ta_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['ta'])
hi_te_kl = KLDivergence(topics_50_HI['hi'], topics_50_HI['te'])

kl_divergence_50_HI = {'as': hi_as_kl.score(), 'bn': hi_bn_kl.score(), 
             'en': hi_en_kl.score(), 'gu': hi_gu_kl.score(),
             'kn': hi_kn_kl.score(),
             'ml': hi_ml_kl.score(), 'mr': hi_mr_kl.score(),
             'or': hi_or_kl.score(), 'pa': hi_pa_kl.score(),
             'ta': hi_ta_kl.score(), 'te': hi_te_kl.score()}

kl_divergence_50_HI

{'as': 0.6648448247811163,
 'bn': 0.6682072268603605,
 'en': 7.827630413179365,
 'gu': 0.33799820575742406,
 'kn': 0.7568196267095151,
 'ml': 1.0196003953964474,
 'mr': 0.5275078150363133,
 'or': 0.45549437645320817,
 'pa': 0.3046564891573344,
 'ta': 1.0736741774066885,
 'te': 0.9940683614519996}

*English*

In [42]:
# ENGLISH : KL Divergence for 25 topics
en_as_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['as'])
en_bn_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['bn'])
en_hi_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['hi'])
en_gu_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['gu'])
en_kn_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['kn'])
en_ml_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['ml'])
en_mr_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['mr'])
en_or_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['or'])
en_pa_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['pa'])
en_ta_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['ta'])
en_te_kl = KLDivergence(topics_25_EN['en'], topics_25_EN['te'])

kl_divergence_25_EN = {'as': en_as_kl.score(), 'bn': en_bn_kl.score(), 
             'hi': en_hi_kl.score(), 'gu': en_gu_kl.score(),
             'kn': en_kn_kl.score(),
             'ml': en_ml_kl.score(), 'mr': en_mr_kl.score(),
             'or': en_or_kl.score(), 'pa': en_pa_kl.score(),
             'ta': en_ta_kl.score(), 'te': en_te_kl.score()}

kl_divergence_25_EN

{'as': 0.4417607160995153,
 'bn': 0.36924523223880285,
 'gu': 0.24243747921365444,
 'hi': 0.2729636354343089,
 'kn': 0.35162599142604684,
 'ml': 0.5285727255602641,
 'mr': 0.3594530848440848,
 'or': 0.3875130856447477,
 'pa': 0.20496066719003037,
 'ta': 0.5392350782809777,
 'te': 0.5993194518956494}

In [43]:
# ENGLISH : KL Divergence for 50 topics
en_as_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['as'])
en_bn_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['bn'])
en_hi_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['hi'])
en_gu_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['gu'])
en_kn_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['kn'])
en_ml_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['ml'])
en_mr_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['mr'])
en_or_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['or'])
en_pa_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['pa'])
en_ta_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['ta'])
en_te_kl = KLDivergence(topics_50_EN['en'], topics_50_EN['te'])

kl_divergence_50_EN = {'as': en_as_kl.score(), 'bn': en_bn_kl.score(), 
             'hi': en_hi_kl.score(), 'gu': en_gu_kl.score(),
             'kn': en_kn_kl.score(),
             'ml': en_ml_kl.score(), 'mr': en_mr_kl.score(),
             'or': en_or_kl.score(), 'pa': en_pa_kl.score(),
             'ta': en_ta_kl.score(), 'te': en_te_kl.score()}

kl_divergence_50_EN

{'as': 0.54250369426347,
 'bn': 0.5060315250027786,
 'gu': 0.3452978266719059,
 'hi': 0.33564429664393997,
 'kn': 0.4874127049552386,
 'ml': 0.7135579099614339,
 'mr': 0.49344311940679936,
 'or': 0.5106467010934378,
 'pa': 0.2711361122159386,
 'ta': 0.7307446922882802,
 'te': 0.6829491978402412}

3. **Centroid Embeddings**
> To also account for similar but not exactly equal topic predictions, we compute the centroid embeddings of the 5 words describing the predicted topic for both English and non-English documents. Then we compute the cosine similarity between those two centroids (CD).

In [42]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import KeyedVectors
import gensim.downloader as api
from scipy.spatial.distance import cosine
import abc
import numpy as np

class CD(CentroidDistance):
    """Override author's function to upgrade compatibility with Gensim 4.0.0.
    See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4."""

    def get_centroid(self, word_list):
        vector_list = []
        for word in word_list:
            if word in self.wv:   # changed from self.wv.vocab to self.wv as in Gensim 4.0.0
                vector_list.append(self.wv.get_vector(word))
        vec = sum(vector_list)
        return vec / np.linalg.norm(vec)

*Hindi*

In [44]:
# HINDI : Centroid Embeddings for 25 topics
cd_25_HI = {}

for key in topics_25_HI.keys():
  if key == 'hi':
    continue
  topic = topics_25_HI[key]
  cd = CD(doc_distribution_original_language = topics_25_HI['hi'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_25_HI.get_topic_lists(25),
          topk = 5)
  
  cd_25_HI[key] = cd.score()

cd_25_HI

{'as': nan,
 'bn': nan,
 'en': nan,
 'gu': nan,
 'kn': nan,
 'ml': nan,
 'mr': nan,
 'or': nan,
 'pa': nan,
 'ta': nan,
 'te': nan}

In [45]:
# HINDI : Centroid Embeddings for 50 topics
cd_50_HI = {}

for key in topics_50_HI.keys():
  if key == 'hi':
    continue
  topic = topics_50_HI[key]
  cd = CD(doc_distribution_original_language = topics_50_HI['hi'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_50_HI.get_topic_lists(50),
          topk = 5)
  
  cd_50_HI[key] = cd.score()
  cd = None

cd_50_HI

KeyboardInterrupt: ignored

*English*

In [None]:
# ENGLISH : Centroid Embeddings for 25 topics
cd_25_EN = {}

for key in topics_25_EN.keys():
  if key == 'en':
    continue
  topic = topics_25_EN[key]
  cd = CD(doc_distribution_original_language = topics_25_EN['en'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_25_EN.get_topic_lists(25),
          topk = 5)
  
  cd_25_EN[key] = cd.score()

cd_25_EN

In [None]:
# ENGLISH : Centroid Embeddings for 50 topics
cd_50_EN = {}

for key in topics_50_EN.keys():
  if key == 'hi':
    continue
  topic = topics_50_EN[key]
  cd = CD(doc_distribution_original_language = topics_50_EN['en'], 
          doc_distribution_unseen_language = topic, 
          topics = z_ctm_50_EN.get_topic_lists(50),
          topk = 5)
  
  cd_50_EN[key] = cd.score()
  cd = None

cd_50_EN

In [None]:
# Store Metrics
metrics = {
          "Hindi" : [{
                    "Mat25": matches_25_HI,
                    "KL25": kl_divergence_25_HI, 
                    "CD25": cd_25_HI, 
                    "Mat50": matches_50_HI, 
                    "KL50": kl_divergence_50_HI,
                    "CD50": cd_50_HI
                    }],

          "English": [{
                    "Mat25": matches_25_EN,
                    "KL25": kl_divergence_25_EN, 
                    "CD25": cd_25_EN, 
                    "Mat50": matches_50_EN, 
                    "KL50": kl_divergence_50_EN,
                    "CD50": cd_50_EN
                    }]
          }
with open("metrics_samescript.txt", 'wb') as F:
  pickle.dump(metrics, F)

metrics

# Evaluation Results

In [None]:
# Show results
metrics = {"Hindi" : 
           
           {"Mat25": matches_25_HI,
           "KL25": kl_divergence_25_HI, 
           "CD25": cd_25_HI, 
           "Mat50": matches_50_HI, 
           "KL50": kl_divergence_50_HI,
           "CD50": cd_50_HI},
           
           "English" : 
           {"Mat25": matches_25_EN,
           "KL25": kl_divergence_25_EN, 
           "CD25": cd_25_EN, 
           "Mat50": matches_50_EN, 
           "KL50": kl_divergence_50_EN,
           "CD50": cd_50_EN}
           }

metrics = pd.DataFrame.from_dict(metrics, orient='columns') 
print("Match, KL, and Centroid Similarity for 25 and 50 topics on various languages on PMIndia Corpus")
metrics