In [2]:
import collections
import itertools
import json
import functools

import numpy as np
import pandas as pd
import tqdm

In [3]:
data_directory = "data"

In [4]:
data = pd.read_parquet(f"{data_directory}/data.parquet").to_dict(orient="records")

In [5]:
bow = np.array([d["bow"] for d in data])

In [6]:
with open(f"{data_directory}/vocabulary.json") as f:
    vocabulary = json.load(f)

In [7]:
N_LANGUAGES = len(set(d["language"] for d in data))
N_LANGUAGES

9

In [8]:
with open(f"{data_directory}/output.jsonl") as f:
    output = [json.loads(line) for line in f]

# NPMI scoring

## Create word-frequency matrix for sentences

In [21]:
# count unique words in clean_words
words = set(itertools.chain.from_iterable(d["clean_words"] for d in data))
word2idx = {word: idx for idx, word in enumerate(words)}

# create sparse word-doc matrix
import scipy.sparse

doc_word = scipy.sparse.csr_matrix((len(data), len(words)), dtype=np.uint8)

for i, d in enumerate(tqdm.tqdm(data)):
    for word in d["clean_words"]:
        doc_word[i, word2idx[word]] += 1

  self._set_intXint(row, col, x.flat[0])
100%|██████████| 90000/90000 [22:55<00:00, 65.45it/s] 


In [22]:
# save word2idx and doc_word to disk
import pickle

with open(f"{data_directory}/word2idx.pkl", "wb") as f:
    pickle.dump(word2idx, f)

scipy.sparse.save_npz(f"{data_directory}/doc_word.npz", doc_word)

In [9]:
# load word2idx and doc_word from disk
import pickle

with open(f"{data_directory}/word2idx.pkl", "rb") as f:
    word2idx = pickle.load(f)

import scipy.sparse

doc_word = scipy.sparse.load_npz(f"{data_directory}/doc_word.npz")

## Word co-occurrence calculations

In [10]:
@functools.lru_cache(maxsize=None)
def p1(word):
    if word in vocabulary:
        value = p1_vocabulary(word)
    else:
        value = p1_all_words(word)
    return value

@functools.lru_cache(maxsize=None)
def p2(word1, word2):
    if word1 not in vocabulary or word2 not in vocabulary:
        value = p2_all_words(word1, word2)
    else:
        value = p2_vocabulary(word1, word2)
    return value


@functools.lru_cache(maxsize=None)
def p1_vocabulary(word):
    value = np.count_nonzero(bow[:, vocabulary[word]["idx"]]) / bow.shape[0]
    return value

@functools.lru_cache(maxsize=None)
def p2_vocabulary(word1, word2):
    value = np.count_nonzero(np.logical_and(bow[:, vocabulary[word1]["idx"]], bow[:, vocabulary[word2]["idx"]])) / bow.shape[0]
    return value

@functools.lru_cache(maxsize=None)
def p1_all_words(word):
    if word not in word2idx:
        return 0
    value = doc_word[:, word2idx[word]].count_nonzero() / doc_word.shape[0]
    return value

@functools.lru_cache(maxsize=None)
def p2_all_words(word1, word2):
    if word1 not in word2idx or word2 not in word2idx:
        return 0
    value = doc_word[:, word2idx[word1]].multiply(doc_word[:, word2idx[word2]]).count_nonzero()  / doc_word.shape[0]
    return value

In [11]:
def npmi_word(word1, word2):
    joint = p2(word1, word2)
    p1w1 = p1(word1)
    p1w2 = p1(word2)
    if joint == 0:
        return 0
    value = np.log(p2(word1, word2) / (p1w1 * p1w2)) / -np.log(p2(word1, word2))
    return value

def npmi_topic(topic):
    return np.mean([
        npmi_word(word1, word2)
        for word1, word2 in itertools.combinations(topic, 2)
    ])


def npmi_topics(topics):
    return np.mean([
        npmi_topic(topic)
        for topic in topics
    ])

# Word embedding score

In [12]:
import sonar.inference_pipelines.text

t2vec_model = sonar.inference_pipelines.text.TextToEmbeddingModelPipeline(
    encoder="text_sonar_basic_encoder",
    tokenizer="text_sonar_basic_encoder"
)

  from .autonotebook import tqdm as notebook_tqdm


## Get topic word vectors

In [12]:
all_topic_words = set(
    word
    for output_i in output
    for topic in output_i["topics"]
    for word in topic
)

In [11]:
topic_words_langauges = collections.defaultdict(set)

for data_i in data:
    for word in data_i["clean_words"]:
        if word in all_topic_words:
            topic_words_langauges[word].add(data_i["language"])


In [12]:
language_topic_words = {
    language: set(word for word, languages in topic_words_langauges.items() if language in languages)
    for language in set(d["language"] for d in data)
}

# covert sets to lists
language_topic_words = {
    language: list(words)
    for language, words in language_topic_words.items()
}

In [16]:
def get_embeddings(words, language, model, batch_size=200):
    embeddings = []
    for batch in tqdm.tqdm(
        range(0, len(words), batch_size),
        desc="Embeddings",
        total=len(words) // batch_size + 1
    ):
        batch_embeddings = model.predict(words[batch:batch + batch_size], source_lang=f"{language}_Latn")
        embeddings.extend(batch_embeddings)
    return embeddings

language_topic_word_embeddings = {}

for language, words in tqdm.tqdm(language_topic_words.items(), desc="Languages"):
    if language == "ven":
        embedding_language = "sna"
    else:
        embedding_language = language
    embeddings = get_embeddings(words, embedding_language, t2vec_model)

    for word, embedding in zip(words, embeddings):
        language_topic_word_embeddings[(language, word)] = embedding
    

Embeddings: 100%|██████████| 5/5 [00:48<00:00,  9.63s/it]
Embeddings: 100%|██████████| 4/4 [00:35<00:00,  8.95s/it]
Embeddings: 100%|██████████| 4/4 [00:31<00:00,  7.98s/it]
Embeddings: 100%|██████████| 3/3 [00:22<00:00,  7.40s/it]
Embeddings: 100%|██████████| 4/4 [00:35<00:00,  8.94s/it]
Embeddings: 100%|██████████| 5/5 [00:40<00:00,  8.19s/it]
Embeddings: 100%|██████████| 3/3 [00:26<00:00,  8.82s/it]
Embeddings: 100%|██████████| 4/4 [00:33<00:00,  8.44s/it]
Embeddings: 100%|██████████| 2/2 [00:19<00:00,  9.76s/it]
Languages: 100%|██████████| 9/9 [04:54<00:00, 32.72s/it]


In [17]:
# convert tensors to lists
language_topic_word_embeddings = {
    key: value.tolist()
    for key, value in language_topic_word_embeddings.items()
}

In [18]:
# make dataframe
topic_words_df = pd.DataFrame([
    {
        "language": language,
        "word": word,
        "embedding": embedding
    }
    for (language, word), embedding in language_topic_word_embeddings.items()
])

In [19]:
topic_words_df.head()

Unnamed: 0,language,word,embedding
0,zul,emva,"[0.0037508714012801647, 0.002158257644623518, ..."
1,zul,kubo,"[0.006009371485561132, 0.002523238305002451, -..."
2,zul,ibe,"[0.007078166585415602, 0.007814861834049225, -..."
3,zul,ukwelashwa,"[0.0038973756600171328, 0.00260126288048923, -..."
4,zul,ngaphezulu,"[0.002381658647209406, -0.0013715632958337665,..."


In [20]:
# save to parquet
topic_words_df.to_parquet(f"{data_directory}/topic_word_vectors.parquet")

## Calculate score

In [13]:
# load df
topic_words_df = pd.read_parquet(f"{data_directory}/topic_word_vectors.parquet")

In [14]:
# get word vectors by grouping over words and taking the mean of the embeddings
topic_word_vectors_lang_ind = topic_words_df.groupby("word")["embedding"].apply(np.mean)

In [15]:
embedding_length = len(topic_word_vectors_lang_ind.iloc[0])

In [16]:
new_word_vectors = {}

def word_vector(word):
    if word in topic_word_vectors_lang_ind:
        embedding = topic_word_vectors_lang_ind[word]
        return embedding
        
    if word.isnumeric():
        return [0] * embedding_length
            
    if word in new_word_vectors:
        return new_word_vectors[word]
    

    new_word_vectors[word] = t2vec_model.predict([word], source_lang="zul_Latn")[0]
    return new_word_vectors[word]

def cosine_similarity(vec1, vec2):
    if np.linalg.norm(vec1) == 0 or np.linalg.norm(vec2) == 0:
        return 0
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def topic_similarity(topic):
    similarities = [
        cosine_similarity(word_vector(word1), word_vector(word2))
        for word1, word2 in itertools.combinations(topic, 2)
    ]
    score = np.mean(similarities)
    return score


def topics_similarity(topics):
    return np.mean([
        topic_similarity(topic)
        for topic in topics
    ])


# Multilinguality scoring

In [17]:
def topic_multilinguality(topic):
    languages = set()
    for word in topic:
        if word not in words_languages:
            continue
        word_langs = words_languages[word]
        languages.update(word_langs)

    value = (len(languages) - 1) / (N_LANGUAGES - 1) 
    return value

words_languages = {}
for word, language in topic_words_df[["word", "language"]].values:
    if word in words_languages:
        words_languages[word].add(language)
    else:
        words_languages[word] = {language}

def topics_multilinguality(topics):
    return np.mean([
        topic_multilinguality(topic)
        for topic in topics
    ])

# MTC scoring

In [18]:
def topic_mtc(topic):
    sim_score = topic_similarity(topic)
    mul_score = topic_multilinguality(topic)
    harmonic_mean = 2 * sim_score * mul_score / (sim_score + mul_score)
    return harmonic_mean

def topics_mtc(topics):
    return np.mean([
        topic_mtc(topic)
        for topic in topics
    ])

# Apply metrics

In [19]:
with open(f"{data_directory}/output.jsonl") as f:
    output = [json.loads(line) for line in f]

output_df = pd.DataFrame(output)

In [20]:
output_df.head()

Unnamed: 0,model,num_topics,topics,npmi,multilinguality,multilinguality_hard,topic_similarity,mtc_similarity,mtc_npmi,diversity
0,lda,50,"[[emva, nabo, zonke, lawo, izinto, izicelo, lo...",0.064995,0.8875,0.164644,0.824642,0.841352,0.121119,0.177191
1,lda,50,"[[nama, lawa, ngendlela, ngayo, bakhe, ebusuku...",0.06393,0.9125,0.167813,0.825161,0.85713,0.119488,0.176744
2,lda,50,"[[eka, no, c, kuchaza, umsebenzi, ngaso, eziny...",0.062965,0.8925,0.164812,0.819475,0.841835,0.11763,0.181844
3,lda,50,"[[la, bo, le, lakho, ukuqinisekisa, imithetho,...",0.061783,0.8775,0.159666,0.822795,0.834839,0.115438,0.178834
4,lda,50,"[[nama, south, of, africa, pretoria, ube, bakh...",0.063234,0.88,0.162014,0.823971,0.836761,0.11799,0.177611


## NPMI

In [34]:
output_df["npmi"] = output_df["topics"].apply(npmi_topics)

In [None]:
# select rows that have NaN in npmi and calculate npmi
for i, row in output_df[output_df["npmi"].isna()].iterrows():
    output_df.loc[i, "npmi"] = npmi_topics(row["topics"])

In [None]:
# apply to the topics
for output_obj in tqdm.tqdm(output, desc="output"):
    topic_similarity_score = topics_similarity(output_obj["topics"])
    output_obj["topic_similarity"] = topic_similarity_score

output: 100%|██████████| 150/150 [00:34<00:00,  4.32it/s]


## Raw multilinguality

In [36]:
output_df["multilinguality"] = output_df["topics"].apply(topics_multilinguality)

## Similarity score

In [37]:
output_df["topic_similarity"] = output_df["topics"].apply(topics_similarity)

## MTC

In [38]:
# calculate harmonic mean of multilinguality_easy and topic_similarity call it mtc_similarity
output_df["mtc_similarity"] = output_df["topics"].apply(topics_mtc)