# Leipzig Corpora Collection

In [16]:
import collections
import glob
import json

import pandas  as pd
import sklearn.feature_extraction.text
import tqdm

data_dir = "data/"

In [7]:
# Find every "*sentences.txt" file one directory level below data_dir/af_lang/.
sentence_files = glob.glob(data_dir + "af_lang/*/*sentences.txt")

# One record per sentence: {"text": <sentence>, "language": <language code>}.
data = []

for sentence_file in sentence_files:
    # The language code is the part of the parent directory name before the
    # first "-"; it is the same for every line of the file, so compute it once.
    language = sentence_file.split("/")[-2].split("-")[0]
    with open(sentence_file, "r", encoding="utf-8") as f:
        for line in f:
            # Each line is expected to be "<id>\t<sentence>". Drop the line
            # terminator so it does not end up in the text, and split only on
            # the first tab so a tab inside the sentence cannot break unpacking.
            _, text = line.rstrip("\n").split("\t", 1)
            data.append({
                "text": text,
                "language": language
            })

len(data)

90000

In [8]:
# Group the records by language in a single pass over `data`.
# The keys come from the records themselves rather than from `lang_counts`,
# which is not defined anywhere in this file.
# The groups hold the same dict objects as `data`, so fields added through
# `data_by_lang` are also visible through `data`.
data_by_lang = collections.defaultdict(list)
for d in data:
    data_by_lang[d["language"]].append(d)
# Convert to a plain dict so later lookups of unknown languages raise KeyError
# instead of silently creating empty groups.
data_by_lang = dict(data_by_lang)

len(data_by_lang)

9

In [9]:
import sonar.inference_pipelines.text

# The same identifier string is passed for both the encoder and the tokenizer.
sonar_text_encoder_name = "text_sonar_basic_encoder"

# Text-to-embedding pipeline used below to embed the sentences.
t2vec_model = sonar.inference_pipelines.text.TextToEmbeddingModelPipeline(
    tokenizer=sonar_text_encoder_name,
    encoder=sonar_text_encoder_name,
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# get embeddings
# Number of sentences passed to the model per predict() call.
batch_size = 128

# Languages that are embedded using another language's code.
# NOTE(review): presumably "ven" has no language code of its own in the
# encoder, hence the substitution with "sna" - confirm.
source_lang_overrides = {"ven": "sna"}

for lang, lang_data in data_by_lang.items():
    # Skip empty groups and groups that already carry embeddings
    # (only the first record is inspected).
    if not lang_data or "embedding" in lang_data[0]:
        continue

    sentences = [d["text"] for d in lang_data]

    # Keep `lang` untouched so the progress bar names the language actually
    # being processed; only the code sent to the model is substituted.
    source_lang = source_lang_overrides.get(lang, lang)

    embeddings = []
    for i in tqdm.tqdm(range(0, len(sentences), batch_size), desc=f"Embedding {lang}"):
        # `+=` extends the list with one entry per sentence of the batch.
        embeddings += t2vec_model.predict(sentences[i:i+batch_size], source_lang=f"{source_lang}_Latn")

    # Store each embedding on its record as a plain Python list.
    for d, emb in zip(lang_data, embeddings):
        d["embedding"] = emb.tolist()

In [12]:
# Persist all records: one DataFrame row per record, written as Parquet.
parquet_path = f"{data_dir}/data.parquet"
df = pd.DataFrame(data=data)
df.to_parquet(parquet_path)

# Get bag-of-words (BoW) representation

In [10]:
# Reload the stored records as a list of dicts, one dict per row.
parquet_path = f"{data_dir}/data.parquet"
data = pd.read_parquet(parquet_path).to_dict(orient="records")

In [13]:
# Distinct language codes present in the records.
languages = {record["language"] for record in data}

# One empty word-frequency counter per language, filled in further below.
word_freqs_by_lang = {
    language: collections.Counter() for language in languages
}

def get_clean_words(text):
    """Return the lower-cased words of *text* without punctuation or numbers.

    Every character that is neither alphanumeric nor whitespace is removed
    (not replaced by a space, so "stop-now" becomes "stopnow"). The remainder
    is lower-cased and split on whitespace, and tokens made up only of
    numeric characters are dropped.
    """
    kept_chars = filter(lambda ch: ch.isalnum() or ch.isspace(), text)
    tokens = "".join(kept_chars).lower().split()
    return [token for token in tokens if not token.isnumeric()]

# Accumulate the cleaned-word counts of every record into its language's counter.
for record in data:
    word_freqs_by_lang[record["language"]].update(get_clean_words(record["text"]))

# Keep, per language, the 600 most frequent words as {word: count}, ordered by
# descending count; words with equal counts stay in first-encountered order.
common_words_by_lang = {
    language: dict(counts.most_common(600))
    for language, counts in word_freqs_by_lang.items()
}

In [15]:
# Build the shared vocabulary: each word maps to its index (assigned in order
# of first appearance) and to the list of languages whose common-word set
# contains it.
vocabulary = {}
for lang, words in common_words_by_lang.items():
    for word in words:
        # The default entry is built before insertion, so len(vocabulary) is
        # the next free index whenever the word is new.
        entry = vocabulary.setdefault(
            word, {"idx": len(vocabulary), "languages": []}
        )
        entry["languages"].append(lang)


# Write the vocabulary to disk as JSON.
with open(f"{data_dir}/vocabulary.json", "w") as f:
    json.dump(vocabulary, f)

len(vocabulary)

4412

In [33]:
# Give every record a bag-of-words vector: one count per vocabulary word,
# in the vocabulary's iteration order.
vocab_words = list(vocabulary)
for record in tqdm.tqdm(data):
    counts = collections.Counter(get_clean_words(record["text"]))
    # A Counter returns 0 for words that do not occur in this record.
    record["bow"] = [counts[term] for term in vocab_words]

100%|██████████| 90000/90000 [01:03<00:00, 1411.22it/s]


In [35]:
# Attach the cleaned word list to each record. get_clean_words strips
# punctuation and number-only tokens; no stop words are removed here.
for record in data:
    record["clean_words"] = get_clean_words(record["text"])

In [None]:
# Rebuild the frame from `data` before writing: the existing `df` was created
# before the "bow" and "clean_words" fields were added to the records, so
# writing it as-is would not persist them.
df = pd.DataFrame(data)
# Extra keywords are forwarded to the Parquet engine; `row_group_size` is the
# documented pyarrow name for the number of rows per row group.
df.to_parquet(f"{data_dir}/data.parquet", row_group_size=30000)

# TF-IDF vectors

In [None]:

# Reload the records. No chunking keyword is passed here: read_parquet
# forwards extra keywords to the Parquet engine's reader, and `chunk_size`
# is a write-side option only.
data = pd.read_parquet(f"{data_dir}/data.parquet").to_dict(orient="records")

# Re-weight the raw bag-of-words counts with TF-IDF.
transformer = sklearn.feature_extraction.text.TfidfTransformer()
X = [d["bow"] for d in data]
X_tfidf = transformer.fit_transform(X)

# save tfidf vectors in data
# Row i of X_tfidf belongs to data[i]; each row is densified into a flat list.
for i, d in enumerate(data):
    d["tfidf"] = X_tfidf[i].toarray().tolist()[0]

df = pd.DataFrame(data)


# `row_group_size` is the documented pyarrow name for the rows-per-row-group
# setting on write.
df.to_parquet(f"{data_dir}/data.parquet", row_group_size=30000)