In [1]:
import numpy as np ## pip install numpy
import faiss ## conda install -c conda-forge faiss-gpu
import torch ## conda install pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch -c nvidia (Select the right CUDA version for your GPU).
import torch.nn as nn
from sentence_transformers import SentenceTransformer ## pip install -U sentence-transformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the sentence encoder model. Other models can be found here: https://huggingface.co/sentence-transformers.
# The first time you load a sentence encoder, it will be downloaded and saved in the .cache folder for future reuse.
transformer_name = "sentence-transformers/LaBSE"
# Use the GPU when one is available; otherwise fall back to the CPU so the notebook still runs.
device = "cuda" if torch.cuda.is_available() else "cpu"
sentence_encoder = SentenceTransformer(transformer_name).to(device)

In [21]:
# Toy example: one query sentence and a handful of candidate sentences to rank against it.
# (Punctuation unfortunately has a strong influence on the similarity scores.)
sentence = "Today is a beautiful day!"

wordnet_sentences = [
    "Today is a good day.",
    "Yesterday was a good day.",
    # Expected best match: closest in both meaning and punctuation.
    "Today is a good day!",
    "Yesterday was a good day!",
    "Today is a bad day.",
    "Today is a bad day!",
]

In [22]:
# Run the query sentence and the candidate sentences through the sentence encoder.
source_emb = sentence_encoder.encode(sentence)
target_embs = sentence_encoder.encode(wordnet_sentences)

# Collect the embeddings into plain Python lists: a single entry for the query,
# and one entry per candidate (iterating target_embs yields one embedding at a time).
sources_emb_list = [source_emb]
targets_emb_list = list(target_embs)

In [23]:
# FAISS works on float32 matrices, so stack each list of embeddings into one 2-D array
# (one row per sentence).
sources_emb_matrix = np.asarray(sources_emb_list, dtype="float32")
targets_emb_matrix = np.asarray(targets_emb_list, dtype="float32")
(sources_emb_matrix.shape, targets_emb_matrix.shape)

((1, 768), (6, 768))

In [30]:
k = 3  # How many candidates to retrieve for the source sentence.

# L2-normalize both matrices (FAISS does this in place). With unit-length vectors,
# the inner product computed by the index is the cosine similarity.
faiss.normalize_L2(sources_emb_matrix)
faiss.normalize_L2(targets_emb_matrix)

# Build a "Flat" index over the embedding dimension that scores vectors by inner product,
# then load the candidate (target) embeddings into it.
embedding_dim = sources_emb_matrix.shape[1]
index = faiss.index_factory(embedding_dim, "Flat", faiss.METRIC_INNER_PRODUCT)
index.add(targets_emb_matrix)

# Look up the k candidates among wordnet_sentences that score best against the source sentence.
# D = scores of the best k candidates under the metric chosen above (higher inner product = more similar).
# I = positions of those k candidates in the matrix that was added to the index.
D, I = index.search(sources_emb_matrix[0:1], k)
D, I

(array([[0.95209295, 0.88073856, 0.7944416 ]], dtype=float32),
 array([[2, 5, 3]]))

In [31]:
# NOTE: FAISS is designed to run this search for N source sentences at once, so it returns D and I as
# matrices with one row per source sentence.
# Since we searched with a single source sentence, its results are in the first (0) row of D and I.
D[0], I[0]

(array([0.95209295, 0.88073856, 0.7944416 ], dtype=float32), array([2, 5, 3]))

In [34]:
# Map the indexes returned by the search back to the candidate sentences.
# I[0] is already ordered from most to least similar, so the resulting list is too.
best_candidates = [wordnet_sentences[candidate_idx] for candidate_idx in I[0]]
best_candidates

['Today is a good day!', 'Today is a bad day!', 'Yesterday was a good day!']

In [10]:
import jsonlines
import json
import os

train_dir = "../../../data-without-embeddings/opus/books/train/"
val_dir = "../../../data-without-embeddings/opus/books/val/"
test_dir = "../../../data-without-embeddings/opus/books/test/"


def _count_id_length_combinations(data_dir):
    """Count how often each (number of source ids, number of target ids) pair occurs.

    Every regular file in ``data_dir`` is read as a JSON Lines file. For each record,
    the lengths of ``obj["sources"]["ids"]`` and ``obj["targets"]["ids"]`` are taken;
    an ids list equal to ``[""]`` is counted as length 0 (it appears to be the
    placeholder for "no ids" -- confirm against the data format).

    Args:
        data_dir: Directory containing the JSON Lines files of one split.

    Returns:
        A dict mapping ``(source_len, target_len)`` to the number of records with
        that combination.
    """
    counts = {}
    # Sort the listing so the dict is filled in the same order on every run.
    for file_name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file_name)
        # Skip sub-directories (e.g. ".ipynb_checkpoints"); they cannot be opened as files.
        if not os.path.isfile(path):
            continue
        with jsonlines.open(path) as reader:
            for obj in reader:
                sl = len(obj["sources"]["ids"]) if obj["sources"]["ids"] != [""] else 0
                tl = len(obj["targets"]["ids"]) if obj["targets"]["ids"] != [""] else 0
                counts[(sl, tl)] = counts.get((sl, tl), 0) + 1
    return counts


# One histogram of (source_len, target_len) combinations per data split.
train_dict = _count_id_length_combinations(train_dir)
val_dict = _count_id_length_combinations(val_dir)
test_dict = _count_id_length_combinations(test_dir)

In [14]:
# Re-order each histogram so the most frequent (source_len, target_len) combination comes first.
train_dict, val_dict, test_dict = (
    dict(sorted(counts.items(), key=lambda entry: entry[1], reverse=True))
    for counts in (train_dict, val_dict, test_dict)
)

In [34]:
import csv
# Dump the test-split histogram as a TSV file: one row per (source_len, target_len) combination.
# newline="" lets the csv module control line endings itself (avoids blank rows on Windows).
with open("test_dict.tsv", "w", newline="") as fout:
    writer = csv.writer(fout, delimiter="\t")
    writer.writerow(["comb", "count"])
    # Each item is already a (combination, count) pair; writing the items directly avoids
    # rebinding the notebook-level name `k`, which holds the number of FAISS candidates.
    writer.writerows(test_dict.items())

In [25]:
train_dir = "../../../data-without-embeddings/opus/books/train/"
val_dir = "../../../data-without-embeddings/opus/books/val/"
test_dir = "../../../data-without-embeddings/opus/books/test/"


def _sum_ids_per_language_pair(data_dir, lang_offset):
    """Sum the number of source and target ids per (source_lang, target_lang) pair.

    The language codes are sliced out of each file name: two characters starting at
    ``lang_offset`` for the source language and two characters starting at
    ``lang_offset + 3`` for the target language. This presumably matches names such
    as "<split>.<src>-<tgt>..." -- confirm against the actual file names.

    Every regular file in ``data_dir`` is read as a JSON Lines file. An ids list
    equal to ``[""]`` contributes 0 (it appears to be the placeholder for "no ids").

    Args:
        data_dir: Directory containing the JSON Lines files of one split.
        lang_offset: Index in the file name where the source language code starts.

    Returns:
        A dict mapping ``(source_lang, target_lang)`` to a two-element list
        ``[total_source_ids, total_target_ids]``.
    """
    totals = {}
    # Sort the listing so the dict is filled in the same order on every run.
    for file_name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file_name)
        # Skip sub-directories (e.g. ".ipynb_checkpoints"); they cannot be opened as files.
        if not os.path.isfile(path):
            continue
        source_lang = file_name[lang_offset:lang_offset + 2]
        target_lang = file_name[lang_offset + 3:lang_offset + 5]
        with jsonlines.open(path) as reader:
            for obj in reader:
                sl = len(obj["sources"]["ids"]) if obj["sources"]["ids"] != [""] else 0
                tl = len(obj["targets"]["ids"]) if obj["targets"]["ids"] != [""] else 0
                # The entry is created on the first record of a pair, so files without
                # records do not add a key.
                pair_totals = totals.setdefault((source_lang, target_lang), [0, 0])
                pair_totals[0] += sl
                pair_totals[1] += tl
    return totals


# The offsets are the positions at which the original slices started for each split
# (6 for train, 4 for val, 5 for test).
train_len = _sum_ids_per_language_pair(train_dir, 6)
val_len = _sum_ids_per_language_pair(val_dir, 4)
test_len = _sum_ids_per_language_pair(test_dir, 5)

In [26]:
# Re-order each per-language-pair table so the pair with the largest source-side total comes first.
train_len, val_len, test_len = (
    dict(sorted(totals.items(), key=lambda entry: entry[1][0], reverse=True))
    for totals in (train_len, val_len, test_len)
)

In [31]:
import csv
# Dump the test-split totals as a TSV file: one row per (source, target) language pair.
# newline="" lets the csv module control line endings itself (avoids blank rows on Windows).
with open("test_len.tsv", "w", newline="") as fout:
    writer = csv.writer(fout, delimiter="\t")
    writer.writerow(["source", "target", "source_len", "target_len"])
    # Unpack inside a generator so no loop variable is left behind in the notebook namespace
    # (the previous loop rebound `k`, which holds the number of FAISS candidates).
    writer.writerows(
        (source_lang, target_lang, source_total, target_total)
        for (source_lang, target_lang), (source_total, target_total) in test_len.items()
    )