In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading hu

In [2]:
import time
import configparser
from itertools import islice
import pickle
# import numpy as np  # numpy + sentence_transformers = segmentation fault
# import cProfile

from sentence_transformers import SentenceTransformer, util

In [4]:
def main(loaded_model, source_path_en, source_path_ru, target_path_en, target_path_ru):
    """Run the script."""
    iteration = 0
    with open(source_path_en, 'r', encoding='utf-8') as in_f_en, \
       open(source_path_ru, 'r', encoding='utf-8') as in_f_ru, \
        open(target_path_en, 'wb') as to_f_en, \
            open(target_path_ru, 'wb') as to_f_ru:
        while True:
            print(f'------ Iteration {iteration} ---------')
            batch_size = 5000
            next_n_en = list(islice(in_f_en, batch_size))
            next_n_ru = list(islice(in_f_ru, batch_size))
            if not next_n_en:
                break
            emb_en = loaded_model.encode(next_n_en)
            emb_ru = loaded_model.encode(next_n_ru)
            iteration += 1

            pickle.dump(emb_en, to_f_en, protocol=pickle.HIGHEST_PROTOCOL)
            pickle.dump(emb_ru, to_f_ru, protocol=pickle.HIGHEST_PROTOCOL)
    
    print(f'Read {source_path_en}')
    print(f'Read {source_path_ru}')
    print(f'  Wrote {target_path_en}')
    print(f'  Wrote {target_path_ru}')


if __name__ == '__main__':
    bitext_model = 'LaBSE'
    loaded_model = SentenceTransformer(bitext_model)  #, device='mps')

    config = configparser.ConfigParser()
    config.read('config.ini')

    test = True
    if test:
        source_path_en = config['LaBSE-TEST-VECTORIZE']['source_file_en']
        source_path_ru = config['LaBSE-TEST-VECTORIZE']['source_file_ru']
        target_path_en = config['LaBSE-TEST-VECTORIZE']['target_file_en']
        target_path_ru = config['LaBSE-TEST-VECTORIZE']['target_file_ru']
    else:
        pass

    start_time = time.time()
    print(f'Start time: {time.strftime("%b %d %Y %H:%M:%S", time.gmtime(start_time))}')
    # cProfile.run('main()')
    main(loaded_model, source_path_en, source_path_ru, target_path_en, target_path_ru)

    print('-' * 20)
    print(f'Total time: {(time.time() - start_time)/60:.2f} minutes')
    print('-' * 20)

# Start time: Apr 19 2023 19:22:14
# ------ Iteration 0 ---------
# ------ Iteration 1 ---------
# Read data/5K_en.jaro_lines.txt
# Read data/5K_ru.jaro_lines.txt
#   Wrote data/5K_en.labse_emb.pkl
#   Wrote data/5K_ru.labse_emb.pkl
# --------------------
# Total time: 0.65 minutes

Start time: Apr 19 2023 19:22:14
------ Iteration 0 ---------
------ Iteration 1 ---------
Read data/5K_en.jaro_lines.txt
Read data/5K_ru.jaro_lines.txt
  Wrote data/5K_en.labse_emb.pkl
  Wrote data/5K_ru.labse_emb.pkl
--------------------
Total time: 0.65 minutes
--------------------


In [5]:
import time
import configparser
import pickle
import multiprocessing

from sentence_transformers import util


def cos_sim_func(data) -> list:
    """Return cosine similarities of vectors from two lists."""
    en_vectors, other_vectors = data[0], data[1]
    cos_list = []
    for en, other in zip(en_vectors, other_vectors):
        cos_list.append(util.cos_sim(en, other).item())
    return cos_list

# def flatten(lst) -> list:
#     """Flatten a list of lists."""
#     return [item for sublist in lst for item in sublist]


# def get_batch(seq, size=100):
#     """Split an array into subarrays.""" 
#     return [seq[i:i+size] for i in range(0, len(seq), size)]


def main(source_path_en, source_path_ru, target_path):
    """Run the script."""
    cos_similarities = []
    iteration = 0

    with open(source_path_en, 'rb') as en_f, \
       open(source_path_ru, 'rb') as ru_f:
        while True:
            print(f'------ Iteration {iteration} ---------')
            try:
                next_n_en_emb = pickle.load(en_f)
                next_n_ru_emb = pickle.load(ru_f)
            except EOFError:
                break
            # with multiprocessing.Pool() as pool:
                # temp_cos_sims = pool.map(cos_sim_func, list(zip(next_n_en_emb, next_n_ru_emb)))
            temp_cos_sims = cos_sim_func((next_n_en_emb, next_n_ru_emb))
            cos_similarities.extend(temp_cos_sims)
            iteration += 1
    
    print(f'Read {source_path_en}')
    print(f'Read {source_path_ru}')

    with open(target_path, 'w', encoding='utf-8') as to_f:
        for i in cos_similarities:
            to_f.write(str(i))
            to_f.write('\n')

    print(f'Wrote cos similarities to {target_path}')
    print(f'Use diplom_choose_cos_sim_cutoff_val.ipynb to choose cutoff value')
    


if __name__ == '__main__':
    config = configparser.ConfigParser()
    config.read('config.ini')

    test = True
    if test:
        source_path_en = config['LaBSE-COS-SIM-TEST']['source_file_en']
        source_path_ru = config['LaBSE-COS-SIM-TEST']['source_file_ru']
        target_path = config['LaBSE-COS-SIM-TEST']['target_file']
    else:
        pass

    start_time = time.time()
    print(f'Start time: {time.strftime("%b %d %Y %H:%M:%S", time.gmtime(start_time))}')
    # cProfile.run('main()')
    main(source_path_en, source_path_ru, target_path)

    print('-' * 20)
    print(f'Total time: {(time.time() - start_time)/60:.2f} minutes')
    print('-' * 20)

# Start time: Apr 19 2023 19:23:01
# ------ Iteration 0 ---------
# ------ Iteration 1 ---------
# Read data/5K_en.labse_emb.pkl
# Read data/5K_ru.labse_emb.pkl
# Wrote cos similarities to data/5K.labse_cos_sim.txt
# Use diplom_choose_cos_sim_cutoff_val.ipynb to choose cutoff value
# --------------------
# Total time: 0.01 minutes
# --------------------

Start time: Apr 19 2023 19:23:01
------ Iteration 0 ---------
------ Iteration 1 ---------
Read data/5K_en.labse_emb.pkl
Read data/5K_ru.labse_emb.pkl
Wrote cos similarities to data/5K.labse_cos_sim.txt
Use diplom_choose_cos_sim_cutoff_val.ipynb to choose cutoff value
--------------------
Total time: 0.01 minutes
--------------------
