In [1]:
import os 
import re
import codecs
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

Берем книгу с сайта: https://studyenglishwords.com/book/Кентервильское-привидение/109?page=1 и скачиваем html **EN->RU**

In [2]:
MIN_SENTENCE_LEN = 3
LOW_SEM_TH, HIGH_SEM_TH = 0.7, 0.98
BOOK_NAME = "Martin-Iden"
PATH_TO_HTML = f"{BOOK_NAME}.html"
PATH_TO_OUTPUT_ENG_TXT = 'MartinIden_trans/test_eng.txt'
PATH_TO_OUTPUT_RUS_TXT = 'MartinIden_trans/test_rus.txt'
PATH_TO_TRANSLATED_ENG_TXT = 'MartinIden_trans/test_eng_translation.txt'
MULTILINGUAL_SEM_MODEL = 'distiluse-base-multilingual-cased-v2'
PATH_TO_OUTPUT_CSV = f'MartinIden_trans/translated_{BOOK_NAME}.csv'

In [3]:
def split_by_punk(string):
    string = string.replace('\n', ' ')
    sent_list = re.split('[.!?]', string)
    return sent_list

def parse_punk(string):
    string = re.sub(r"[^\w\d\'\-\s]+","",string)
    string = ' '.join(string.split())
    return string    

def write_to_file(file_name, content):
    with open(file_name, 'w') as f:
        for line in content:
            f.write(line)
    print(f"{len(content)} sentences were written to {file_name}")

In [4]:
def parse_html_to_txts(html, output_eng_file, output_rus_file, w_threshold):

    index, to_frame, rus_output, eng_output =0, [], [], []
    
    for chunk in html.split("id=")[1:]:
        eng_rus = chunk.split('</a>')[1].split('</td></tr><tr><td>')[0]
        eng, rus = eng_rus.split('</td><td>')
        eng_split, rus_split = split_by_punk(eng), split_by_punk(rus)
        
        # check that chunks have the same number of sentences 
        if len(eng_split) == len(rus_split):

            for eng_sent, rus_sent in zip(eng_split, rus_split):
                
                if (len(eng_sent.split()) > w_threshold) and (len(rus_sent.split()) > w_threshold):
                    parsed_eng = parse_punk(eng_sent)
                    parsed_rus = parse_punk(rus_sent)
                    
                    rus_output.append(f'{index} {parsed_rus}\n')
                    eng_output.append(f'{index} {parsed_eng}\n')

                    to_frame.append((index, rus_sent, eng_sent, parsed_rus, parsed_eng))
                    index+=1

    write_to_file(output_eng_file, eng_output)
    write_to_file(output_rus_file, rus_output)
        
    df = pd.DataFrame(to_frame, columns = ["index", "rus_sent_w_punc", "eng_sent_w_punc", "rus_sent_clean", "eng_sent_clean"])
    
    return df

In [5]:
def filter_trans_by_sem_sim(model_name, path_to_translated_eng, path_to_rus, path_to_ountput_csv, eng_text, low_th, high_th):

    with open(path_to_translated_eng, 'r') as file:
        trans = file.readlines()

    with open(path_to_rus, 'r') as file:
        rus = file.readlines()

    rus_corpus, trans_corpus = [], []

    for r, t in zip(trans, rus):
        _, r = re.sub("[.,\n]", "", r).split(' ', 1)
        _, t = re.sub("[.,\n]", "", t).split(' ', 1)
        rus_corpus.append(r)
        trans_corpus.append(t)
        
    print('Loading Semantic model')
    model = SentenceTransformer(model_name)
    print('Building embeddings for russian text (from someone translation)...')
    rus_embeddings = model.encode(rus_corpus)
    print('Building embeddings for russian text (from Yandex translation)...')
    trans_embeddings = model.encode(trans_corpus)
    print('Predict semantic scores...')
    scores = np.diag(cosine_similarity(rus_embeddings, trans_embeddings)).tolist()
    
    df_translated = pd.DataFrame(zip(scores, eng_text, rus_corpus, trans_corpus), columns = ['score','ENG', 'RUS_1', 'RUS_2'])
    df_translated = df_translated.loc[(df_translated.score > low_th) & (df_translated.score < high_th)].sort_values(by=["score"], ascending=False)
    df_translated.to_csv(path_to_ountput_csv, index=False)
    print(f"Result translations of {df_translated.shape[0]} were saved to {path_to_ountput_csv}")
    return df_translated

def print_examples(df, n_exmp, seed=1):
    for row in  df.sample(n=n_exmp, random_state=seed).to_numpy():
        print(f"Semantic score: {row[0]:.3f} \nENG: {row[1]}\nRUS_1: {row[2]} \nRUS_2: {row[3]}\n{'-'*150}")

### Parsing downloaded HTML into rus and eng txt

In [6]:
df_output = parse_html_to_txts(html=codecs.open(PATH_TO_HTML, 'r').read(),
                               output_eng_file=PATH_TO_OUTPUT_ENG_TXT, 
                               output_rus_file= PATH_TO_OUTPUT_RUS_TXT,
                               w_threshold=MIN_SENTENCE_LEN)

7427 sentences were written to MartinIden_trans/test_eng.txt
7427 sentences were written to MartinIden_trans/test_rus.txt


* Here should be translation of file from PATH_TO_OUTPUT_RUS_TXT to PATH_TO_TRANSLATED_ENG_TXT

###  Filtering translation based on semantic similarities of Russian references

In [7]:
df_translation = filter_trans_by_sem_sim(model_name=MULTILINGUAL_SEM_MODEL,
                                         path_to_translated_eng=PATH_TO_TRANSLATED_ENG_TXT, 
                                         path_to_rus=PATH_TO_OUTPUT_RUS_TXT,
                                         eng_text=df_output.eng_sent_clean.to_list(),
                                         path_to_ountput_csv=PATH_TO_OUTPUT_CSV, 
                                         low_th=LOW_SEM_TH, high_th=HIGH_SEM_TH)

Loading Semantic model
Building embeddings for russian text (from someone translation)...
Building embeddings for russian text (from Yandex translation)...
Predict semantic scores...
Result translations of 4877 were saved to MartinIden_trans/translated_Martin-Iden


In [8]:
print_examples(df_translation, 10, seed=123)

Semantic score: 0.854 
ENG: You do corporation work
RUS_1: Вы работаете в корпорации 
RUS_2: Вы работаете на акционерные компании
------------------------------------------------------------------------------------------------------------------------------------------------------
Semantic score: 0.731 
ENG: Waiting on the corner for a car she had seen him first and noted the eager hungry lines of his face and the desperate worried look of his eyes
RUS_1: Поджидая на углу машину она увидела его первой и заметила нетерпеливые голодные черты его лица и отчаянный обеспокоенный взгляд его глаз 
RUS_2: Гертруда ждала на углу трамвая и первая увидела брата заметила какое у него напряженное исхудалое лицо какое отчаяние и тревога в глазах
------------------------------------------------------------------------------------------------------------------------------------------------------
Semantic score: 0.809 
ENG: There was a peach from West Oakland
RUS_1: Там был персик из Западного Окленда 
