# Семантический поиск с применением векторизации слов в помощь переводчику

# Тренировка word2vec

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import string
import spacy
from spacy import displacy
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.lang.en import English


nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# функция для разбиения текста на предложения
def split_in_sentences(text):
    doc = nlp(text)
    return [str(sent).strip() for sent in doc.sents]


# функция для частичной (т. к. исходный текст очень большой) предобработки текста,
# заключающейся в удалении повторяющихся названий разделов и нумерации и 
# склеивании/разделении неправильно разделенных предложений
def preprocess(line):
  processed = []
  sent = []
  line = re.sub(r'(?<=\W)\”', '', re.sub(r'^\“', '', line))
  line = re.sub(r'^\d*?\.\s*(?=[A-Za-z])', '', line)
  line = re.sub(r'^\d*?\.\d*?\.\s*(?=[A-Z])', '', line)
  line = re.sub(r'^[A-Z]*?\.\s*(?=[A-Z])', '', line)
  line = re.sub(r'^\([a-z]\)\s*(?=[a-zA-Z])', '', line)
  line = re.sub(r'^\d*\([a-z]\)\s*(?=[a-zA-Z])', '', line)
  line = re.sub(r'^<...>\s*', '', line)
  line = line.strip('\n')
  if (line == 'THE CIRCUMSTANCES OF THE CASE' or line == 'PROCEDURE' or 
    line == 'THE FACTS' or line == 'RELEVANT DOMESTIC LAW AND PRACTICE' or 
    line == 'THE LAW' or line == 'Admissibility' or line == 'Merits' or 
    line == 'APPLICATION OF ARTICLE 41 OF THE CONVENTION' or 
    line == 'Damage' or line == 'Costs and expenses' or 
    line == 'FOR THESE REASONS, THE COURT UNANIMOUSLY' or 
    line == '' or 
    all(j.isdigit() or j in string.punctuation for j in line)):
    return []

  sent = split_in_sentences(line)
  corrected = []
  for i in sent:
    if i == '\\n':
      continue
    elif re.findall(r'\w\.\s(?=An|The|On|In)', i):
      i = re.split(r'(?<=[.])\s*(?=[A-Za-z])', i)
      for sen in i:
        corrected.append(sen)
    else:
      corrected.append(i)

  previous_cor = ''
  for cor in corrected:
    if len(cor) <= 4:
      continue
    if (previous_cor != '' and cor != '\n' and ((previous_cor[-1] == ',') 
    or (previous_cor[-1] == '.' and (cor[0].islower())) 
    or (previous_cor[-4:] == ' no.' and cor[0].isdigit()) 
    or (previous_cor[-4:] == ' nos.' and cor[0].isdigit()) 
    or (previous_cor[-4:] == '(no.' and cor[0].isdigit()) 
    or (previous_cor[-4:] == '(nos.' and cor[0].isdigit()) 
    or (previous_cor[-5:] == ' nos.' and cor[0].isdigit())
    or (previous_cor[-4:] == ' No.' and cor[0].isdigit()) 
    or (previous_cor[-4:] == ' Nos.' and cor[0].isdigit())
    or (previous_cor[-4:] == ' NO.' and cor[0].isdigit()) 
    or (previous_cor[-4:] == ' NOS.' and cor[0].isdigit()) 
    or (previous_cor[-3:] == ' p.' and cor[0].isdigit()) 
    or (previous_cor[-4:] == ' pp.' and cor[0].isdigit()) 
    or (previous_cor[-3:] == ' v.') 
    or (previous_cor[-1] == '§') 
    or (previous_cor[-4:] == 'Cap.') 
    or (previous_cor[-5:] == ' Doc.') 
    or (previous_cor[-1] == '['))):
      res = previous_cor + ' ' + cor
      processed[len(processed)-1] = res
      previous_cor = res
    else:
      processed.append(cor)
      previous_cor = cor
  return processed


In [None]:
# считываем файл в формате json из датасета и записываем предложения построчно в
# новый файл в текстовом формате
with open('/content/drive/MyDrive/cases.json',) as f:
  with open('preprocessed.txt', 'a') as record_file:
    while True:
      line = f.readline()
      if not line:
        break
      line = line.strip()

      if line[:12] == '"content": "' and line[-2:] == '",':
        json_line = '{' + line[:-1] + '}'
        json_line_data = json.loads(json_line)
        preproc_list = preprocess(json_line_data['content'])
        for el in preproc_list:
          record_file.write(el + '\n')

In [None]:
import nltk
import json

# nltk.download('punkt')
from nltk.tokenize import word_tokenize

# считываем предложения из файла preprocessed.txt, токенизируем их и записываем
# их в файл tokenized, чтобы не хранить в оперативной памяти  
with open('/content/drive/MyDrive/preprocessed.txt') as file:
  sentences = file.readlines()

tokenized = []
for sent in sentences:
  sent = sent.rstrip('\n')
  tokenized.append(word_tokenize(sent))

with open('tokenized.txt', 'a') as tok:
  for i in tokenized:
    i = json.dumps(i)
    tok.write(i + '\n')


In [2]:
import json
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader

# считываем предложения из файла tokenized.txt и подаем их модели для 
# построения векторов
tokens = []
with open('/content/drive/MyDrive/tokenized.txt', 'r') as tok:
  while True:
      sent = tok.readline()
      if not sent:
        break
      sent = json.loads(sent)
      tokens.append(sent)

# обучаем модель и сохраняем ее
model = Word2Vec(sentences=tokens, size=300, window=5, min_count=2, workers=10, iter=2, sg=1, negative=1)
model.save("word2vec.model")


In [4]:
# сохраняем полученные вектора
word_vectors = model.wv
word_vectors.save('word2vec.kv')

In [5]:
from gensim.models import KeyedVectors


# загружаем вектора обученной модели
model_vectors = KeyedVectors.load('word2vec.kv')

In [6]:
import numpy as np
from scipy import spatial
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
import json


index2word_set = set(model_vectors.index2word)

# функция для вычисления среднего вектора предложения
def avg_feature_vector(words, model, num_features, index2word_set):
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

# т. к. векторы всех предложений не удалось сохранить/правильно считать в виде 
# numpy массива по неизвестной причине, то для демонстрации работы программы 
# взят и векторизован объемный текст постановления 
# "CASE OF KHODORKOVSKIY AND LEBEDEV v. RUSSIA", входящего в состав исходного 
# датасета
tokenized_sens = []
with open('/content/drive/MyDrive/khodor_tokenized.txt', 'r') as tok:
  while True:
      sent = tok.readline()
      if not sent:
        break
      sent = json.loads(sent)
      tokenized_sens.append(sent)

text_processed = []
with open('/content/drive/MyDrive/khodor_processed.txt', 'r') as processed:
   text_processed = processed.readlines()

# вводится искомая фраза, переведенная на английский язык с помощью машинного 
# перевода, после чего выдаются три наиболее близких результата поиска,
# полученыые путем сравнения косинусного сходства усредненных векторов предложений
query = "a fake deal" 
query = word_tokenize(query)


sim_list = []
s1_afv = avg_feature_vector(query, model=model_vectors, num_features=300, index2word_set=index2word_set)


i = 0
for num, sent in enumerate(tokenized_sens):
  emb = avg_feature_vector(sent, model=model_vectors, num_features=300, index2word_set=index2word_set)
  sim = 1 - spatial.distance.cosine(s1_afv, emb)
  if sim > 0.7:
    result = sim, text_processed[i]
    sim_list.append(result)
  i += 1

best_result = sorted(sim_list, reverse=True)[:3]
for res in best_result:
  print(res[0])
  print(res[1])



0.7079602479934692
The merger was supposed to take place in two steps: firstly, completion of the deal on paper, and then unification of the new company’s management structures.

0.7048969268798828
A person cannot enter into a “sham” transaction by inadvertence; it is always a deliberate act.

0.7037367820739746
Even if it was true, it was perfectly normal for a businessman, for a number of reasons, to trade with the end-users not directly but through a corporate intermediary established by him.



Сравнение результатов работы модели word2vec-google-news-300 по тому же запросу (с предложениями, найденными моделью, обученной только на корпусе судебных текстов)

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader


# для сравнения была использована модель, обученная на датасете Google
google_model = gensim.downloader.load('word2vec-google-news-300')

# word_vectors = google_model.wv
google_model.wv.save("google2vec.wordvectors")

# wv = KeyedVectors.load("google2vec.wordvectors", mmap='r')
# google_model = wv

  # Remove the CWD from sys.path while we load stuff.


In [None]:
google_model = KeyedVectors.load("google2vec.wordvectors", mmap='r')

In [None]:
import numpy as np
from scipy import spatial
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import json

query = "fake deal"
query = word_tokenize(query)

index2word_set = set(google_model.index2word)
# функция для вычисления среднего вектора предложения
def avg_feature_vector(words, model, num_features, index2word_set):
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model[word])
    if (n_words > 0):
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

sentences = ['Even if it was true, it was perfectly normal for a businessman, for a number of reasons, to trade with the end-users not directly but through a corporate intermediary established by him.', 
             'A person cannot enter into a “sham” transaction by inadvertence; it is always a deliberate act.', 
             'The merger was supposed to take place in two steps: firstly, completion of the deal on paper, and then unification of the new company’s management structures.']
sim_list = []
s1_afv = avg_feature_vector(query, model=google_model, num_features=300, index2word_set=index2word_set)

for sent in sentences:
  s2_afv = avg_feature_vector(sent, model=google_model, num_features=300, index2word_set=index2word_set)
  sim = 1 - spatial.distance.cosine(s1_afv, s2_afv)
  print(sim, sent)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
0.0737365260720253 Even if it was true, it was perfectly normal for a businessman, for a number of reasons, to trade with the end-users not directly but through a corporate intermediary established by him.
0.06405121833086014 A person cannot enter into a “sham” transaction by inadvertence; it is always a deliberate act.
0.0636746808886528 The merger was supposed to take place in two steps: firstly, completion of the deal on paper, and then unification of the new company’s management structures.


Вывод: числовые показатели косинусного сходства у модели, обученной на судебных текстах, гораздо выше, чем у модели Google, обученной на наборе новостей.