In [150]:
import json
import pandas as pd
import pymorphy2
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import gensim.models
from gensim.models import Word2Vec
from gensim.models import FastText
from gensim.models import KeyedVectors
import gensim.downloader as api
import zipfile
import sys
import requests, io
import re 
import numpy as np

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Andrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [151]:
def preprocess(text:str, stop_words, punctuation_marks, morph):
    tokens = word_tokenize(text.lower())
    preprocessed_text = []
    for token in tokens:
        if token not in punctuation_marks:
            lemma = morph.parse(token)[0].normal_form
            if re.match(r'(\d.|\d)', lemma) is None:
                if lemma not in stop_words:
                    preprocessed_text.append(lemma)
    return preprocessed_text

def read_json(path: str):
    file = open(path)
    data = json.load(file)
    return pd.DataFrame(data)

punctuation_marks = ['!', ',', '(', ')', ';', ':', '-', '?', '.', '..', '...', "\"", "/", "\`\`", "»", "«" ]
stop_words = stopwords.words("russian")
morph = pymorphy2.MorphAnalyzer()

In [152]:
class ModelResearcher:
    def __init__(self):
        self.model = None
    
    def preprocess_and_save(self, data_df: pd.DataFrame, path, text_field='text') -> pd.DataFrame:
        # for preprocessing dataset. Use it only in critical cases cause it's too slow on big datasets
        data_df['preprocessed_' + text_field] = data_df.apply(lambda row: preprocess(row[text_field], punctuation_marks, stop_words, morph), axis=1)
        data_df_preprocessed = data_df.copy()
        data_df_preprocessed = data_df_preprocessed.drop(columns=[text_field], axis=1)
        data_df_preprocessed.reset_index(drop=True, inplace=True)
        if path is not None:
            data_df_preprocessed.to_json(path)
        return data_df_preprocessed
    
    def preprocess_and_save_pairs(self, data_df: pd.DataFrame, path, text_field_1, text_field_2) -> pd.DataFrame:
        data_df['preprocessed_' + text_field_1] = data_df.apply(lambda row: preprocess(row[text_field_1], punctuation_marks, stop_words, morph), axis=1)
        data_df['preprocessed_' + text_field_2] = data_df.apply(lambda row: preprocess(row[text_field_2], punctuation_marks, stop_words, morph), axis=1)
        data_df_preprocessed = data_df.copy()
        data_df_preprocessed = data_df_preprocessed.drop(columns=[text_field_1, text_field_2], axis=1)
        data_df_preprocessed.reset_index(drop=True, inplace=True)
        if path is not None:
            data_df_preprocessed.to_json(path)
        return data_df_preprocessed
    
    def train(self, data_df: pd.DataFrame, model="w2v"):            
        if model == "w2v":
            train_part = data_df['preprocessed_texts']
            self.model = gensim.models.Word2Vec(sentences=train_part, min_count=5, vector_size=50, epochs=5)
        elif model == "fast_text":
            print("fast_text")
            train_part = data_df['preprocessed_texts'].tolist()
            self.model = gensim.models.FastText(vector_size=50, min_count=5)
            self.model.build_vocab(corpus_iterable=train_part)
            self.model.train(corpus_iterable=train_part, total_examples=len(train_part), epochs=5)
        return
    
    def predict_sentences_similarity(self, sentences_1: pd.Series, sentences_2: pd.Series):
        if sentences_1.size != sentences_2.size:
            return None
        else:
            if self.model is not None:
                sentences_sim = np.zeros(sentences_1.size)
                sz = sentences_1.size
#                 print(f'index_to_key: {self.model.wv.index_to_key}')
                for i in range(sz): 
                    sentences_1_words = [w for w in sentences_1[i] if w in self.model.wv.index_to_key]
                    sentences_2_words = [w for w in sentences_2[i] if w in self.model.wv.index_to_key]
                    sim = self.model.wv.n_similarity(sentences_1_words, sentences_2_words)
                    sentences_sim[i] = sim
                
                return sentences_sim
            else:
                return None
            
   
        return round(float(2*TP / (2*TP + FP + FN)), 3)

In [153]:
 def calc_f1_score(sim, df, match_threshold):
    (TP, FP, FN, TN) = (0, 0, 0, 0)
    for i in range(sim.size):
        if df['need_match'][i]:
            if sim[i] >= match_threshold: 
                TP += 1
            else:
                FN += 1
        else:
            if sim[i] >= match_threshold: 
                FP += 1
            else:
                TN += 1
       
    return round(float(2*TP / (2*TP + FP + FN)), 3)

## Обучим word2vec

In [154]:
data_df = read_json('./preprocessed_documents.json')
modelResearcher_w2v = ModelResearcher()
modelResearcher_w2v.train(data_df, model="w2v")

In [155]:
match_unmatch_df = read_json('./dataset.json')

In [156]:
df_train_f1 = pd.concat([pd.DataFrame(match_unmatch_df[0:17]), pd.DataFrame(match_unmatch_df[30:])], axis=0)
df_test_f1 = pd.DataFrame(match_unmatch_df[17:30])

In [157]:
print('df_train: {} of {}'.format(df_train_f1['id_rp'].size, match_unmatch_df ['id_rp'].size))
print('df_test: {} of {}'.format(df_test_f1['id_rp'].size,match_unmatch_df['id_rp'].size))

df_train: 33 of 46
df_test: 13 of 46


In [158]:
df_train_f1 = modelResearcher_w2v.preprocess_and_save_pairs(df_train_f1, None, 'text_rp', 'text_proj')
df_test_f1 = modelResearcher_w2v.preprocess_and_save_pairs(df_test_f1, None, 'text_rp', 'text_proj')

In [159]:
result_w2v_train = modelResearcher_w2v.predict_sentences_similarity(df_train_f1['preprocessed_text_rp'], df_train_f1['preprocessed_text_proj'])

In [162]:
f1_w2v_train = calc_f1_score(result_w2v_train, df_train_f1, 0.78)
print('F1-score train w2v: {}'.format(f1_w2v_train ))
df_train_f1.drop('score', inplace=True, axis=1, errors='ignore')
df_train_f1.insert(loc=4, column='score', value=result_w2v) 
# df_train_f1

F1-score train w2v: 0.733


In [163]:
result_w2v_test = modelResearcher_w2v.predict_sentences_similarity(df_test_f1['preprocessed_text_rp'], df_test_f1['preprocessed_text_proj'])

In [164]:
f1_w2v_test = calc_f1_score(result_w2v_test, df_test_f1, 0.78)
print('F1-score test w2v: {}'.format(f1_w2v_test))

F1-score test w2v: 0.727


In [165]:
df_test_f1.drop('score', inplace=True, axis=1, errors='ignore')
df_test_f1.insert(loc=4, column='score', value=result_w2v_test) 
# df_test_f1

## FastText

In [175]:
modelResearcher_ft = ModelResearcher()
data_df = read_json('./preprocessed_documents.json')
modelResearcher_ft.train(data_df, model="fast_text")

In [177]:
result_ft_train = modelResearcher_ft.predict_sentences_similarity(df_train_f1['preprocessed_text_rp'], df_train_f1['preprocessed_text_proj'])
f1_ft_train = calc_f1_score(result_ft_train, df_train_f1, 0.78)
print('F1-score train ft: {}'.format(f1_ft_train ))

result_ft_test = modelResearcher_ft.predict_sentences_similarity(df_test_f1['preprocessed_text_rp'], df_test_f1['preprocessed_text_proj'])
f1_ft_test = calc_f1_score(result_ft_test, df_test_f1, 0.78)
print('F1-score test ft: {}'.format(f1_ft_test ))

F1-score train ft: 0.667
F1-score test ft: 0.833
