In [1]:
import pymorphy2
import nltk
import ujson as json
import matplotlib.pyplot as plt
import numpy as np
import itertools
import gzip
import operator

from gensim.corpora.dictionary import Dictionary
from gensim.models import word2vec
from scipy.spatial.distance import cosine
from datetime import datetime
from collections import Counter

# Shared pymorphy2 analyzer instance; get_norm_word_v3 calls its parse() to obtain normal forms.
morth_analyzer = pymorphy2.MorphAnalyzer()

In [2]:
def split_words_v3(a_text):
    """Lazily split text into maximal runs of letters and maximal runs of digits.

    Characters that are neither alphabetic nor digits act as separators and
    are dropped. A letter run that touches a digit run is split at the
    boundary, so ``'pro2'`` produces ``'pro'`` and ``'2'``. Case is preserved.

    Args:
        a_text: iterable of characters (normally a ``str``).

    Yields:
        str: each letter-only or digit-only token, in order of appearance.
    """
    def char_kind(symbol):
        # One label per character class; None marks a separator.
        if symbol.isalpha():
            return 'alpha'
        if symbol.isdigit():
            return 'digit'
        return None

    # groupby merges neighbouring characters that share the same class.
    for kind, run in itertools.groupby(a_text, key=char_kind):
        if kind is not None:
            yield ''.join(run)
        
# Memo table: raw token -> normal form returned by the analyzer.
MORTH_CACHE = {}
def get_norm_word_v3(a_word):
    """Return the normal form of ``a_word``, caching the result in MORTH_CACHE.

    The first parse returned by ``morth_analyzer.parse`` is used; the analyzer
    is only consulted for words that are not cached yet.
    """
    try:
        return MORTH_CACHE[a_word]
    except KeyError:
        normal_form = morth_analyzer.parse(a_word)[0].normal_form
        MORTH_CACHE[a_word] = normal_form
        return normal_form

def get_doc_words(a_doc, a_split=split_words_v3, a_norm_word=get_norm_word_v3):
    """Yield the normalised tokens of a document: title first, then description.

    Args:
        a_doc: object exposing ``title`` and ``description`` attributes.
        a_split: callable turning a text into an iterable of tokens.
        a_norm_word: callable mapping a token to its normalised form.
    """
    title_tokens = a_split(a_doc.title)
    description_tokens = a_split(a_doc.description)
    yield from map(a_norm_word, itertools.chain(title_tokens, description_tokens))
        
def get_doc_words_(a_doc, a_split=split_words_v3, a_norm_word=get_norm_word_v3):
    """Backward-compatible alias of ``get_doc_words``.

    This function used to be a verbatim copy of ``get_doc_words``. It now
    delegates to it so the two can never drift apart.

    Args:
        a_doc: object exposing ``title`` and ``description`` attributes.
        a_split: callable turning a text into an iterable of tokens.
        a_norm_word: callable mapping a token to its normalised form.

    Yields:
        The normalised tokens of the title followed by those of the description.
    """
    yield from get_doc_words(a_doc, a_split, a_norm_word)

In [3]:
class Document:
    """One record of the JSONL dataset (title, description, url, site, timestamp)."""

    def __init__(self, init_dict):
        """Build a document from a decoded JSON object.

        Args:
            init_dict: mapping with optional keys ``title``, ``description``,
                ``url``, ``site`` and ``ts``. ``ts`` is passed to
                ``datetime.fromtimestamp``.
        """
        # `or ''` also covers an explicit JSON null, which would otherwise
        # make the tokenizer fail when iterating over None.
        self.title = init_dict.get('title', '') or ''
        self.description = init_dict.get('description', '') or ''
        self.url = init_dict.get('url', '')
        self.site = init_dict.get('site', '')
        # -1 stays as the "no timestamp" marker so existing callers see the same value.
        self.ts = datetime.fromtimestamp(init_dict['ts']) if 'ts' in init_dict else -1
        # `list += str` extends the list with single characters, so whole
        # tokens are collected explicitly instead.
        self.words = list(split_words_v3(self.title))
        self.words.extend(split_words_v3(self.description))

    def __str__(self):
        """Return a multi-line, human-readable dump of the document fields."""
        lines = (
            'url : %s\n' % self.url,
            'date : %s\n' % self.ts,
            'title : %s\n' % self.title,
            'description : %s\n' % self.description,
            'site : %s\n' % self.site,
        )
        return ''.join(lines)

In [4]:
# Preview the first 10 records; the context manager closes the gzip handle
# even if a line fails to parse.
with gzip.open('dataset_mai.jsonl.gz') as fin:
    for line in itertools.islice(fin, 10):
        data = json.loads(line.strip())
        print(Document(data))

url : http://bloknot-volzhsky.ru/news/volzhane-mogut-podat-zayavlenie-na-letnie-putevki-
date : 2019-11-30 18:26:10
title : –í–æ–ª–∂–∞–Ω–µ –º–æ–≥—É—Ç –ø–æ–¥–∞—Ç—å –∑–∞—è–≤–ª–µ–Ω–∏–µ –Ω–∞ –ª–µ—Ç–Ω–∏–µ –ø—É—Ç–µ–≤–∫–∏ –¥–ª—è –¥–µ—Ç–µ–π
description : –° –ø–æ–Ω–µ–¥–µ–ª—å–Ω–∏–∫–∞ –∑–∞—è–≤–ª–µ–Ω–∏—è –Ω–∞—á–∏–Ω–∞—é—Ç –ø—Ä–∏–Ω–∏–º–∞—Ç—å –≤ –ú–§–¶
site : bloknot-volzhsky.ru

url : https://trikky.ru/test-na-znanie-russkogo-yazyka-423354.html
date : 2019-11-30 18:26:48
title : üíó–¢–µ—Å—Ç –Ω–∞ –∑–Ω–∞–Ω–∏–µ —Ä—É—Å—Å–∫–æ–≥–æ —è–∑—ã–∫–∞üíó
description : –¢–µ—Å—Ç —Å–æ —Å–ª–æ–∂–Ω—ã–º–∏ –∏ –ª–µ–≥–∫–∏–º–∏ –≤–æ–ø—Ä–æ—Å–∞–º–∏. –î–ª—è –∫–æ–≥–æ-—Ç–æ –±—É–¥–µ—Ç –ª–µ–≥–∫–æ –Ω–∞–±—Ä–∞—Ç—å –≤—Å–µ 100 –±–∞–ª–ª–æ–≤, –∞ –∫–æ–º—É-—Ç–æ –±—É–¥–µ—Ç –Ω–µ–º–Ω–æ–≥–æ —Ç—è–∂–µ–ª–æ. –í –ª—é–±–æ–º —Å–ª—É—á–∞–µ –ø–æ–ø—Ä–æ–±–æ–≤–∞—Ç—å —Å—Ç–æ–∏—Ç.1. –ß—Ç–æ –∏–∑—É—á–∞–µ—Ç —Ñ—Ä–∞–∑–µ–æ–ª–æ–≥–∏—è? —Å–ø–æ—Å–æ–±—ã –æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è —Å–ª–æ–≤ —É—Å—Ç–æ–π—á–∏–≤—ã–µ —Å–æ—á–µ—Ç–∞–Ω–∏—è —Å–ª–æ–≤ —á–∞—Å—Ç–∏ —Ä–µ—á–∏2. –ù–∞ –º–µ—Å—Ç–µ 

In [5]:
dataset = []
dataset_test = []
# Load the first 10000 records; the context manager closes the gzip handle
# once reading is done.
with gzip.open('dataset_mai.jsonl.gz') as fin:
    for line in itertools.islice(fin, 10000):
        data = json.loads(line.strip())
        dataset.append(Document(data))

## ДЗ - реализовать поиск похожих документов по текстовым векторам и по word2vec векторам

In [125]:
def make_set(a_doc, a_data):
    """Return the distinct normalised words that two documents have in common.

    Both arguments are passed through ``get_doc_words``. The result is a list
    without duplicates whose order is unspecified.
    """
    own_words = set(get_doc_words(a_doc))
    other_words = set(get_doc_words(a_data))
    return list(own_words & other_words)

def get_word_match_most_similar_docs(a_doc, a_dataset, a_top_n=10):
    """Print the ``a_top_n`` documents sharing the most distinct words with ``a_doc``.

    For every printed document the share of the query's distinct normalised
    words that also occur in it is reported as ``Similar: <ratio>``.

    Args:
        a_doc: query document (anything ``get_doc_words`` accepts).
        a_dataset: iterable of candidate documents.
        a_top_n: number of best matches to print.
    """
    print("\nGET_WORD_MATCH_MOST_SIMILAR_DOCS FUNCTION \n")
    print("original doc: {}".format(a_doc))
    print("**************************************************************************************************************")
    # Normalise the query once instead of once per candidate.
    query_words = set(get_doc_words(a_doc))
    # Score each candidate exactly once; the score is reused for sorting and printing.
    scored = [(len(query_words & set(get_doc_words(item))), item) for item in a_dataset]
    # Stable sort: candidates with equal overlap keep their dataset order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    for overlap, item in itertools.islice(scored, a_top_n):
        # A query without any words matches nothing; avoid dividing by zero.
        share = overlap / len(query_words) if query_words else 0.0
        print("Similar: {}".format(share))
        print(item)
        
#################################################################################################################
def get_dict(a_dataset):
    """Normalise every document's words and build a gensim ``Dictionary`` from them.

    Side effect: each document's ``words`` attribute is replaced by the list
    produced by ``get_doc_words``.

    Args:
        a_dataset: iterable of documents. It is traversed only once, so a
            one-shot iterator or generator works as well as a list.

    Returns:
        Dictionary built over the normalised word lists.
    """
    token_lists = []
    for item in a_dataset:
        item.words = list(get_doc_words(item))
        # Same list object as item.words, so no extra copy is kept.
        token_lists.append(item.words)
    return Dictionary(token_lists)

def get_doc_vec(a_doc, a_dictionary):
    """Return the term-count vector of a document over the dictionary's vocabulary.

    Args:
        a_doc: object whose ``words`` attribute is a list of tokens.
        a_dictionary: object providing ``doc2idx(words)`` and ``len()``
            (a gensim ``Dictionary`` in this notebook).

    Returns:
        numpy.ndarray of length ``len(a_dictionary)``; position ``i`` holds how
        many times the word with id ``i`` occurs in the document.
    """
    res = np.zeros(len(a_dictionary))
    for i in a_dictionary.doc2idx(a_doc.words):
        # doc2idx reports out-of-vocabulary words as a negative index (-1 by
        # default); without this guard they would be counted in the last slot.
        if i >= 0:
            res[i] += 1
    return res

def get_tf_idf_most_similar_doc(a_doc, a_dataset, a_top_n=10):
    """Print the ``a_top_n`` documents closest to ``a_doc`` by cosine distance of count vectors.

    Original design note: a vector of dictionary size is built for every
    document and IDF is added, then cosine distance between documents is
    taken. NOTE: as written, no IDF weighting is applied; the vectors are
    raw term counts from ``get_doc_vec``.

    Side effect: ``words`` of ``a_doc`` and of every dataset document is
    replaced by its normalised word list.

    Args:
        a_doc: query document.
        a_dataset: re-iterable collection of candidate documents.
        a_top_n: number of nearest documents to print.
    """
    print("\nGET_TF_IDF_MOST_SIMILAR_DOC FUNCTION \n")
    print(f'original: \n{a_doc}')
    print("**************************************************************************************************************")
    dictionary = get_dict(a_dataset)
    # get_dict only refreshes documents inside a_dataset; normalise the query
    # the same way so it is comparable even when it is not part of the dataset.
    a_doc.words = list(get_doc_words(a_doc))
    doc_vec = get_doc_vec(a_doc, dictionary)

    def distance(item):
        dist = cosine(doc_vec, get_doc_vec(item, dictionary))
        # An all-zero vector yields NaN, which makes the sort order undefined;
        # rank such documents last instead.
        return float('inf') if np.isnan(dist) else dist

    for item in itertools.islice(sorted(a_dataset, key=distance), a_top_n):
        print(item)

#################################################################################################################
def get_vec_w2v(a_doc, a_data_w2v):
    """Return the averaged word vector of a document.

    Vectors of the words found in ``a_data_w2v`` are summed and divided by the
    total number of words in the document, so out-of-vocabulary words count
    as zero vectors.

    Args:
        a_doc: object whose ``words`` attribute is a list of tokens.
        a_data_w2v: either a trained model exposing its vectors as ``.wv`` or
            any mapping-like object supporting ``word in x`` and ``x[word]``.

    Returns:
        numpy.ndarray with the document vector; all zeros for a document
        without words.
    """
    # Lookups on the model object itself are not supported by gensim 4;
    # use the keyed vectors when the object provides them.
    vectors = getattr(a_data_w2v, 'wv', a_data_w2v)
    # Take the dimensionality from the vectors; 100 remains the fallback for
    # plain mappings that do not advertise a size.
    res = np.zeros(getattr(vectors, 'vector_size', 100))
    if not a_doc.words:
        # Nothing to average; avoids a 0/0 division that would yield NaNs.
        return res
    for word in a_doc.words:
        if word in vectors:
            res = res + vectors[word]
    return res / len(a_doc.words)

def get_w2v_most_similar_doc(a_doc, a_dataset, a_top_n=10):
    """Print the ``a_top_n`` documents closest to ``a_doc`` by cosine distance of averaged word2vec vectors.

    Original design note: the mean vector over all words of a document is
    computed (IDF could be taken into account when averaging) and documents
    are compared with ``cosine(x, doc_w2v)``.

    A Word2Vec model is trained on the dataset on every call. Side effect:
    ``words`` of ``a_doc`` and of every dataset document is replaced by its
    normalised word list.

    Args:
        a_doc: query document.
        a_dataset: re-iterable collection of candidate documents.
        a_top_n: number of nearest documents to print.
    """
    print("\nGET_W2V_MOST_SIMILAR_DOC FUNCTION \n")
    print("original doc: {}".format(a_doc))
    print("**************************************************************************************************************")
    for item in a_dataset:
        item.words = list(get_doc_words(item))
    # Normalise the query the same way, even when it is not part of a_dataset.
    a_doc.words = list(get_doc_words(a_doc))
    data_w2v = word2vec.Word2Vec([item.words for item in a_dataset], workers=4)
    # Word lookups go through the keyed vectors; indexing the model object
    # directly is not supported by gensim 4.
    vectors = data_w2v.wv
    doc_w2v = get_vec_w2v(a_doc, vectors)

    def distance(item):
        dist = cosine(doc_w2v, get_vec_w2v(item, vectors))
        # Documents without any known word give NaN; rank them last so the
        # sort order stays well defined.
        return float('inf') if np.isnan(dist) else dist

    for item in itertools.islice(sorted(a_dataset, key=distance), a_top_n):
        print(item)

In [9]:
dataset_w2v = []
dataset = []
# Load the first 100000 records into two independent lists of Document
# objects; the context manager closes the gzip handle once reading is done.
with gzip.open('dataset_mai.jsonl.gz') as fin:
    for line in itertools.islice(fin, 100000):
        data = json.loads(line.strip())
        dataset.append(Document(data))
        dataset_w2v.append(Document(data))

#### Тестовые данные

In [None]:
# Candidate query documents (indices into `dataset`) with their topics.
# Only the last assignment takes effect when the cell is run as a whole.
doc_id = 13 # dota2
doc_id = 1946 # horoscope
doc_id = 3388 # hockey
doc_id = 7601 # phones

In [126]:
doc_id = 13
query_doc = dataset[doc_id]
# Run every similarity search against the same query document; each one prints its own ranking.
for similart_func in (get_word_match_most_similar_docs, get_tf_idf_most_similar_doc, get_w2v_most_similar_doc):
    similart_func(query_doc, dataset)


GET_WORD_MATCH_MOST_SIMILAR_DOCS FUNCTION 

original doc: url : https://cyber.sports.ru/dota2/1080755627.html
date : 2019-11-30 18:26:39
title : –†–µ–∑—É–ª—å—Ç–∞—Ç—ã Parimatch League Dota 2. Virtus.pro –ø–æ–±–µ–¥–∏–ª–∞
description : 30 –Ω–æ—è–±—Ä—è –∑–∞–≤–µ—Ä—à–∏–ª—Å—è —Ç—É—Ä–Ω–∏—Ä¬†Parimatch League. –í —Ñ–∏–Ω–∞–ª–µ Virtus.pro —Ä–∞–∑–≥—Ä–æ–º–∏–ª–∞¬†HellRaisers¬†—Å–æ —Å—á–µ—Ç–æ–º 3:0 –∏ –∑–∞—Ä–∞–±–æ—Ç–∞–ª–∞ 40 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –õ–∞–Ω-—Ñ–∏–Ω–∞–ª Parimatch League –ø—Ä–æ—à–µ–ª —Å 28 –ø–æ 30 –Ω–æ—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ. 4 –∫–æ–º–∞–Ω–¥—ã —Ä–∞–∑—ã–≥—Ä–∞–ª–∏ 70 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫–æ–º–∞–Ω–¥ 1. Virtus.pro2.
site : cyber.sports.ru

**************************************************************************************************************
Similar: 1.0
url : https://cyber.sports.ru/dota2/1080755627.html
date : 2019-11-30 18:26:39
title : –†–µ–∑—É–ª—å—Ç–∞—Ç—ã Parimatch League Dota 2. Virtus.pro –ø–æ–±–µ–¥–∏–ª–∞
description : 30 –Ω–æ—è–±—Ä—è –∑–∞–≤–µ



url : https://cyber.sports.ru/dota2/1080755627.html
date : 2019-11-30 18:26:39
title : –†–µ–∑—É–ª—å—Ç–∞—Ç—ã Parimatch League Dota 2. Virtus.pro –ø–æ–±–µ–¥–∏–ª–∞
description : 30 –Ω–æ—è–±—Ä—è –∑–∞–≤–µ—Ä—à–∏–ª—Å—è —Ç—É—Ä–Ω–∏—Ä¬†Parimatch League. –í —Ñ–∏–Ω–∞–ª–µ Virtus.pro —Ä–∞–∑–≥—Ä–æ–º–∏–ª–∞¬†HellRaisers¬†—Å–æ —Å—á–µ—Ç–æ–º 3:0 –∏ –∑–∞—Ä–∞–±–æ—Ç–∞–ª–∞ 40 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –õ–∞–Ω-—Ñ–∏–Ω–∞–ª Parimatch League –ø—Ä–æ—à–µ–ª —Å 28 –ø–æ 30 –Ω–æ—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ. 4 –∫–æ–º–∞–Ω–¥—ã —Ä–∞–∑—ã–≥—Ä–∞–ª–∏ 70 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫–æ–º–∞–Ω–¥ 1. Virtus.pro2.
site : cyber.sports.ru

url : https://cyber.sports.ru/dota2/1080757051.html
date : 2019-11-30 18:37:09
title : –ü—Ä–∏–∑–æ–≤–æ–π —Ñ–æ–Ω–¥ Parimatch League Dota 2
description : 30 –Ω–æ—è–±—Ä—è –≤ –ú–æ—Å–∫–≤–µ –∑–∞–≤–µ—Ä—à–∏–ª—Å—è —Ç—É—Ä–Ω–∏—Ä¬†Parimatch League. –ü—Ä–∏–∑–æ–≤–æ–π —Ñ–æ–Ω–¥ —Ç—É—Ä–Ω–∏—Ä–∞ ‚Äì 70 —Ç—ã—Å—è—á –¥–æ–ª–ª–∞—Ä–æ–≤. –í —Ñ–∏–Ω–∞–ª–µ¬†Virtus.pro¬†—Ä–∞–∑–≥—Ä–æ–º–∏–ª–∞¬†HellRaisers¬†—Å–æ 

In [127]:
doc_id = 1946
query_doc = dataset[doc_id]
# Run every similarity search against the same query document; each one prints its own ranking.
for similart_func in (get_word_match_most_similar_docs, get_tf_idf_most_similar_doc, get_w2v_most_similar_doc):
    similart_func(query_doc, dataset)


GET_WORD_MATCH_MOST_SIMILAR_DOCS FUNCTION 

original doc: url : https://www.obozrevatel.com/astro/news/goroskop-na-1-dekabrya-chto-zhdet-lvov-rakov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-30 17:07:11
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 1 –¥–µ–∫–∞–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –õ—å–≤–æ–≤, –†–∞–∫–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ –≤–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ
site : obozrevatel.com

**************************************************************************************************************
Similar: 1.0
url : https://www.obozrevatel.com/astro/news/goroskop-na-1-dekabrya-chto-zhdet-lvov-rakov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-30 17:07:11
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 1 –¥–µ–∫–∞–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –õ—å–≤–æ–≤, –†–∞–∫–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ –≤–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ
site : obozrevatel.com

Similar: 0.8125
url : https://www.obozrevatel.c

url : https://www.obozrevatel.com/astro/news/goroskop-na-1-dekabrya-chto-zhdet-lvov-rakov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-30 17:07:11
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 1 –¥–µ–∫–∞–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –õ—å–≤–æ–≤, –†–∞–∫–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ –≤–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ
site : obozrevatel.com

url : https://www.obozrevatel.com/astro/news/goroskop-na-29-noyabrya-chto-zhdet-rakov-lvov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-29 00:36:09
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 29 –Ω–æ—è–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –†–∞–∫–æ–≤, –õ—å–≤–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ –ø—è—Ç–Ω–∏—Ü—É
site : obozrevatel.com

url : https://www.obozrevatel.com/astro/news/goroskop-na-28-noyabrya-chto-zhdet-rakov-lvov-dev-i-drugie-znaki-zodiaka1.htm
date : 2019-11-27 23:02:09
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 28 –Ω–æ—è–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –†–∞–∫–æ–≤, –õ—å–≤–æ–≤, –



url : https://www.obozrevatel.com/astro/news/goroskop-na-1-dekabrya-chto-zhdet-lvov-rakov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-30 17:07:11
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 1 –¥–µ–∫–∞–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –õ—å–≤–æ–≤, –†–∞–∫–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ –≤–æ—Å–∫—Ä–µ—Å–µ–Ω—å–µ
site : obozrevatel.com

url : https://www.obozrevatel.com/astro/news/goroskop-na-28-noyabrya-chto-zhdet-rakov-lvov-dev-i-drugie-znaki-zodiaka1.htm
date : 2019-11-27 23:02:09
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 28 –Ω–æ—è–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –†–∞–∫–æ–≤, –õ—å–≤–æ–≤, –î–µ–≤ –∏ –¥—Ä—É–≥–∏–µ –∑–Ω–∞–∫–∏ –∑–æ–¥–∏–∞–∫–∞
description : –°–æ–≤–µ—Ç—ã –∞—Å—Ç—Ä–æ–ª–æ–≥–æ–≤ –Ω–∞ —á–µ—Ç–≤–µ—Ä–≥
site : obozrevatel.com

url : https://www.obozrevatel.com/astro/news/goroskop-na-29-noyabrya-chto-zhdet-rakov-lvov-dev-i-drugie-znaki-zodiaka.htm
date : 2019-11-29 00:36:09
title : –ì–æ—Ä–æ—Å–∫–æ–ø –Ω–∞ 29 –Ω–æ—è–±—Ä—è: —á—Ç–æ –∂–¥–µ—Ç –†–∞–∫–æ–≤, –õ—å–≤–æ–≤, –

In [128]:
doc_id = 3388
query_doc = dataset[doc_id]
# Run every similarity search against the same query document; each one prints its own ranking.
for similart_func in (get_word_match_most_similar_docs, get_tf_idf_most_similar_doc, get_w2v_most_similar_doc):
    similart_func(query_doc, dataset)


GET_WORD_MATCH_MOST_SIMILAR_DOCS FUNCTION 

original doc: url : https://www.tv21.ru/news/2019/11/30/khokkeisty-murmana-proveli-pervyy-domashniy-match-na-stadione-stroitel
date : 2019-11-30 15:31:47
title : –•–æ–∫–∫–µ–∏—Å—Ç—ã "–ú—É—Ä–º–∞–Ω–∞" –ø—Ä–æ–≤–µ–ª–∏ –ø–µ—Ä–≤—ã–π –¥–æ–º–∞—à–Ω–∏–π –º–∞—Ç—á –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ "–°—Ç—Ä–æ–∏—Ç–µ–ª—å"
description : –ò–≥—Ä–∞ —Å–æ—Å—Ç–æ—è–ª–∞—Å—å –≤ —Ä–∞–º–∫–∞—Ö –æ—Ç–∫—Ä—ã—Ç–∏—è –Ω–æ–≤–æ–≥–æ —Å–µ–∑–æ–Ω–∞ –°—É–ø–µ—Ä–ª–∏–≥–∏ –ø–æ —Ö–æ–∫–∫–µ—é —Å –º—è—á–æ–º.
site : tv21.ru

**************************************************************************************************************
Similar: 1.0
url : https://www.tv21.ru/news/2019/11/30/khokkeisty-murmana-proveli-pervyy-domashniy-match-na-stadione-stroitel
date : 2019-11-30 15:31:47
title : –•–æ–∫–∫–µ–∏—Å—Ç—ã "–ú—É—Ä–º–∞–Ω–∞" –ø—Ä–æ–≤–µ–ª–∏ –ø–µ—Ä–≤—ã–π –¥–æ–º–∞—à–Ω–∏–π –º–∞—Ç—á –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ "–°—Ç—Ä–æ–∏—Ç–µ–ª—å"
description : –ò–≥—Ä–∞ —Å–æ—Å—Ç–æ—è–ª–∞—Å—å –≤ —Ä–∞–º–∫–∞—Ö –æ—Ç–∫—Ä—ã—Ç–∏—è –Ω–æ–≤–

url : https://www.tv21.ru/news/2019/11/30/khokkeisty-murmana-proveli-pervyy-domashniy-match-na-stadione-stroitel
date : 2019-11-30 15:31:47
title : –•–æ–∫–∫–µ–∏—Å—Ç—ã "–ú—É—Ä–º–∞–Ω–∞" –ø—Ä–æ–≤–µ–ª–∏ –ø–µ—Ä–≤—ã–π –¥–æ–º–∞—à–Ω–∏–π –º–∞—Ç—á –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ "–°—Ç—Ä–æ–∏—Ç–µ–ª—å"
description : –ò–≥—Ä–∞ —Å–æ—Å—Ç–æ—è–ª–∞—Å—å –≤ —Ä–∞–º–∫–∞—Ö –æ—Ç–∫—Ä—ã—Ç–∏—è –Ω–æ–≤–æ–≥–æ —Å–µ–∑–æ–Ω–∞ –°—É–ø–µ—Ä–ª–∏–≥–∏ –ø–æ —Ö–æ–∫–∫–µ—é —Å –º—è—á–æ–º.
site : tv21.ru

url : https://vk.com/@mlive51-v-murmansk-vozvraschaetsya-bolshoi-hokkei
date : 2019-11-29 18:37:54
title : –í –ú—É—Ä–º–∞–Ω—Å–∫ –≤–æ–∑–≤—Ä–∞—â–∞–µ—Ç—Å—è ¬´–±–æ–ª—å—à–æ–π —Ö–æ–∫–∫–µ–π¬ª
description : 30 –Ω–æ—è–±—Ä—è —Ö–æ–∫–∫–µ–π–Ω—ã–π –∫–ª—É–± ¬´–ú—É—Ä–º–∞–Ω¬ª –≤—Å—Ç—Ä–µ—Ç–∏—Ç—Å—è —Å —É–ª—å—è–Ω–æ–≤—Å–∫–æ–π ¬´–í–æ–ª–≥–æ–π¬ª –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ ¬´–°—Ç—Ä–æ–∏—Ç–µ–ª—å¬ª –≤ —Ä–∞–º–∫–∞—Ö –ß–µ–º–ø–∏–æ–Ω–∞—Ç–∞ –†–æ—Å—Å–∏–∏ –ø–æ —Ö–æ–∫–∫–µ—é —Å –º—è—á–æ–º –°—É–ø–µ—Ä–ª–∏–≥–∏ —Å–µ–∑–æ–Ω–∞ 2019/2020.
site : vk.com

url : https://b-port.com/news/234153
dat



url : https://www.tv21.ru/news/2019/11/30/khokkeisty-murmana-proveli-pervyy-domashniy-match-na-stadione-stroitel
date : 2019-11-30 15:31:47
title : –•–æ–∫–∫–µ–∏—Å—Ç—ã "–ú—É—Ä–º–∞–Ω–∞" –ø—Ä–æ–≤–µ–ª–∏ –ø–µ—Ä–≤—ã–π –¥–æ–º–∞—à–Ω–∏–π –º–∞—Ç—á –Ω–∞ —Å—Ç–∞–¥–∏–æ–Ω–µ "–°—Ç—Ä–æ–∏—Ç–µ–ª—å"
description : –ò–≥—Ä–∞ —Å–æ—Å—Ç–æ—è–ª–∞—Å—å –≤ —Ä–∞–º–∫–∞—Ö –æ—Ç–∫—Ä—ã—Ç–∏—è –Ω–æ–≤–æ–≥–æ —Å–µ–∑–æ–Ω–∞ –°—É–ø–µ—Ä–ª–∏–≥–∏ –ø–æ —Ö–æ–∫–∫–µ—é —Å –º—è—á–æ–º.
site : tv21.ru

url : https://mgimo.ru/about/news/social/mini-futbol-boevaya-nichya-s-akademiey-mchs/
date : 2019-11-29 14:19:06
title : –ú–∏–Ω–∏-—Ñ—É—Ç–±–æ–ª: –±–æ–µ–≤–∞—è –Ω–∏—á—å—è —Å –ê–∫–∞–¥–µ–º–∏–µ–π –ú–ß–°
description : 28 –Ω–æ—è–±—Ä—è —Å–±–æ—Ä–Ω–∞—è –ú–ì–ò–ú–û –ø–æ –º–∏–Ω–∏-—Ñ—É—Ç–±–æ–ª—É –≤ —Ä–∞–º–∫–∞—Ö –ú–æ—Å–∫–æ–≤—Å–∫–∏—Ö —Å—Ç—É–¥–µ–Ω—á–µ—Å–∫–∏—Ö —Å–ø–æ—Ä—Ç–∏–≤–Ω—ã—Ö –∏–≥—Ä –ø—Ä–æ–≤–µ–ª–∞ –º–∞—Ç—á –Ω–∞ –≤—ã–µ–∑–¥–µ —Å –∫–æ–º–∞–Ω–¥–æ–π –ê–ì–ü–°. –° –ø–µ—Ä–≤–æ–π –¥–æ –ø–æ—Å–ª–µ–¥–Ω–µ–π —Å–µ–∫—É–Ω–¥—ã —à–ª–∞ –±–µ—Å–∫–æ–º–ø—Ä–æ–º–∏—Å—Å–Ω–∞—è –±–æ—Ä—å–

In [129]:
doc_id = 7601
query_doc = dataset[doc_id]
# Run every similarity search against the same query document; each one prints its own ranking.
for similart_func in (get_word_match_most_similar_docs, get_tf_idf_most_similar_doc, get_w2v_most_similar_doc):
    similart_func(query_doc, dataset)


GET_WORD_MATCH_MOST_SIMILAR_DOCS FUNCTION 

original doc: url : https://megaobzor.com/Stala-izvestna-cena-smartfona-Redmi-K30.html
date : 2019-11-30 12:12:16
title : –°—Ç–∞–ª–∞ –∏–∑–≤–µ—Å—Ç–Ω–∞ —Ü–µ–Ω–∞ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30
description : –ê–≤—Ç–æ—Ä–∏—Ç–µ—Ç–Ω—ã–π –∏—Å–∫–∞—Ç–µ–ª—å —É—Ç–µ—á–µ–∫ –ú—É–∫—É–ª –®–∞—Ä–º–∞ –ø–æ–¥–µ–ª–∏–ª—Å—è –ø–æ–¥—Ä–æ–±–Ω–æ—Å—Ç—è–º–∏ –æ —Ü–µ–Ω–µ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30, –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∞–Ω–æ–Ω—Å –∫–æ—Ç–æ—Ä–æ–≥–æ —Å–æ—Å—Ç–æ–∏—Ç—Å—è —É–∂–µ 10 –¥–µ–∫–∞–±—Ä—è. –ï—Å–ª–∏ –≤–µ—Ä–∏—Ç—å –∏—Å—Ç–æ—á–Ω–∏–∫—É, –∞–ø–ø–∞—Ä–∞—Ç –æ–±–æ–π–¥–µ—Ç—Å—è –≤ 327 –¥–æ–ª–ª–∞—Ä–æ–≤, —á—Ç–æ –Ω–∞–º–Ω–æ–≥–æ –±–æ–ª—å—à–µ 285 –¥–æ–ª–ª–∞—Ä–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –µ–º—É –ø—Ä–∏–ø–∏—Å—ã–≤–∞–ª–∏ —Ä–∞–Ω–µ–µ. –ü–æ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω—ã–º –¥–∞–Ω–Ω—ã–º, Redmi K30 –ø–æ–ª—É—á–∏—Ç –∞–∫–∫—É–º—É–ª—è—Ç–æ—Ä —ë–º–∫–æ—Å—Ç—å—é 5000 –º–ê—á, –∫–≤–∞–¥—Ä–æ–∫–∞–º–µ—Ä—É —Å –≥–ª–∞–≤–Ω—ã–º –¥–∞—Ç—á–∏–∫–æ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è —Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ–º 60 –ú–ø, –¥–∏—Å–ø–ª–µ–π —Å —á–∞—Å—Ç–æ—Ç–æ–π

url : https://megaobzor.com/Stala-izvestna-cena-smartfona-Redmi-K30.html
date : 2019-11-30 12:12:16
title : –°—Ç–∞–ª–∞ –∏–∑–≤–µ—Å—Ç–Ω–∞ —Ü–µ–Ω–∞ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30
description : –ê–≤—Ç–æ—Ä–∏—Ç–µ—Ç–Ω—ã–π –∏—Å–∫–∞—Ç–µ–ª—å —É—Ç–µ—á–µ–∫ –ú—É–∫—É–ª –®–∞—Ä–º–∞ –ø–æ–¥–µ–ª–∏–ª—Å—è –ø–æ–¥—Ä–æ–±–Ω–æ—Å—Ç—è–º–∏ –æ —Ü–µ–Ω–µ —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30, –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω—ã–π –∞–Ω–æ–Ω—Å –∫–æ—Ç–æ—Ä–æ–≥–æ —Å–æ—Å—Ç–æ–∏—Ç—Å—è —É–∂–µ 10 –¥–µ–∫–∞–±—Ä—è. –ï—Å–ª–∏ –≤–µ—Ä–∏—Ç—å –∏—Å—Ç–æ—á–Ω–∏–∫—É, –∞–ø–ø–∞—Ä–∞—Ç –æ–±–æ–π–¥–µ—Ç—Å—è –≤ 327 –¥–æ–ª–ª–∞—Ä–æ–≤, —á—Ç–æ –Ω–∞–º–Ω–æ–≥–æ –±–æ–ª—å—à–µ 285 –¥–æ–ª–ª–∞—Ä–æ–≤, –∫–æ—Ç–æ—Ä—ã–µ –µ–º—É –ø—Ä–∏–ø–∏—Å—ã–≤–∞–ª–∏ —Ä–∞–Ω–µ–µ. –ü–æ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω—ã–º –¥–∞–Ω–Ω—ã–º, Redmi K30 –ø–æ–ª—É—á–∏—Ç –∞–∫–∫—É–º—É–ª—è—Ç–æ—Ä —ë–º–∫–æ—Å—Ç—å—é 5000 –º–ê—á, –∫–≤–∞–¥—Ä–æ–∫–∞–º–µ—Ä—É —Å –≥–ª–∞–≤–Ω—ã–º –¥–∞—Ç—á–∏–∫–æ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è —Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ–º 60 –ú–ø, –¥–∏—Å–ø–ª–µ–π —Å —á–∞—Å—Ç–æ—Ç–æ–π –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è 120 –ì—Ü, –¥–≤–æ–π–Ω—É—é —Ñ—Ä–æ–Ω—Ç–∞



url : https://megaobzor.com/Redmi-ne-stali-zamorachivatsja-s-reklamoi-Redmi-K30.html
date : 2019-11-28 14:06:22
title : Redmi –Ω–µ —Å—Ç–∞–ª–∏ –∑–∞–º–æ—Ä–∞—á–∏–≤–∞—Ç—å—Å—è —Å —Ä–µ–∫–ª–∞–º–æ–π Redmi K30
description : –ë—Ä–µ–Ω–¥ Redmi –Ω–∞—á–∞–ª —Ä–µ–∫–ª–∞–º–Ω—É—é –∫–∞–º–ø–∞–Ω–∏—é —Å–º–∞—Ä—Ç—Ñ–æ–Ω–∞ Redmi K30, –∫–æ—Ç–æ—Ä—ã–π –±—É–¥–µ—Ç –æ—Ñ–∏—Ü–∏–∞–ª—å–Ω–æ –ø—Ä–µ–¥—Å—Ç–∞–≤–ª–µ–Ω 10 –¥–µ–∫–∞–±—Ä—è. –ó–∞–±–∞–≤–Ω–æ, —á—Ç–æ —Ç–∏–∑–µ—Ä –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏ –∏–¥–µ–Ω—Ç–∏—á–µ–Ω —Ä–µ–∫–ª–∞–º–µ –ø—Ä–µ–¥—à–µ—Å—Ç–≤–µ–Ω–Ω–∏–∫–∞ Redmi K20 ‚Äì –±–æ–∫—Å–µ—Ä—Å–∫–∏–µ –ø–µ—Ä—á–∞—Ç–∫–∏ –∏ –Ω–∞–¥–ø–∏—Å—å –æ –≥–æ—Ç–æ–≤–Ω–æ—Å—Ç–∏ –æ—Ç–ø—Ä–∞–≤–∏—Ç—å –≤ –Ω–æ–∫–∞—É—Ç –∫–æ–Ω–∫—É—Ä–µ–Ω—Ç–æ–≤. –ü–æ –ø—Ä–µ–¥–≤–∞—Ä–∏—Ç–µ–ª—å–Ω—ã–º –¥–∞–Ω–Ω—ã–º, Redmi K30 –ø–æ–ª—É—á–∏—Ç –∞–∫–∫—É–º—É–ª—è—Ç–æ—Ä —ë–º–∫–æ—Å—Ç—å—é 5000 –º–ê—á, –∫–≤–∞–¥—Ä–æ–∫–∞–º–µ—Ä—É —Å –≥–ª–∞–≤–Ω—ã–º –¥–∞—Ç—á–∏–∫–æ–º –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è —Ä–∞–∑—Ä–µ—à–µ–Ω–∏–µ–º 60 –ú–ø, –¥–∏—Å–ø–ª–µ–π —Å —á–∞—Å—Ç–æ—Ç–æ–π –æ–±–Ω–æ–≤–ª–µ–Ω–∏—è 120 –ì—Ü, –¥–≤–æ–π–Ω—É—é —Ñ—