In [None]:
from collections import defaultdict
import os
from tqdm.notebook import tqdm
from pyaspeller import YandexSpeller
import codecs
import json
import pickle
import re
import requests
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool 
from multiprocessing.dummy import Lock as ThreadLock 
from multiprocessing.dummy import Value as ThreadValue
import functools
from string import punctuation
from re import escape
import pymorphy2
# Shared NLP resources used by the cells below.
# pymorphy2 analyser; later cells take `.normal_form` of its first parse as the lemma.
morph = pymorphy2.MorphAnalyzer()

stemmerE = SnowballStemmer("english")
stemmerR = SnowballStemmer("russian")
stop_words = set(nltk.corpus.stopwords.words(["russian", "english"]))
# Extend the ASCII punctuation imported from `string` with typographic marks.
punctuation += '«»©—.'
def illustration(func):
    """
    Decorator that prints a running count of completed calls.

    Each time the wrapped function returns, a counter shared by all calls is
    incremented under a lock and a one-line status message is rewritten on
    stdout (carriage return, no newline). The wrapped function's return value
    is passed through unchanged; a call that raises is not counted.
    """
    lock = ThreadLock()
    processed = ThreadValue('i', 0)

    @functools.wraps(func)
    def wrapper(*args, **argv):
        outcome = func(*args, **argv)
        lock.acquire()
        try:
            processed.value += 1
            print(f"\r{processed.value} objects are processed...",end ='',flush = True)
        finally:
            lock.release()
        return outcome
    return wrapper

def dopparse(ad1):
    """Lemmatise one parsed document and store the result in ``final_text``.

    Args:
        ad1: document name relative to ``content/content`` (``"<subdir>/<file>"``).

    Reads the global ``doc_to_text`` and writes the global ``final_text``;
    both are keyed by the full ``content/content/...`` path. Each token is
    replaced by the normal form of its first pymorphy2 parse.
    """
    file = 'content/content/{}'.format(ad1)
    # doc_to_text holds each document as ONE space-joined string, so it has to
    # be split into tokens first; iterating the string directly would
    # lemmatise it character by character.
    tokens = doc_to_text[file].split()
    final_text[file] = ' '.join(morph.parse(token)[0].normal_form for token in tokens)
@illustration
def final_dopparse(ad):
    """Thread-pool entry point: run ``dopparse`` and record failures.

    A document that fails is appended to the global ``mistakes`` list instead
    of aborting the whole pool run.
    """
    try:
        dopparse(ad)
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still
        # propagate; report which document failed and why.
        print(f'\nfailed to lemmatise {ad}: {exc!r}')
        mistakes.append(ad)
# Collect every document as "<subdir>/<file>", relative to content/content.
file_name = []
for subdir in tqdm(os.listdir('content/content')):
    file_name.extend(
        subdir + '/' + entry
        for entry in os.listdir('content/content' + '/' + subdir)
    )


In [None]:
# Map numeric document id -> URL, read from tab-separated "<id>\t<url>" lines.
docid_urls = defaultdict(str)
with open('urls.numerate.txt') as f:
    for url in f:
        # rstrip instead of url[:-1]: slicing chops a real character off the
        # last record when the file does not end with a newline.
        record = url.rstrip('\n')
        if not record:
            continue  # tolerate blank lines instead of failing on int('')
        line = record.split('\t')
        docid_urls[int(line[0])] = line[1]
        
# Map URL -> path of the downloaded document; the URL is the first line of each file.
url_docname = defaultdict(str)
for paths in tqdm(os.listdir('content/content')):
    for path in os.listdir('content/content' + '/' + paths):
        doc_path = 'content/content/' + paths + '/' + path
        with codecs.open(doc_path, encoding='utf-8') as f:
            # readline() + rstrip instead of next(f)[:-1]:
            #  * an empty file no longer aborts the whole scan with StopIteration;
            #  * a first line without a trailing newline keeps its last character;
            #  * codecs.open does not translate newlines, so strip '\r' as well.
            first_line = f.readline().rstrip('\r\n')
        if first_line:
            url_docname[first_line] = doc_path
# with open('url_docname.pkl','wb') as f:
#     pickle.dump(url_docname, f)
# with open('docid_urls.pkl','wb') as f:
#     pickle.dump(docid_urls, f)

In [None]:
# Add more typographic marks (and the space) to the punctuation string, then
# regex-escape the combined string in a single step.
punctuation = escape(punctuation + '«»©—. …”')

In [None]:
# with open('doc_to_text.pkl','wb') as f:
#     pickle.dump(doc_to_text,f)

# with open('doc_to_text.pkl','rb') as f:
#     doc_to_text = pickle.load(f)

In [None]:
def change_mistakes(string, timeout=30):
    """Return ``string`` with Yandex.Speller's first suggestions applied.

    Args:
        string: text to spell-check.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The text with every reported fragment replaced by its first
        suggestion; fragments without suggestions are left untouched.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response.
    """
    req = requests.post(
        'https://speller.yandex.net/services/spellservice.json/checkText',
        data={'text': string},
        timeout=timeout,
    )
    req.raise_for_status()
    new_string = string
    # Running difference between original and corrected lengths, used to map
    # the service's positions (which refer to the original text) onto
    # new_string. Assumes entries arrive ordered by 'pos' — TODO confirm.
    minus_len = 0
    for word in req.json():
        suggestions = word['s']
        if not suggestions:
            # Nothing to substitute; indexing [0] here would raise IndexError.
            continue
        start = word['pos'] - minus_len
        new_string = new_string[:start] + suggestions[0] + new_string[start + word['len']:]
        minus_len += word['len'] - len(suggestions[0])
    return new_string
def russ(word):
    """Tell whether ``word`` starts with a Cyrillic letter or a decimal digit.

    Only the first character is inspected. An empty ``word`` raises IndexError.
    """
    allowed_first = 'абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ1234567890'
    return word[0] in allowed_first
# Load the numbered queries ("<id>\t<query>" per line) and spell-correct each one.
id_queries = defaultdict(str)
with codecs.open('queries.numerate.txt',encoding = 'utf-8') as f:
    for url in tqdm(f):
        # rstrip instead of url[:-1]: keeps the last character of a final line
        # that has no newline, and drops the '\r' that codecs.open leaves in
        # place for CRLF files.
        record = url.rstrip('\r\n')
        if not record:
            continue  # tolerate blank lines instead of failing on int('')
        line = record.split('\t')
        id_queries[int(line[0])] = change_mistakes(line[1])
# Every whitespace-separated token that occurs in any corrected query.
all_words = {word for x in id_queries.values() for word in x.split()}
# Lower-cased query tokens that do not start with a Cyrillic letter or digit
# (per russ), excluding a few URL fragments and single characters. Kept as a
# list, so it may contain duplicates after lower-casing.
eng_words = []
for word in all_words:
    if not russ(word) and word not in ['http','ru','com'] and len(word)>1:
        eng_words.append(word.lower())

In [None]:
def tag_visible(element):
    """Return False for nodes whose parent tag is in a fixed list of non-content tags.

    Used as a ``filter`` predicate over the text nodes of a parsed page; only
    ``element.parent.name`` is inspected.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    return element.parent.name not in hidden_parents


def Parse(ad1):
    """Extract the visible text of one HTML document as a token string.

    Args:
        ad1: document name relative to ``content/content`` (``"<subdir>/<file>"``).

    The text nodes accepted by ``tag_visible`` are joined and split on
    whitespace; in every token each character found in the global
    ``punctuation`` string is replaced by a space and the token is
    lower-cased. A token is kept when it is listed in the global
    ``eng_words`` or contains no lowercase ASCII letter at all. The kept
    tokens, joined by single spaces, are stored in the global ``doc_to_text``
    under the full ``content/content/...`` path.
    """
    lit=set('qwertyuiopasdfghjklzxcvbnm')
    file = 'content/content/{}'.format(ad1)
    # Context manager so the handle is closed as soon as the markup has been
    # read; the original never closed it, leaking one descriptor per document.
    with codecs.open(file, encoding='utf-8',errors = 'ignore') as f:
        soup = BeautifulSoup(f, 'lxml')
    cont = soup.findAll(text=True)
    visible_texts = filter(tag_visible, cont)
    text = ' '.join(visible_texts).strip()
    result = []
    for raw in text.split():
        # Punctuation becomes a space (not removed), so a token may contain
        # inner spaces; they disappear when the stored string is split again.
        word = ''.join(c if c not in punctuation else ' ' for c in raw).lower()
        if word in eng_words or not any(c in lit for c in word):
            result.append(word)
    doc_to_text[file] = ' '.join(result)
@illustration
def final_Parse(ad):
    """Thread-pool entry point: run ``Parse`` and record failures.

    A document that fails is appended to the global ``mistakes`` list instead
    of aborting the whole pool run.
    """
    try:
        Parse(ad)
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still
        # propagate; report which document failed and why.
        print(f'\nfailed to parse {ad}: {exc!r}')
        mistakes.append(ad)

In [None]:
# Fresh containers for this run: failed documents, and parsed text per document path.
mistakes = []
doc_to_text = defaultdict(str)
# Parse all documents on 10 worker threads.
with ThreadPool(10) as workers:
    workers.map(final_Parse, file_name)

In [None]:
# Persist the parsed documents.
with open('doc_to_text.pkl','wb') as out:
    pickle.dump(doc_to_text,out)

In [None]:
# Regex patterns that are deleted from every token before lemmatisation.
for_sub = [
    r'\xad',     # regex escape for U+00AD (soft hyphen)
    r'…',
    r'\u200b',   # regex escape for U+200B (zero-width space)
    r'¦',
    r'”',
    r'“',
    '→',
    '★',
]

In [None]:
# Vocabulary: every token of every parsed document with the for_sub patterns removed.
words = set()
for key in tqdm(doc_to_text.keys()):
    for token in doc_to_text[key].split():
        words.add(
            functools.reduce(lambda acc, pattern: re.sub(pattern, '', acc), for_sub, token)
        )

In [None]:
dict_for_lemma = {}
@illustration
def lemma_for_word(word):
    """Store the normal form of the first pymorphy2 parse of ``word`` in ``dict_for_lemma``."""
    first_parse = morph.parse(word)[0]
    dict_for_lemma[word] = first_parse.normal_form

In [None]:
# dict_for_lemma = {}
# with ThreadPool(10) as pool: 
#     pool.map(lemma_for_word, words)

In [None]:
# token -> lemma for the whole vocabulary, computed sequentially.
dict_for_lemma_cirkle = defaultdict(str)
dict_for_lemma_cirkle.update(
    (token, morph.parse(token)[0].normal_form) for token in tqdm(words)
)

In [None]:
doc_to_text_new = {}
def test_lemma(ad1):
    """Replace every token of one parsed document by its lemma.

    Args:
        ad1: document name relative to ``content/content`` (``"<subdir>/<file>"``).

    Reads the globals ``doc_to_text``, ``for_sub`` and
    ``dict_for_lemma_cirkle``; writes the space-joined lemmas to the global
    ``doc_to_text_new`` under ``ad1``. Tokens without a lemma are dropped.
    Any error is reported on stdout and the document is skipped.
    """
    try:
        file = 'content/content/{}'.format(ad1)
        lemmas = []
        # .get() so that a missing document does not insert an empty entry
        # into the doc_to_text defaultdict as a side effect of the lookup.
        for word in doc_to_text.get(file, '').split():
            # The lemma table is keyed by tokens with the for_sub patterns
            # removed, so the same cleaning is needed before the lookup;
            # otherwise such tokens silently map to ''.
            for s in for_sub:
                word = re.sub(s, '', word)
            # .get() for the same reason: no junk keys in the lemma table.
            lemma = dict_for_lemma_cirkle.get(word, '')
            if lemma:
                lemmas.append(lemma)
        doc_to_text_new[ad1] = ' '.join(lemmas)
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt still stops the loop.
        print(ad1, repr(exc))

In [None]:
# Trial run on the first 1000 documents only.
trial_docs = file_name[:1000]
for ad in tqdm(trial_docs):
    test_lemma(ad)

In [None]:
# Collect word forms from the reference file: lines that start with a space
# are skipped; for the others the part before the first '|' is split on
# whitespace and every piece is kept.
all_lemms = set()
with codecs.open('Полная парадигма. Морфология. Орфоэпия. Частотность.txt',encoding = 'Windows-1251') as f:
    i = 0
    for i, line in enumerate(tqdm(f), start=1):
        if line[0] == ' ':
            continue
        all_lemms.update(line.split('|')[0].split())
# Normal forms (first pymorphy2 parse) of all collected word forms.
lems = {morph.parse(form)[0].normal_form for form in tqdm(all_lemms)}

In [None]:
# Lemmas produced from the documents that are neither in `lems` nor purely numeric.
my_lemms = set(dict_for_lemma_cirkle.values())
er = {
    lemma for lemma in tqdm(my_lemms)
    if not (lemma in lems or lemma.isdigit())
}
counter = len(er)

In [None]:
# Display the number of flagged lemmas (one per element added to `er`).
counter

In [None]:
def illustration(func):
    """
    Decorator that prints a running count of completed calls, every 10th call.

    Each time the wrapped function returns, a counter shared by all calls is
    incremented under a lock; when the counter is a multiple of 10 a one-line
    status message is rewritten on stdout (carriage return, no newline). The
    wrapped function's return value is passed through unchanged; a call that
    raises is not counted.
    """
    lock = ThreadLock()
    processed = ThreadValue('i', 0)

    @functools.wraps(func)
    def wrapper(*args, **argv):
        outcome = func(*args, **argv)
        lock.acquire()
        try:
            processed.value += 1
            if not processed.value % 10:
                print(f"\r{processed.value} objects are processed...",end ='',flush = True)
        finally:
            lock.release()
        return outcome
    return wrapper
mistakes = []
@illustration
def change_mistakes(string):
    """Spell-correct ``string`` with Yandex.Speller and record the result.

    On success the corrected text is stored in the global ``error`` dict under
    the original string and also returned. On any failure the string is
    appended to the global ``mistakes`` list and ``None`` is returned.

    Args:
        string: text to spell-check.

    Returns:
        The corrected text, or ``None`` when the request or its processing failed.
    """
    try:
        # timeout: without one a stalled connection blocks its worker thread forever.
        req = requests.post(
            'https://speller.yandex.net/services/spellservice.json/checkText',
            data = {'text':string},
            timeout = 30,
        )
        req.raise_for_status()
        new_string = string
        # Running difference between original and corrected lengths, used to
        # map the service's positions (given for the original text) onto
        # new_string. Assumes entries arrive ordered by 'pos' — TODO confirm.
        minus_len = 0
        for word in req.json():
            suggestions = word['s']
            if not suggestions:
                # Nothing to substitute. Indexing [0] would raise IndexError,
                # which sent such strings to `mistakes` on every attempt.
                continue
            start = word['pos'] - minus_len
            new_string = new_string[:start] + suggestions[0] + new_string[start + word['len']:]
            minus_len += word['len'] - len(suggestions[0])
        error[string] = new_string
        return new_string
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit propagate.
        print('ERROR!', repr(exc))
        mistakes.append(string)
        return None
# original string -> corrected string, filled in by change_mistakes.
error = {}
# Spell-check all flagged lemmas on 20 worker threads.
with ThreadPool(20) as workers:
    workers.map(change_mistakes, er)

In [None]:
# with open('error.pkl','wb') as f:
#     pickle.dump(error,f)

In [None]:
# Retry the strings whose request failed. change_mistakes appends to
# `mistakes` on failure, and pool.map consumes a list lazily, so iterate over
# a snapshot instead of the list that is being mutated by the workers.
with ThreadPool(20) as pool: 
    pool.map(change_mistakes, list(mistakes))

In [None]:
# Failed strings that still have no entry in `error`.
mistakes_n = {failed for failed in mistakes if failed not in error}

In [None]:
# Display the failed strings that have no entry in `error`.
mistakes_n

In [None]:
# Keep only the strings that the speller actually changed.
real_error = {
    original: corrected
    for original, corrected in error.items()
    if original != corrected
}
er_count = len(real_error)

In [None]:
# Documents that have no entry in doc_to_text; each one is printed and collected.
er_files = []
for name in file_name:
    if 'content/content/' + name in doc_to_text:
        continue
    print(name)
    er_files.append(name)

In [None]:
# Rebuild the vocabulary: every token of every parsed document with the
# for_sub patterns removed.
words = set()
for key in tqdm(doc_to_text.keys()):
    for token in doc_to_text[key].split():
        words.add(
            functools.reduce(lambda acc, pattern: re.sub(pattern, '', acc), for_sub, token)
        )

In [None]:
# Persist the vocabulary.
with open('words.pkl','wb') as out:
    pickle.dump(words,out)

In [None]:
# Recompute token -> lemma for the current vocabulary (existing entries are overwritten).
dict_for_lemma_cirkle.update(
    (token, morph.parse(token)[0].normal_form) for token in tqdm(words)
)

In [None]:
# Persist the token -> lemma table.
with open('lemms.pkl','wb') as out:
    pickle.dump(dict_for_lemma_cirkle,out)

In [None]:
# Lemmas that are not in `lems`, not purely numeric and not already known
# as a corrected misspelling.
my_lemms = set(dict_for_lemma_cirkle.values())
er = {
    lemma for lemma in tqdm(my_lemms)
    if not (lemma in lems or lemma.isdigit() or lemma in real_error)
}
counter = len(er)

In [None]:
# Flagged lemmas that have not been sent to the speller yet (no key in `error`).
new_er = er.difference(error)

In [None]:
# Spell-check the remaining lemmas one by one.
for i in tqdm(new_er):
    try:
        change_mistakes(i)
    except Exception:
        # `Exception` rather than a bare except: a bare except also swallows
        # KeyboardInterrupt, which made this long loop impossible to stop.
        print('Error!',i)

In [None]:
# Recompute the strings that the speller actually changed.
real_error = {
    original: corrected
    for original, corrected in error.items()
    if original != corrected
}
er_count = len(real_error)

In [None]:
# Persist the original -> corrected mapping.
with open('real_error.pkl','wb') as out:
    pickle.dump(real_error,out)

In [None]:
# Lemmatise every document (the earlier trial run covered only a slice).
all_docs = tqdm(file_name)
for ad in all_docs:
    test_lemma(ad)

In [None]:
# original string -> normal form (first pymorphy2 parse) of its corrected spelling.
lemma_error = {
    original: morph.parse(corrected)[0].normal_form
    for original, corrected in tqdm(real_error.items())
}

In [None]:
# Persist the original -> corrected-lemma mapping.
with open('lemma_error.pkl','wb') as out:
    pickle.dump(lemma_error,out)

In [None]:
# Preview the first 100 (original string, lemma of its correction) pairs.
list(lemma_error.items())[:100]

In [None]:
# Number of entries in lemma_error.
len(lemma_error.values())

In [None]:
final_text = {}
def final(ad1):
    """Apply the spelling-based lemma fixes to one lemmatised document.

    Args:
        ad1: document name, used as key in both ``doc_to_text_new`` and
            ``final_text``.

    Every token of ``doc_to_text_new[ad1]`` that has an entry in the global
    ``lemma_error`` is replaced by that entry; other tokens are kept as they
    are. The space-joined result is stored in the global ``final_text``.
    Any error (e.g. a document missing from ``doc_to_text_new``) is reported
    on stdout and the document is skipped.
    """
    try:
        # split() never yields empty tokens, so no extra filtering is needed.
        request = doc_to_text_new[ad1].split()
        final_text[ad1] = ' '.join(lemma_error.get(word, word) for word in request)
    except Exception as exc:
        # `Exception` (not a bare except) so KeyboardInterrupt still stops the loop.
        print(ad1, repr(exc))

In [None]:
# Build the final text for every document.
all_files = tqdm(file_name)
for file in all_files:
    final(file)

In [None]:
# Persist the final lemmatised documents.
with open('final_text.pkl','wb') as out:
    pickle.dump(final_text,out)

In [None]:
# Spot-check: the final text stored for the entry at index 13 of file_name.
final_text[file_name[13]]