In [1]:
import nltk
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.neighbors import KernelDensity
from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from nltk.collocations import *

In [3]:
# Load the Russian texts workbook into a DataFrame.
# NOTE(review): the same workbook is read into `df_text` again a few cells below;
# one of the two reads is probably redundant.
df_text = pd.read_excel('ru_texts.xlsx')

In [1]:
def tidy(line):
    """Reduce a raw string to lower-case Cyrillic text.

    Steps:
      1. lower-case the input;
      2. replace every ``<...>`` tag with a single space;
      3. drop every character that is not in the Cyrillic block
         (U+0400-U+04FF), whitespace, or one of ``, ! ?``.

    Full stops are removed as well: the original implementation first kept
    them and then deleted every one of them in a separate pass, which is the
    same as never whitelisting them.  Digits, Latin letters, quotes, dashes,
    brackets, colons, semicolons and asterisks all fall outside the whitelist,
    so no separate passes are needed for them.

    Args:
        line: the raw text (``str``).

    Returns:
        The cleaned text with leading/trailing whitespace stripped; an empty
        string when nothing survives the clean-up.
    """
    line = line.lower()
    # A tag becomes a space (not '') so the words around it stay separated.
    line = re.sub(r'<(.*?)>', ' ', line)
    # Single whitelist pass; inner runs of whitespace are left untouched.
    line = re.sub(r'[^\u0400-\u04FF\s,!?]', '', line)
    return line.strip()

In [7]:
# Read the whole anecdote file into one string.
with open('pure_anekdots.txt', mode='r', encoding='utf-8') as f:
    anekdoty = f.read()

# Load the workbook and pull its 'text' column out as a plain Python list.
df_text = pd.read_excel(io='ru_texts.xlsx')
literatura = df_text['text'].tolist()

In [9]:
# NOTE(review): Segmenter/NewsEmbedding/... are imported, and
# prepare_russian_text_without_file is defined, in cells that appear further
# down in this notebook, so this cell fails on a fresh "Restart & Run All".

# Split the anecdote file into blank-line-separated blocks, skipping empty ones.
# The file is opened in a `with` block so the handle is closed after reading.
with open('pure_anekdots.txt', 'r', encoding='utf-8') as f:
    anekdoty_blocks = [i for i in f.read().split('\n\n') if i != '']

# Build the natasha pipeline once and reuse it for every block.
segmenter = Segmenter()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
ner_tagger = NewsNERTagger(emb)
morph_vocab = MorphVocab()

# Write each processed block followed by three newlines.
with open('anekdots_trunc.txt', 'w', encoding='utf-8') as f:
    for block in anekdoty_blocks:
        processed_block = prepare_russian_text_without_file(block, segmenter, emb, morph_tagger, ner_tagger, morph_vocab)
        f.write(processed_block + '\n\n\n')

# with open('pure_literature.txt', 'w', encoding='utf-8') as f:
#     for text in literatura:
#         f.write(str(text) + '\n')

In [2]:
from natasha import (
    Segmenter, MorphVocab,
    NewsNERTagger,
    NewsEmbedding,
    NewsMorphTagger,
    Doc
)
import re
import glob
import tqdm

In [8]:
def prepare_russian_text_without_file(raw_text, segmenter, emb, morph_tagger, ner_tagger, morph_vocab):
    """Turn a raw Russian string into a space-separated stream of lemmas and labels.

    Processing steps:
      1. every run of digits in ``raw_text`` is replaced by ``'0'``;
      2. the text is segmented and NER-tagged, and each entity span is replaced
         in the text by the label registered for its type in ``label_dict``;
      3. the rewritten text is segmented and morph-tagged again;
      4. for every token: if its POS is a key of ``label_dict`` the label is
         emitted, punctuation is skipped, and any other token is emitted as its
         lower-cased lemma (or its lower-cased surface form if lemmatisation
         raises).

    Args:
        raw_text: the text to process.
        segmenter: object passed to ``Doc.segment``.
        emb: not used inside this function; kept so existing callers work.
        morph_tagger: object passed to ``Doc.tag_morph``.
        ner_tagger: object passed to ``Doc.tag_ner``.
        morph_vocab: object passed to ``token.lemmatize``.

    Returns:
        The prepared text; every emitted item is followed by one space.
    """
    # Predefined labels carry their own trailing space because the output loop
    # emits them verbatim.
    label_dict = {'NUM': '0 ', 'PRON': '1 ', 'PER': '2 ', 'LOC': '3 ', 'ORG': '4 '}
    next_label_num = 5

    raw_text = re.sub(r'\d+', '0', raw_text)

    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)

    # Walk the spans back to front so earlier offsets stay valid while the
    # text is being rewritten.
    for span in reversed(doc.ner.spans):
        if span.type not in label_dict:
            # NOTE(review): labels created here have no trailing space, unlike
            # the predefined ones - confirm whether that is intended.
            label_dict[span.type] = str(next_label_num)
            next_label_num += 1
        raw_text = "".join((raw_text[:span.start], label_dict[span.type], raw_text[span.stop:]))

    # NOTE(review): the entity placeholders inserted above are digits; if the
    # morph tagger tags them as NUM they will be emitted below as the NUM
    # label rather than the entity label - confirm against real output.
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    # Collect the pieces in a list and join once instead of rebuilding the
    # whole output string for every token.
    pieces = []
    for token in doc.tokens:
        if token.pos in label_dict:
            pieces.append(label_dict[token.pos])
        elif token.pos != 'PUNCT':
            try:
                token.lemmatize(morph_vocab)
                pieces.append(token.lemma.lower() + ' ')
            except Exception:
                # Best effort: fall back to the surface form.
                pieces.append(token.text.lower() + ' ')
    return ''.join(pieces)

In [12]:
def prepare_russian_text(input_file, output_file):
    """Lemmatise a Russian text file and write the result to another file.

    The whole of ``input_file`` is read (its lines joined with a space), then:
      1. every run of digits is replaced by ``'0'``;
      2. the text is segmented and NER-tagged, and each entity span is replaced
         in the text by the label registered for its type in ``label_dict``;
      3. the rewritten text is segmented and morph-tagged again;
      4. for every token: a run of consecutive non-digit NUM tokens is emitted
         as a single ``'0'``; a token whose POS is a key of ``label_dict`` is
         emitted as that label; punctuation is skipped; any other token is
         emitted as its lower-cased lemma (or lower-cased surface form if
         lemmatisation raises).

    Every emitted item is followed by one space.  The natasha components are
    constructed anew on every call.

    Args:
        input_file: path of the UTF-8 text file to read.
        output_file: path of the UTF-8 text file to (over)write.

    Returns:
        None; the prepared text is written to ``output_file``.
    """
    segmenter = Segmenter()
    emb = NewsEmbedding()
    morph_tagger = NewsMorphTagger(emb)
    ner_tagger = NewsNERTagger(emb)
    morph_vocab = MorphVocab()

    label_dict = {'NUM': '0', 'PRON': '1', 'PER': '2', 'LOC': '3', 'ORG': '4'}
    next_label_num = 5

    with open(input_file, encoding="utf-8") as fin:
        # Lines keep their own '\n'; an extra space is put between them.
        raw_text = ' '.join(fin)

    raw_text = re.sub(r'\d+', '0', raw_text)

    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_ner(ner_tagger)

    # Walk the spans back to front so earlier offsets stay valid while the
    # text is being rewritten.
    for span in reversed(doc.ner.spans):
        if span.type not in label_dict:
            label_dict[span.type] = str(next_label_num)
            next_label_num += 1
        raw_text = "".join((raw_text[:span.start], label_dict[span.type], raw_text[span.stop:]))

    # NOTE(review): the entity placeholders inserted above are digits; if the
    # morph tagger tags them as NUM they will be emitted below as '0' rather
    # than as the entity label - confirm against real output.
    doc = Doc(raw_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    # Collect the pieces in a list and join once at the end.
    pieces = []
    prev_num = False
    for token in doc.tokens:
        if token.pos == 'NUM' and not token.text.isdigit():
            # Collapse a run of spelled-out numerals into one '0'.
            if not prev_num:
                pieces.append('0 ')
                prev_num = True
            continue

        prev_num = False

        if token.pos in label_dict:
            pieces.append(label_dict[token.pos] + ' ')
        elif token.pos != 'PUNCT':
            try:
                token.lemmatize(morph_vocab)
                pieces.append(token.lemma.lower() + ' ')
            except Exception:
                # Best effort: fall back to the surface form.
                pieces.append(token.text.lower() + ' ')

    with open(output_file, 'w', encoding="utf-8") as fout:
        fout.write(''.join(pieces))

In [13]:
# Lemmatise the anecdote corpus into 'lemma_anekdots_truncated.txt'.
# The equivalent run for the literature corpus is kept here but disabled:
# prepare_russian_text('pure_literature.txt', 'lemma_literature.txt')
prepare_russian_text('pure_anekdots.txt', 'lemma_anekdots_truncated.txt')

In [9]:
# Split the anecdote file into blank-line-separated jokes, skipping empty ones.
# The file is opened in a `with` block so the handle is closed after reading.
with open('pure_anekdots.txt', 'r', encoding='utf-8') as f:
    anekdots_splitted = [i for i in f.read().split('\n\n') if i != '']

In [10]:
# Preview the first six jokes.
print(anekdots_splitted[:6])

['- Как водичка ?\n- А я здесь как женшина сижу, а не как термометр.', '- Я затрудняюсь поставить вам диагноз ... Наверное, это алкоголизм.\n- Хорошо, доктор. Я приду, когда Вы будете трезвым.', '- Что такое дефицит в маркистском понимании?\n- Это объективная реальность, не данная нам в ощущениях.\n- Это вы в идеалистическом понимании, а в практическом?\n- Объективная реальность данная в ощущениях, но не нам.', '- Можно у вас срочно отремонтировать часы?\n- Нет\n- Что такое\n- Нет\n- А что здесь делают?\n- Здесь делают обрезание\n- Тогда какого черта вы повесили циферблат над входом?\n- А что б вы хотели, чтоб мы там повесили?', '- Из-за тебя я проиграл уйму денег!\n- Почему ты не заговорил?\n- Чудак!\n- Ты только представь, сколько денег мы загребем завтра.', '- Входите,\n- Через 15 минут вам на операцию. А пока отдыхайте.\n- Сестра!\n- Не обьясните мне, почему вы стучали в дверь перед тем, как войти???']


In [3]:
# Lemmatise every joke corpus with the same pipeline.
# One (input path, output path) pair per file; add new corpora here instead of
# copy-pasting another call.
RU_JOKE_FILES = [
    ('all_jokes_files/TEST_JOKES.txt', 'lemma_test.txt'),
    ('all_jokes_files/dataset.txt', 'lemma_jokes_big1.txt'),
    ('all_jokes_files/extract_anekdots.txt', 'lemma_jokes_big2.txt'),
    ('all_jokes_files/jokes.txt', 'lemma_jokes_big3.txt'),
    ('all_jokes_files/jokes_2.txt', 'lemma_jokes_big4.txt'),
]

for input_path, output_path in RU_JOKE_FILES:
    prepare_russian_text(input_path, output_path)

In [1]:
# Download the large English spaCy model through the `spacy` command-line entry point.
# NOTE(review): a later cell downloads the same model again with `python -m spacy`;
# one of the two is probably redundant.
!spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
!pip install spacy
!python -m spacy download en_core_web_lg

Collecting spacy
  Obtaining dependency information for spacy from https://files.pythonhosted.org/packages/53/29/d4ba96e8c3032f799f778a83356c4956dc5b99cd72d1300704d71e129879/spacy-3.8.5-cp311-cp311-win_amd64.whl.metadata
  Downloading spacy-3.8.5-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Obtaining dependency information for spacy-legacy<3.1.0,>=3.0.11 from https://files.pythonhosted.org/packages/c3/55/12e842c70ff8828e34e543a2c7176dac4da006ca6901c9e8b43efab8bc6b/spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Obtaining dependency information for spacy-loggers<2.0.0,>=1.0.0 from https://files.pythonhosted.org/packages/33/78/d1a1a026ef3af911159398c939b1509d5c36fe524c7b644f34a5146c4e16/spacy_loggers-1.0.5-py3-none-any.whl.metadata
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
     ---------------------------------------- 0.0/400.7 MB ? eta -:--:--
     -------------------------------------- 0.0/400.7 MB 330.3 kB/s eta 0:20:13
     -------------------------------------- 0.0/400.7 MB 487.6 kB/s eta 0:13:42
     ---------------------------------------- 0.2/400.7 MB 1.1 MB/s eta 0:05:50
     ---------------------------------------- 0.4/400.7 MB 2.4 MB/s eta 0:02:49
     ---------------------------------------- 0.7/400.7 MB 3.4 MB/s eta 0:01:59
     ---------------------------------------- 1.7/400.7 MB 6.3 MB/s eta 0:01:04
     --------------------------------------- 4.4/400.7 MB 13.9 MB/s eta 0:00:29
      -------------------------------------- 7.0/400.7 MB 19.5 MB/s eta 0:00:21
      -------------------------------------- 7.6/400.7 MB 19.3 MB/s eta 0:00:21
      ------------------------

In [2]:
import re
import gensim
import glob
import spacy
# NOTE(review): the pipeline loaded here is discarded; a later cell calls
# spacy.load("en_core_web_lg") again and keeps the result as `nlp`.
spacy.load('en_core_web_lg')
import tqdm

In [3]:
def decontracted(phrase):
    """Expand English contractions and normalise a phrase into sentences.

    Examples of the expansion: ``won't -> will not``, ``can't -> can not``,
    ``they're -> they are``.  The apostrophe may be any of ``' ’ ‘ ` ``.

    After expansion every character that is not a word character or one of
    ``. ? ! ;`` becomes a space, runs of spaces are collapsed, the text is
    split on ``. ; ! ?`` (delimiters are kept as separate pieces) and every
    piece is passed through ``str.capitalize`` before the pieces are joined
    with single spaces.

    Args:
        phrase: the raw English text.

    Returns:
        The normalised text, e.g. ``"I can't go."`` -> ``'I can not go . '``.
    """
    # specific
    # Matched case-insensitively: otherwise a sentence-initial "Won't"/"Can't"
    # misses these rules and the general n't rule turns it into "Wo not"/"Ca not".
    # The replacement case does not matter because every sentence is
    # re-capitalised below.
    phrase = re.sub(r"won['’‘`]t", "will not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can['’‘`]t", "can not", phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"ain['’‘`]t", "am not", phrase, flags=re.IGNORECASE)

    # general
    phrase = re.sub(r"n['’‘`]t", " not", phrase)
    phrase = re.sub(r"['’‘`]re", " are", phrase)
    # Note: this also rewrites possessive 's as " is".
    phrase = re.sub(r"['’‘`]s", " is", phrase)
    phrase = re.sub(r"['’‘`]d", " would", phrase)
    phrase = re.sub(r"['’‘`]ll", " will", phrase)
    phrase = re.sub(r"['’‘`]t", " not", phrase)
    phrase = re.sub(r"['’‘`]ve", " have", phrase)
    phrase = re.sub(r"['’‘`]m", " am", phrase)

    # Keep only word characters and sentence delimiters, then squeeze spaces.
    phrase = re.sub(r'[^\w.?!;]', ' ', phrase)
    phrase = re.sub(' +', ' ', phrase)
    # The capturing group keeps each delimiter (with trailing spaces) as its own piece.
    sentences = re.split('([.;!?] *)', phrase)

    return ' '.join([i.capitalize() for i in sentences])

In [4]:
def prepare_english_text(nlp, input_file, output_file):
    '''
    Preprocess an English text file line by line and write the result.

    nlp - callable that turns a string into an iterable of tokens exposing
    ``pos_`` and ``lemma_`` (the notebook passes a loaded spaCy pipeline)

    input_file - path to the EXISTING txt file. It should contain
    raw text in English

    output_file - path to the txt file where the preprocessed text will be
    written (overwritten if it exists)

    Each input line is stripped, passed through ``decontracted`` and then
    through ``nlp``.  For every token, followed by one space:
      * POS in ``pos_dict`` -> the placeholder from ``pos_dict``;
      * otherwise a lemma made only of digits -> 'ordinal1';
      * otherwise, unless the token is punctuation -> the lower-cased lemma.
    Every input line produces exactly one output line.
    '''
    pos_dict = {'PROPN': 'person1', 'PRON': 'pron1', 'NUM': 'ordinal1'}
    # Both files are managed by `with` so the input handle is closed as well
    # (it was previously opened and never closed).
    with open(input_file, 'r', encoding='utf-8') as fin, \
            open(output_file, 'w', encoding='utf-8') as prepared_text:
        for line in fin:
            preprocessed_text = decontracted(line.strip())
            nlp_doc = nlp(preprocessed_text)
            for token in nlp_doc:
                if token.pos_ in pos_dict:
                    prepared_text.write(pos_dict[token.pos_])
                    prepared_text.write(' ')
                elif token.lemma_.isdigit():
                    prepared_text.write('ordinal1')
                    prepared_text.write(' ')
                elif token.pos_ != 'PUNCT':
                    prepared_text.write(token.lemma_.lower())
                    prepared_text.write(' ')
            prepared_text.write('\n')

In [5]:
# Load the large English pipeline once and reuse it for both corpora.
nlp = spacy.load("en_core_web_lg")
# Raise the pipeline's `max_length` setting (see spaCy's Language.max_length).
nlp.max_length = 5000000

# (raw input, lemmatised output) pairs, processed in this order.
for raw_path, lemma_path in (
    ('pure_eng_jokes.txt', 'lemma_eng_jokes.txt'),
    ('eng_literature.txt', 'lemma_eng_liter.txt'),
):
    prepare_english_text(nlp, raw_path, lemma_path)