In [1]:
import nltk
import artm
import re
from tqdm.notebook import tqdm
import string
from topicnet.cooking_machine.dataset import Dataset
from tokenizer import SpacyRulesRussianTokenizer
from lemmatizer import Pymorphy2Lemmatizer

import pandas as pd
from glob import glob

# Directory with the Habr articles, one .txt file per article.
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
HABR_DATA_PATH = '/home/sultan/datasets/habr/'
# glob() returns paths in arbitrary (filesystem) order; sorting keeps the row order
# of everything derived from `files` reproducible between runs and machines.
files = sorted(glob(HABR_DATA_PATH+'*.txt'))

### Избавляемся от лишних символов и оставляем только raw_text

In [2]:
# Russian alphabet (33 letters), every lowercase letter followed by its uppercase form.
cyrilic_letters = (
    "аАбБвВгГдДеЕёЁжЖзЗиИйЙ"
    "кКлЛмМнНоОпПрРсСтТуУфФ"
    "хХцЦчЧшШщЩъЪыЫьЬэЭюЮяЯ"
)

In [3]:
# Characters kept in the cleaned text: Latin letters, Cyrillic letters and the space.
right_symbols = string.ascii_letters + cyrilic_letters + ' '
# Set copy of the alphabet for O(1) membership tests in the per-character filter.
allowed_chars = set(right_symbols)

data = []
for path in tqdm(files):
    entry = {}
    # Document id = file name up to its first dot.
    entry['id'] = path.split('/')[-1].split('.')[0]
    # Explicit encoding so the Cyrillic text is decoded the same way regardless of the
    # platform default (assumes the corpus is stored as UTF-8 - TODO confirm).
    with open(path, 'r', encoding='utf-8') as f:
        # Skip the first two lines of every file; the None default keeps files with
        # fewer than two lines from raising StopIteration.
        next(f, None)
        next(f, None)
        text = f.read().replace("\n", ' ')
    # Drop every character outside the allowed alphabet.
    # NOTE(review): dropped characters are removed, not replaced by a space, so words
    # separated only by punctuation or digits end up glued together - confirm intended.
    text = ''.join(char for char in text if char in allowed_chars)
    # Collapse runs of spaces and strip the ends.
    entry['raw_text'] = ' '.join(text.split())
    data.append(entry)

# Explicit columns so the frame has the expected schema even when `files` is empty.
habr_texts = pd.DataFrame(data, columns=['id', 'raw_text'])
print(habr_texts.columns)

HBox(children=(IntProgress(value=0, max=10706), HTML(value='')))


Index(['id', 'raw_text'], dtype='object')


### Делаем токенизацию и лемматизацию

In [4]:
# Build both text processors once and reuse them for every document.
lemmatizer = Pymorphy2Lemmatizer()
tokenizer = SpacyRulesRussianTokenizer()

# For each cleaned text: tokenizer first, then the lemmatizer on the tokenizer's output.
# Presumably the result is a sequence of lemma strings per document - later cells join
# it with spaces; verify against the lemmatizer module.
lemmatized_text = [
    lemmatizer.transform_string(tokenizer.transform_element(raw_text))
    for raw_text in tqdm(habr_texts['raw_text'].values)
]
habr_texts['lemmatized'] = lemmatized_text

HBox(children=(IntProgress(value=0, max=10706), HTML(value='')))




### Переводим лемматизированный текст в формат Vowpal Wabbit

In [5]:
# Build one Vowpal Wabbit line per document: "<doc_id> |@word <lemma> <lemma> ...".
vw_text = []
# Walk the two needed columns in parallel instead of materialising a Series per row.
doc_columns = zip(habr_texts['id'], habr_texts['lemmatized'])
# `total` is passed explicitly: the iterator has no len(), and without it tqdm can
# only show a counter instead of a real progress bar.
for doc_id, lemmas in tqdm(doc_columns, total=habr_texts.shape[0]):
    lemmatized = '@word ' + ' '.join(lemmas)
    vw_text.append(' |'.join([doc_id, lemmatized]))
habr_texts['vw_text'] = vw_text

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




### Сохраняем в формате csv

In [6]:
# Intermediate dump of the unfiltered collection; a later cell loads this file
# into a Dataset to build the token dictionary.
DATA_PATH_SUB = 'DataPreparation.csv'
export_columns = ['id', 'raw_text', 'vw_text']
habr_texts[export_columns].to_csv(DATA_PATH_SUB)

### Получаем словарь с помощью get_dictionary для последующей "очистки" текста от слишком частотных и слишком редких слов

In [7]:
# Load the intermediate CSV (DATA_PATH_SUB) into a topicnet Dataset.
dataset = Dataset(DATA_PATH_SUB)
# Token dictionary of the collection; its text dump lists token_tf / token_df per token.
dictionary = dataset.get_dictionary()



In [8]:
# Dump the dictionary as plain text so it can be inspected and loaded with pandas.
dictionary.save_text("dict.txt")
# Peek at the file: a one-line "name: ... num_items: ..." preamble, then a
# comma-separated header and one row per token.
! head dict.txt

name: afb2b286-393a-4fce-b247-af15f442eb02 num_items: 10706
token, class_id, token_value, token_tf, token_df
bittorrentвложение, @word, 4.954623022968008e-07, 1.0, 1.0
myfabrik, @word, 4.954623022968008e-07, 1.0, 1.0
effortless, @word, 4.954623022968008e-07, 1.0, 1.0
zapr, @word, 9.909246045936015e-07, 2.0, 1.0
родственик, @word, 4.954623022968008e-07, 1.0, 1.0
завяк, @word, 4.954623022968008e-07, 1.0, 1.0
полухакерский, @word, 4.954623022968008e-07, 1.0, 1.0
гастарбайтер, @word, 4.954623022968008e-07, 1.0, 1.0


In [9]:
# The first line of dict.txt is a "name: ... num_items: ..." preamble, not CSV - skip it.
# keep_default_na=False: tokens are ordinary words, and words such as "null", "nan" or
# "NA" would otherwise be parsed as missing values (float NaN) instead of strings and
# could then never be matched against lemmas.
# NOTE: fields are separated by ", ", so every column name except "token" keeps a
# leading space (" class_id", " token_tf", ...), and so do the class_id values.
df = pd.read_csv("dict.txt", skiprows=[0], keep_default_na=False)
df.shape

(83880, 5)

In [10]:
# Show the five @word tokens with the largest token_df values.
# Column names and class labels carry the leading space left by the ", " separator.
is_word = df[" class_id"] == ' @word'
df.loc[is_word].sort_values(by=" token_df").tail()

Unnamed: 0,token,class_id,token_value,token_tf,token_df
37696,что,@word,0.011774,23764.0,6773.0
30446,с,@word,0.011891,23999.0,7106.0
23653,на,@word,0.017903,36133.0,8301.0
34732,и,@word,0.028514,57551.0,8846.0
72022,в,@word,0.03366,67937.0,9184.0


### Выбираем слова, от которых мы хотим избавиться, и обновляем Vowpal Wabbit

In [11]:
# Cut-offs on token_tf: tokens above the upper bound or below the lower bound are dropped.
MAX_TOKEN_TF = 234
MIN_TOKEN_TF = 3

token_tf = df[" token_tf"]
# Vectorised selection instead of walking the frame row by row.
too_frequent = df.loc[token_tf > MAX_TOKEN_TF, "token"]
too_rare = df.loc[token_tf < MIN_TOKEN_TF, "token"]

# Frequent tokens first, then rare ones, each group in dictionary-file order.
bad_words = too_frequent.tolist() + too_rare.tolist()
print(len(bad_words))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


57042


In [12]:
# Rebuild the Vowpal Wabbit lines, dropping every lemma listed in bad_words.
# Membership is tested against a set: bad_words is a list with tens of thousands of
# entries, and scanning it for every token of every document is needlessly slow.
bad_word_set = set(bad_words)

vw_text = []
# Walk the two needed columns in parallel instead of materialising a Series per row.
doc_columns = zip(habr_texts['id'], habr_texts['lemmatized'])
for doc_id, lemmas in tqdm(doc_columns, total=habr_texts.shape[0]):
    good_words = [word for word in lemmas if word not in bad_word_set]
    lemmatized = '@word ' + ' '.join(good_words)
    vw_text.append(' |'.join([doc_id, lemmatized]))
# Overwrites the unfiltered 'vw_text' column created earlier in the notebook.
habr_texts['vw_text'] = vw_text

HBox(children=(IntProgress(value=0, max=10706), HTML(value='')))




### Записываем в файл

In [13]:
# Final dump: document id, cleaned text and the filtered Vowpal Wabbit line.
DATA_PATH = 'ReadyData.csv'
final_columns = ['id', 'raw_text', 'vw_text']
habr_texts[final_columns].to_csv(DATA_PATH)