## Задание

1. Мы будем работать с (частичными) данными lenta.ru отсюда: https://www.kaggle.com/yutkin/corpus-of-russian-news-articles-from-lenta/
2. Проведите препроцессинг текста. Разбейте данные на train и test для задачи классификации (в качестве метки класса будем использовать поле topic). В качестве данных для классификации в пунктах 3 и 5 возьмите:
    - только заголовки (title)
    - только тексты новости (text)
    - и то, и другое
3. Обучите fastText для классификации текстов по темам. Сравните качество для разных данных из п. 2.
4. Обучите свою модель w2v (или возьмите любую подходящую предобученную модель). Реализуйте функцию для вычисления вектора текста / заголовка / текста+заголовка как среднего вектора входящих в него слов. 
     - (Бонус) Модифицируйте функцию вычисления среднего вектора: взвешивайте векторы слов соответствующими весами tf-idf.
5. Обучите на полученных средних векторах алгоритм классификации, сравните полученное качество с классификатором fastText. 

In [1]:
# !kaggle datasets download -d yutkin/corpus-of-russian-news-articles-from-lenta
# !unzip data/corpus-of-russian-news-articles-from-lenta.zip -d data/

In [30]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the raw CSV and every artefact written by later cells.
data_path = 'data/'

# Load only the columns used for classification, drop rows without a topic
# (the class label) and shuffle the rows with a fixed seed.
lenta = (
    pd.read_csv(data_path + 'lenta-ru-news.csv', usecols=['title', 'text', 'topic'])
    .dropna(subset=['topic'])
    .sample(frac=1, random_state=42)
)

  lenta = pd.read_csv(data_path + 'lenta-ru-news.csv', usecols=['title', 'text', 'topic'])


In [3]:
# (rows, columns) of the frame after rows without a topic were dropped.
lenta.shape

(738973, 3)

In [5]:
# Map every topic name to an integer id, numbered in order of first appearance.
label_dict = {topic: idx for idx, topic in enumerate(lenta['topic'].unique())}

# Integer class label used by the classifiers below.
lenta['label'] = lenta['topic'].map(label_dict)

label_dict

{'Мир': 0,
 'Наука и техника': 1,
 'Культура': 2,
 'Силовые структуры': 3,
 'Россия': 4,
 'Спорт': 5,
 'Бизнес': 6,
 'Путешествия': 7,
 'Бывший СССР': 8,
 'Дом': 9,
 'Экономика': 10,
 'Интернет и СМИ': 11,
 'Из жизни': 12,
 'Ценности': 13,
 'Культпросвет ': 14,
 '69-я параллель': 15,
 'Крым': 16,
 'Библиотека': 17,
 'Легпром': 18,
 'Оружие': 19,
 'МедНовости': 20,
 'ЧМ-2014': 21,
 'Сочи': 22}

In [6]:
from nltk import tokenize

# Shared word tokenizer; preprocess() calls tokenizer.tokenize() on the lower-cased text.
tokenizer = tokenize.NLTKWordTokenizer()

In [7]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from string import punctuation

# Tokens that preprocess() drops: Russian stop words plus single ASCII punctuation characters.
noise = stopwords.words('russian') + list(punctuation)

# Substrings that preprocess() replaces with a space before tokenization.
# The last entry used to be written as '\`', which is not a valid escape sequence:
# Python kept the backslash, so the entry was the two characters backslash + backtick
# and a lone backtick was never replaced. It is now a plain backtick.
splitters = ["''", '``', '"', '-', "'", '`']

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dalabaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer()

def morphling_lemmatizer(word):
    """Return the normal form of the first parse that ``morph`` produces for ``word``."""
    return morph.parse(word)[0].normal_form

morphling_lemmatizer('деревьев')

'дерево'

In [9]:
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer("russian")

def ru_stemmer(word):
    """Return the stem that the shared Russian Snowball stemmer gives for ``word``."""
    stem = snowball_stemmer.stem(word)
    return stem

ru_stemmer('задержание')

'задержан'

In [10]:
def preprocess(sentence):
    """Normalize one text into a space-separated string of lemmas.

    Steps: replace every substring listed in ``splitters`` with a space,
    lower-case, tokenize with ``tokenizer``, drop tokens found in ``noise``,
    and lemmatize the remaining tokens with ``morphling_lemmatizer``.

    Args:
        sentence: the text to process (a ``str``).

    Returns:
        The lemmas joined by single spaces.
    """
    cleaned = sentence
    for separator in splitters:
        cleaned = cleaned.replace(separator, ' ')

    return ' '.join(
        morphling_lemmatizer(token.strip())
        for token in tokenizer.tokenize(cleaned.lower())
        if token not in noise
    )

preprocess('как открыть карты')

'открыть карта'

In [26]:
def preprocess_and_save():
    """Preprocess the ``title`` and ``text`` columns of ``lenta`` and save them as CSV.

    Every cell is passed through ``preprocess``; the result, together with the
    ``label`` column, is written to ``data_path + 'preprocessed_lenta.csv'``.
    Missing cells (NaN) are treated as empty text: converting them with
    ``str()`` would otherwise inject the literal token "nan" into the corpus.
    """
    def _clean_column(column, desc):
        # Preprocess one column of `lenta`, showing a progress bar labelled `desc`.
        return [
            preprocess('' if pd.isna(value) else str(value))
            for value in tqdm(lenta[column], desc=desc)
        ]

    preprocessed_lenta = pd.DataFrame({
        'title': _clean_column('title', 'Preprocessing titles ...'),
        'text': _clean_column('text', 'Preprocessing text ...'),
        'label': lenta['label'],
    })

    preprocessed_lenta.to_csv(data_path + 'preprocessed_lenta.csv')

In [28]:
def get_test_and_save_train(X_df, y_df, train_path):
    """Split texts and labels 75/25 and write the train part in fastText format.

    Every text is turned into a single-line string *before* the split, so the
    returned test texts get the same cleaning as the lines written to the
    train file. Texts that became empty during preprocessing are read back
    from CSV by pandas as NaN; they are mapped to '' instead of being passed
    on as floats (test part) or written as the word "nan" (train part).
    The split itself is unchanged: same sizes, seed and shuffling.

    Args:
        X_df: pandas Series with the texts.
        y_df: pandas Series with the labels, aligned with ``X_df``.
        train_path: file name (without extension) of the train file; it is
            written to ``data_path + train_path + '.txt'``, one
            ``__label__<label> <text>`` line per train sample.

    Returns:
        ``(X_test, y_test)``: lists with the held-out texts and labels.
    """
    def _as_single_line(entry):
        # One sample per line is written, so line breaks inside a text must go.
        if pd.isna(entry):
            return ''
        return str(entry).replace('\n', ' ').replace('\r', ' ')

    X = [_as_single_line(entry) for entry in X_df.tolist()]
    y = y_df.tolist()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

    with open(data_path + train_path + '.txt', 'w', encoding='utf-8') as file:
        file.writelines(
            f'__label__{y_entry} {X_entry}\n'
            for X_entry, y_entry in zip(X_train, y_train)
        )

    return X_test, y_test

# Reload the cached preprocessing result instead of recomputing it.
preprocessed_lenta = pd.read_csv(data_path + 'preprocessed_lenta.csv')

# Only the article body ('text' column) is used as classifier input here.
X_test, y_test = get_test_and_save_train(
    X_df=preprocessed_lenta['text'],
    y_df=preprocessed_lenta['label'],
    train_path='lenta_train',
)

In [29]:
# ! git clone https://github.com/facebookresearch/fastText.git
# ! pip3 install fastText/.

In [16]:
import fasttext

# Train a supervised fastText classifier on the file written by
# get_test_and_save_train (one '__label__<id> <text>' line per sample).
# NOTE(review): verbose=3000 is an unusually large value — confirm it is intended.
ft_model = fasttext.train_supervised(
    input=data_path+'lenta_train.txt',
    label='__label__',
    lr=0.5,
    epoch=25,
    wordNgrams=2, 
    dim=200,
    thread=2,
    verbose=3000
)

# Persist the trained model; a later cell reloads it from this path.
ft_model.save_model(data_path+'lenta_model.bin')

Read 79M words
Number of words:  835630
Number of labels: 22
Progress:   0.6% words/sec/thread:  885059 lr:  0.496773 avg.loss:  1.102168 ETA:   0h18m29s

Progress: 100.0% words/sec/thread: 1099731 lr:  0.000000 avg.loss:  0.076652 ETA:   0h 0m 0s  6.8% words/sec/thread:  987053 lr:  0.465807 avg.loss:  0.536746 ETA:   0h15m33s  6.9% words/sec/thread:  988233 lr:  0.465365 avg.loss:  0.534122 ETA:   0h15m31s  7.3% words/sec/thread:  993596 lr:  0.463265 avg.loss:  0.522071 ETA:   0h15m21s


In [17]:
# Spot check on a single word; [0] keeps only the predicted label tuple.
ft_model.predict('армия')[0]

('__label__1',)

In [35]:
def predict(test_df, test_label, selected_model, desc='Unknown parameters model'):
    """Predict labels for the test texts with ``selected_model`` and print quality metrics.

    Args:
        test_df: the test texts, passed as-is to ``selected_model.predict``.
        test_label: the true integer labels, in the same order as ``test_df``.
        selected_model: model whose ``predict`` returns, as its first element,
            one sequence of '__label__<id>' strings per text.
        desc: caption printed above the metrics.

    Prints accuracy, weighted precision / recall / F1 and the full
    classification report. Returns nothing.
    """
    prefix_len = len('__label__')  # the label prefix is cut off to recover the integer id
    predicted_labels = [
        int(labels[0][prefix_len:])
        for labels in selected_model.predict(test_df)[0]
    ]

    weighted_metrics = (
        ("Precision score: ", precision_score),
        ("Recall score: ", recall_score),
        ("f1-score: ", f1_score),
    )

    print(desc, ' => predictions')
    print("Accuracy score: ", accuracy_score(test_label, predicted_labels))
    for caption, metric in weighted_metrics:
        print(caption, metric(test_label, predicted_labels, average='weighted'))

    print("\nClassification Report:\n", classification_report(test_label, predicted_labels))

In [36]:
import fasttext
# Reload the model that the training cell saved to lenta_model.bin.
saved_model = fasttext.load_model(data_path+'lenta_model.bin')

In [38]:
# NOTE(review): X_test is built from preprocessed_lenta['text'] only, while desc says
# "title and text" — confirm the caption matches the data actually being evaluated.
predict(X_test, y_test, saved_model, desc='Lemma on title and text columns, fastText')

Lemma on title and text columns, fastText  predictions
Accuracy score:  0.86475880136838
Precision score:  0.8641386185876371
Recall score:  0.86475880136838
f1-score:  0.8641073965617476

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.87      0.85     33962
           1       0.88      0.89      0.89     13271
           2       0.90      0.90      0.90     13507
           3       0.80      0.71      0.75      4876
           4       0.85      0.86      0.86     40259
           5       0.97      0.97      0.97     16134
           6       0.74      0.64      0.69      1816
           7       0.85      0.78      0.81      1577
           8       0.88      0.88      0.88     13191
           9       0.91      0.88      0.90      5446
          10       0.89      0.89      0.89     20004
          11       0.82      0.79      0.81     11248
          12       0.72      0.68      0.70      6855
          13       0.93      0