## Задание

1. Мы будем работать с (частичными) данными lenta.ru отсюда: https://www.kaggle.com/yutkin/corpus-of-russian-news-articles-from-lenta/
2. Проведите препроцессинг текста. Разбейте данные на train и test для задачи классификации (в качестве метки класса будем использовать поле topic). В качестве данных для классификации в пунктах 3 и 5 возьмите
    - только заголовки (title)
    - только тексты новости (text)
    - и то, и другое
3. Обучите fastText для классификации текстов по темам. Сравните качество для разных данных из п. 2.
4. Обучите свою модель w2v (или возьмите любую подходящую предобученную модель). Реализуйте функцию для вычисления вектора текста / заголовка / текста+заголовка как среднего вектора входящих в него слов. 
     - (Бонус) Модифицируйте функцию вычисления среднего вектора: взвешивайте вектора слов соответствующими весами tf-idf.
5. Обучите на полученных средних векторах алгоритм классификации, сравните полученное качество с классификатором fastText. 

In [1]:
# !kaggle datasets download -d yutkin/corpus-of-russian-news-articles-from-lenta
# !unzip data/corpus-of-russian-news-articles-from-lenta.zip -d data/

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [3]:
data_path = 'data/'

lenta = pd.read_csv(data_path + 'lenta-ru-news.csv', usecols=['title', 'text', 'topic'])
lenta = lenta[lenta['topic'].notna()]

lenta = lenta.sample(frac=1, random_state=42)

In [4]:
lenta.shape

(738973, 3)

In [5]:
label_dict = {}

for i, topic in enumerate(lenta['topic'].unique()):
    label_dict[topic] = i

lenta['label'] = lenta['topic'].apply(lambda x: label_dict[x])

label_dict

{'Мир': 0,
 'Наука и техника': 1,
 'Культура': 2,
 'Силовые структуры': 3,
 'Россия': 4,
 'Спорт': 5,
 'Бизнес': 6,
 'Путешествия': 7,
 'Бывший СССР': 8,
 'Дом': 9,
 'Экономика': 10,
 'Интернет и СМИ': 11,
 'Из жизни': 12,
 'Ценности': 13,
 'Культпросвет ': 14,
 '69-я параллель': 15,
 'Крым': 16,
 'Библиотека': 17,
 'Легпром': 18,
 'Оружие': 19,
 'МедНовости': 20,
 'ЧМ-2014': 21,
 'Сочи': 22}

In [6]:
from nltk import tokenize

tokenizer = tokenize.NLTKWordTokenizer()

In [7]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from string import punctuation

noise = stopwords.words('russian') + list(punctuation)
splitters = ['\'\'', '``', '\"', '-', '\'', '\`']

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dalabaya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
import pymorphy3
morph = pymorphy3.MorphAnalyzer()

def morphling_lemmatizer(word):
    parsed_word = morph.parse(word)[0]
    lemma = parsed_word.normal_form
    return lemma

morphling_lemmatizer('деревьев')

'дерево'

In [9]:
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer("russian")

def ru_stemmer(word):
    return snowball_stemmer.stem(word)

ru_stemmer('денег')

'денег'

In [10]:
def preprocess(sentence): 
    for splitter in splitters:
        sentence = sentence.replace(splitter, ' ')  
    tokens = tokenizer.tokenize(sentence.lower())  
    clean_tokens = [token.strip() for token in tokens if token not in noise]
    lemma_tokens = [morphling_lemmatizer(token) for token in clean_tokens]  
    # stemmed_tokens = [ru_stemmer(token) for token in clean_tokens]
    return ' '.join(lemma_tokens)

preprocess('как открыть карты')

'открыть карта'

In [11]:
def preprocess_and_save():
    title_preprocessed = [preprocess(str(sentence)) for sentence in tqdm(lenta['title'], desc='Preprocessing titles ...')]
    text_preprocessed = [preprocess(str(sentence)) for sentence in tqdm(lenta['text'], desc='Preprocessing text ...')]

    preprocessed_lenta = pd.DataFrame({
        'title' : title_preprocessed,
        'text' : text_preprocessed,
        'label' : lenta['label']
    })

    preprocessed_lenta.to_csv(data_path + 'preprocessed_lenta.csv')

In [12]:
def get_test_and_save_train(X_df, y_df, train_path, size):
    X = X_df.astype(str).values.tolist()
    y = y_df.astype(str).values.tolist()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:size], y[:size], test_size=0.25, random_state=42, shuffle=True)

    with open(data_path + train_path + '.txt', 'w', encoding='utf-8') as file:
        for X_entry, y_entry in zip(X_train, y_train):
            X_entry = str(X_entry).replace('\n', ' ').replace('\r', ' ')
            file.write('__label__' + str(y_entry) + ' ' + X_entry)
            file.write('\n')

    return X_train, y_train, X_test, y_test

preprocessed_lenta = pd.read_csv(data_path+'preprocessed_lenta.csv', index_col=False)

X_train, y_train, X_test, y_test = get_test_and_save_train(
    preprocessed_lenta['text'], 
    preprocessed_lenta['label'], 
    train_path='lenta_train_title_text_lemma',
    size=200_000
)

In [13]:
# ! git clone https://github.com/facebookresearch/fastText.git
# ! pip3 install fastText/.

In [14]:
# import fasttext

# ft_model = fasttext.train_supervised(
#     input=data_path+'lenta_train_title_text_lemma.txt',
#     label='__label__',
#     lr=0.5,
#     epoch=25,
#     wordNgrams=2, 
#     dim=200,
#     thread=2,
#     verbose=3000
# )

# ft_model.save_model(data_path+'lenta_model_title_text_lemma.bin')

In [15]:
def predict(test_texts, test_label, selected_model, desc='Unknown parameters model'):
    predicted_labels = [int(selected_model.predict(sentence)[0][0][9:]) for sentence in test_texts]
    test_label = [int(label) for label in test_label]
    
    print(desc, ' => predictions')
    print("Accuracy score: ", accuracy_score(test_label, predicted_labels))
    print("Precision score: ", precision_score(test_label, predicted_labels, average='weighted'))
    print("Recall score: ", recall_score(test_label, predicted_labels, average='weighted'))
    print("f1-score: ", f1_score(test_label, predicted_labels, average='weighted'))

    report = classification_report(test_label, predicted_labels)
    print("\nClassification Report:\n", report)

In [16]:
import fasttext
saved_model = fasttext.load_model(data_path+'lenta_model_title_text_lemma.bin')

In [17]:
predict(X_test, y_test, saved_model, desc='Lemma on title and text columns, fastText')

Lemma on title and text columns, fastText  => predictions
Accuracy score:  0.83914
Precision score:  0.8379081641473527
Recall score:  0.83914
f1-score:  0.8376613548460898

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.85      0.83      9158
           1       0.85      0.86      0.85      3559
           2       0.88      0.90      0.89      3748
           3       0.77      0.63      0.69      1390
           4       0.82      0.85      0.83     10937
           5       0.97      0.96      0.97      4252
           6       0.66      0.53      0.59       552
           7       0.81      0.67      0.73       428
           8       0.86      0.86      0.86      3564
           9       0.89      0.82      0.86      1427
          10       0.86      0.87      0.87      5364
          11       0.79      0.75      0.77      3030
          12       0.71      0.62      0.66      1916
          13       0.91      0.84      0.87  

In [18]:
splitted_X_train = [sentence.split() for sentence in X_train]

In [19]:
%%time
from gensim.models import word2vec

# print("Model training time ... ")

# model_word2vec = word2vec.Word2Vec(
#     splitted_X_train, 
#     workers=2, 
#     vector_size=100, 
#     min_count=10, 
#     window=5, 
#     sg=1, # skip gram
#     sample=1e-3
# )

# model_word2vec.save('data/model_word2vec.model')

CPU times: user 78.5 ms, sys: 10.1 ms, total: 88.6 ms
Wall time: 115 ms


In [21]:
model_word2vec = word2vec.Word2Vec.load('data/model_word2vec.model')

In [22]:
from nltk import FreqDist
from tqdm import tqdm_notebook as tqdm
from sklearn.manifold import TSNE

top_words = []

fd = FreqDist()
for sentence in tqdm(splitted_X_train):
    fd.update(sentence)

for w in fd.most_common(1000):
    top_words.append(w[0])

print(top_words[:50:])
top_words_vec = [model_word2vec.wv[word] for word in top_words if word in model_word2vec.wv]

  0%|          | 0/150000 [00:00<?, ?it/s]

['«', '»', 'год', 'который', '—', 'сообщать', 'россия', 'также', 'свой', 'время', 'заявить', 'компания', 'это', 'стать', 'российский', 'слово', 'президент', 'человек', 'новый', 'тысяча', 'процент', 'первый', 'страна', 'один', 'миллион', 'получить', 'однако', 'дело', 'сша', 'ранее', 'представитель', 'сообщить', 'данные', 'глава', 'москва', 'такой', 'года.', 'около', 'быть', 'результат', 'находиться', 'являться', 'суд', 'должный', 'доллар', 'место', 'отметить', 'несколько', 'решение', 'власть']


In [23]:
%%time
import numpy as np

tsne = TSNE(n_components=2, random_state=0)
top_words_tsne = tsne.fit_transform(np.array(top_words_vec))

CPU times: user 5.14 s, sys: 1.33 s, total: 6.47 s
Wall time: 4.75 s


In [24]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="word2vec T-SNE (eng model, top1000 words)")

source = ColumnDataSource(data=dict(x1=top_words_tsne[:,0],
                                    x2=top_words_tsne[:,1],
                                    names=top_words))

p.scatter(x="x1", y="x2", size=8, source=source)

labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(labels)

show(p)

In [25]:
model_word2vec.wv['путин'].shape

(100,)

In [26]:
model_word2vec.vector_size

100

In [27]:
def vectorize(tokens):
    token_vectors = np.array([model_word2vec.wv[token] for token in tokens if token in model_word2vec.wv])
    return token_vectors.mean(axis=0)

In [30]:
vectorize(X_train[0])

array([-9.46438238e-02,  5.05475067e-02,  4.44313809e-02,  2.79136568e-01,
        4.46579382e-02, -3.37810427e-01,  3.54172289e-02,  3.72418135e-01,
        2.32384086e-01, -3.16601712e-04, -1.19529799e-01, -4.14966077e-01,
       -7.77244270e-02,  1.93555042e-01, -1.38878986e-01, -2.35462695e-01,
        5.59206307e-02, -2.56707132e-01,  1.59904629e-01, -1.48445427e-01,
       -7.68938437e-02, -1.28714442e-01,  3.46696787e-02, -8.96631852e-02,
        2.58179873e-01,  7.68177882e-02, -3.90064344e-02, -5.37783168e-02,
       -8.89691934e-02,  1.00227706e-01,  2.57178247e-01,  6.64872006e-02,
       -1.63667992e-01,  1.47566218e-02, -6.72580376e-02,  1.80502534e-01,
        8.08745623e-02, -4.02787596e-01,  1.03931315e-01, -3.00442100e-01,
       -8.10302049e-02, -2.56163269e-01,  1.92682907e-01, -1.40972376e-01,
        9.42158103e-02,  1.19684383e-01, -4.27272111e-01,  2.20154241e-01,
        1.33057805e-02,  1.84055179e-01, -1.49033770e-01, -3.56741279e-01,
       -2.67898142e-01, -