## Задание

1. Мы будем работать с (частичными) данными lenta.ru отсюда: https://www.kaggle.com/yutkin/corpus-of-russian-news-articles-from-lenta/
2. Проведите препроцессинг текста. Разбейте данные на train и test для задачи классификации (в качестве метки класса будем использовать поле topic). В качестве данных для классификации в пунктах 3 и 5 возьмите
    - только заголовки (title)
    - только тексты новости (text)
    - и то, и другое
3. Обучите fastText для классификации текстов по темам. Сравните качество для разных данных из п. 2.
4. Обучите свою модель w2v (или возьмите любую подходящую предобученную модель). Реализуйте функцию для вычисления вектора текста / заголовка / текста+заголовка как среднего вектора входящих в него слов. 
     - (Бонус) Модифицируйте функцию вычисления среднего вектора: взвешивайте вектора слов соответствующими весами tf-idf.
5. Обучите на полученных средних векторах алгоритм классификации, сравните полученное качество с классификатором fastText. 

In [1]:
# !kaggle datasets download -d yutkin/corpus-of-russian-news-articles-from-lenta
# !unzip data/corpus-of-russian-news-articles-from-lenta.zip -d data/

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [3]:
data_path = 'data/'

lenta = pd.read_csv(data_path + 'lenta-ru-news.csv', usecols=['title', 'text', 'topic'])
lenta = lenta[lenta['topic'].notna()]

lenta = lenta.sample(frac=1, random_state=42)

In [4]:
lenta.shape

(738973, 3)

In [5]:
label_dict = {}

for i, topic in enumerate(lenta['topic'].unique()):
    label_dict[topic] = i

lenta['label'] = lenta['topic'].apply(lambda x: label_dict[x])

label_dict

{'Мир': 0,
 'Наука и техника': 1,
 'Культура': 2,
 'Силовые структуры': 3,
 'Россия': 4,
 'Спорт': 5,
 'Бизнес': 6,
 'Путешествия': 7,
 'Бывший СССР': 8,
 'Дом': 9,
 'Экономика': 10,
 'Интернет и СМИ': 11,
 'Из жизни': 12,
 'Ценности': 13,
 'Культпросвет ': 14,
 '69-я параллель': 15,
 'Крым': 16,
 'Библиотека': 17,
 'Легпром': 18,
 'Оружие': 19,
 'МедНовости': 20,
 'ЧМ-2014': 21,
 'Сочи': 22}

## Препроцессинг
Токенизация, чистка от пунктуации и стопвордов

Запись в .txt в виде __label__i для fasttex

In [6]:
# from nltk import tokenize

# tokenizer = tokenize.NLTKWordTokenizer()

In [7]:
# import nltk
# nltk.download('stopwords')

# from nltk.corpus import stopwords
# from string import punctuation

# noise = stopwords.words('russian') + list(punctuation)
# splitters = ['\'\'', '``', '\"', '-', '\'', '\`']

In [8]:
# import pymorphy3
# morph = pymorphy3.MorphAnalyzer()

# def morphling_lemmatizer(word):
#     parsed_word = morph.parse(word)[0]
#     lemma = parsed_word.normal_form
#     return lemma

# morphling_lemmatizer('деревьев')

In [9]:
# from nltk.stem import SnowballStemmer

# snowball_stemmer = SnowballStemmer("russian")

# def ru_stemmer(word):
#     return snowball_stemmer.stem(word)

# ru_stemmer('денег')

In [10]:
# def preprocess(sentence): 
#     for splitter in splitters:
#         sentence = sentence.replace(splitter, ' ')  
#     tokens = tokenizer.tokenize(sentence.lower())  
#     clean_tokens = [token.strip() for token in tokens if token not in noise]
#     lemma_tokens = [morphling_lemmatizer(token) for token in clean_tokens]  
#     # stemmed_tokens = [ru_stemmer(token) for token in clean_tokens]
#     return ' '.join(lemma_tokens)

# preprocess('как открыть карты')

In [11]:
# def preprocess_and_save():
#     title_preprocessed = [preprocess(str(sentence)) for sentence in tqdm(lenta['title'], desc='Preprocessing titles ...')]
#     text_preprocessed = [preprocess(str(sentence)) for sentence in tqdm(lenta['text'], desc='Preprocessing text ...')]

#     preprocessed_lenta = pd.DataFrame({
#         'title' : title_preprocessed,
#         'text' : text_preprocessed,
#         'label' : lenta['label']
#     })

#     preprocessed_lenta.to_csv(data_path + 'preprocessed_lenta.csv')

In [90]:
def get_test_and_save_train(X_df, y_df, train_path, size):
    X = X_df.astype(str).values.tolist()
    y = y_df.astype(str).values.tolist()
    
    X_train, X_test, y_train, y_test = train_test_split(X[:size], y[:size], test_size=0.25, random_state=42, shuffle=True)

    with open(data_path + train_path + '.txt', 'w', encoding='utf-8') as file:
        for X_entry, y_entry in zip(X_train, y_train):
            X_entry = str(X_entry).replace('\n', ' ').replace('\r', ' ')
            file.write('__label__' + str(y_entry) + ' ' + X_entry)
            file.write('\n')

    return X_train, y_train, X_test, y_test

preprocessed_lenta = pd.read_csv(data_path+'preprocessed_lenta.csv', index_col=False)

X_train, y_train, X_test, y_test = get_test_and_save_train(
    preprocessed_lenta['text'], 
    preprocessed_lenta['label'], 
    train_path='lenta_train_title_text_lemma',
    size=100_000
)

## FastText

In [91]:
# ! git clone https://github.com/facebookresearch/fastText.git
# ! pip3 install fastText/. 

In [92]:
import fasttext

# ft_model = fasttext.train_supervised(
#     input=data_path+'lenta_train_title_text_lemma.txt',
#     label='__label__',
#     lr=0.5,
#     epoch=25,
#     wordNgrams=2, 
#     dim=200,
#     thread=2,
#     verbose=3000
# )

# ft_model.save_model(data_path+'lenta_model_title_text_lemma.bin')

In [93]:
def print_metrics(true_and_predicted_labels):
    true_labels, predicted_labels = true_and_predicted_labels

    print("Accuracy score: ", accuracy_score(true_labels, predicted_labels))
    print("Precision score: ", precision_score(true_labels, predicted_labels, average='weighted'))
    print("Recall score: ", recall_score(true_labels, predicted_labels, average='weighted'))
    print("f1-score: ", f1_score(true_labels, predicted_labels, average='weighted'))

    # report = classification_report(true_labels, predicted_labels)
    # print("\nClassification Report:\n", report)

def predict_fasttext(test_texts, test_label, selected_model):
    predicted_labels = [int(selected_model.predict(sentence)[0][0][9:]) for sentence in test_texts]
    test_label = [int(label) for label in test_label]
    
    return test_label, predicted_labels

In [94]:
model_fasttext = fasttext.load_model(data_path+'lenta_model_title_text_lemma.bin')

In [95]:
print_metrics(predict_fasttext(X_test, y_test, model_fasttext))

Accuracy score:  0.8364
Precision score:  0.8355563490686044
Recall score:  0.8364
f1-score:  0.8351049900817501


## Word2Vec

In [96]:
splitted_X_train = [sentence.split() for sentence in X_train]
splitted_X_test = [sentence.split() for sentence in X_test]

y_train = [int(label) for label in y_train]
y_test = [int(label) for label in y_test]

In [97]:
%%time
from gensim.models import word2vec

# model_word2vec = word2vec.Word2Vec(
#     splitted_X_train, 
#     workers=2, 
#     vector_size=100, 
#     min_count=10, 
#     window=5, 
#     sg=1, # skip gram
#     sample=1e-3
# )

# model_word2vec.save('data/model_word2vec.model')

CPU times: user 4 μs, sys: 2 μs, total: 6 μs
Wall time: 7.87 μs


In [98]:
model_word2vec = word2vec.Word2Vec.load('data/model_word2vec.model')

In [99]:
from nltk import FreqDist
from tqdm import tqdm_notebook as tqdm
from sklearn.manifold import TSNE

# top_words = []

# fd = FreqDist()
# for sentence in tqdm(splitted_X_train):
#     fd.update(sentence)

# for w in fd.most_common(1000):
#     top_words.append(w[0])

# print(top_words[:50:])
# top_words_vec = [model_word2vec.wv[word] for word in top_words if word in model_word2vec.wv]

In [100]:
# %%time
# import numpy as np

# tsne = TSNE(n_components=2, random_state=0)
# top_words_tsne = tsne.fit_transform(np.array(top_words_vec))

In [101]:
# from bokeh.models import ColumnDataSource, LabelSet
# from bokeh.plotting import figure, show, output_file
# from bokeh.io import output_notebook
# output_notebook()

# p = figure(tools="pan,wheel_zoom,reset,save",
#            toolbar_location="above",
#            title="word2vec T-SNE (eng model, top1000 words)")

# source = ColumnDataSource(data=dict(x1=top_words_tsne[:,0],
#                                     x2=top_words_tsne[:,1],
#                                     names=top_words))

# p.scatter(x="x1", y="x2", size=8, source=source)

# labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
#                   text_font_size="8pt", text_color="#555555",
#                   source=source, text_align='center')
# p.add_layout(labels)

# show(p)

## Tf-Idf mean vector

In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tfidf = TfidfVectorizer()

vectorizer_tfidf.fit(X_train)

In [103]:
weight_matrix = vectorizer_tfidf.transform(X_train)

word_weights = weight_matrix.mean(axis=0).tolist()[0]

In [104]:
def get_weight(word):
    if word not in vectorizer_tfidf.vocabulary_:
        return 0.0  
    word_index = vectorizer_tfidf.vocabulary_[word]
    return word_weights[word_index]

In [105]:
get_weight('мир')

0.004982001534808342

In [106]:
def vectorize(tokens, type='mean'):
    token_vectors = np.array([model_word2vec.wv[token] for token in tokens if token in model_word2vec.wv])
    token_weights = np.array([get_weight(token) for token in tokens if token in model_word2vec.wv])

    vector_dim = model_word2vec.vector_size

    if token_vectors.shape[0] == 0:
        return np.zeros(vector_dim)
    
    if type == 'weighted':
        weighted_mean_vector = np.array([token_vector * weight for token_vector, weight in zip(token_vectors, token_weights)])
        return weighted_mean_vector.sum(axis=0)

    return token_vectors.mean(axis=0)

In [107]:
vectorize(['русский', 'мир'], type='weighted')

array([-8.9601835e-04,  2.9487943e-04, -5.6206476e-04,  4.1954666e-03,
        6.3481735e-04, -3.6877389e-03,  9.5972075e-04,  1.9071362e-05,
        1.6018945e-03,  9.5596036e-04, -2.9662726e-03, -4.8732697e-03,
       -8.5602811e-04,  2.5713143e-03,  1.5966662e-03, -2.5811195e-03,
        1.9772132e-03,  5.4687238e-04,  1.5672740e-03, -4.0422252e-04,
        9.6653844e-04, -1.8992596e-03,  1.8783676e-03,  8.2488242e-04,
        1.2315471e-03, -3.5024327e-04,  5.7304639e-04,  1.1081567e-03,
       -1.4391242e-03,  2.9365183e-05,  2.9193771e-03, -1.4995056e-03,
        6.6156616e-04,  8.6261856e-04, -1.2102296e-03,  2.4555195e-03,
        3.4409339e-04, -2.5471859e-03, -1.0210541e-03, -1.5783029e-03,
        3.5612455e-03, -3.3260044e-03,  2.8702451e-04,  1.8084520e-03,
        1.7115693e-03,  5.8625237e-04,  1.3293879e-04,  1.0802443e-03,
       -1.9534554e-03,  8.4588537e-04, -7.6607372e-05, -1.3060172e-03,
       -1.5686687e-04, -5.5341111e-03, -1.5251795e-03,  1.1842126e-03,
      

In [108]:
vectorize(['русский', 'мир'], type='mean')

array([-0.31860754,  0.06830493,  0.040143  ,  0.66430634,  0.08700847,
       -0.40724123,  0.01372415, -0.04465154,  0.16026951, -0.01157925,
       -0.31812578, -0.6462939 , -0.09814704,  0.43068326,  0.2763589 ,
       -0.3749168 ,  0.24966732,  0.14633267,  0.1801853 , -0.11125371,
        0.09321623, -0.08896002,  0.21702746,  0.12206404,  0.12936826,
       -0.03780378,  0.10421629,  0.09504369, -0.1656234 ,  0.10033531,
        0.33208683, -0.14149038,  0.17004098,  0.09397253, -0.09832336,
        0.34271854,  0.08693873, -0.22405119, -0.05117177, -0.2142378 ,
        0.41478652, -0.43254495,  0.0340953 ,  0.30622545,  0.15119542,
       -0.01600926,  0.09390956,  0.187486  , -0.19516206,  0.0901229 ,
        0.00650435, -0.24489096, -0.03387221, -0.6635891 , -0.2622139 ,
        0.1806831 ,  0.43603146, -0.02065751, -0.14719366,  0.0855608 ,
       -0.37081704, -0.18898991,  0.37299782, -0.20646967, -0.46013147,
       -0.326745  ,  0.3541817 ,  0.6195999 , -0.18287213,  0.23

In [109]:
get_weight('путин')

0.0042739132403748405

In [110]:
vectorize(['путин', 'убить'], type='weighted')

array([ 1.75642385e-03, -4.08613036e-04,  1.62460923e-03,  5.54849394e-04,
        3.67448461e-04,  1.27185252e-04,  8.27361539e-04,  1.82603090e-03,
       -9.86123225e-04, -2.57253437e-03, -3.76478070e-04, -2.83349329e-03,
       -1.57442887e-03,  4.09550779e-03, -2.39287619e-04, -1.94687594e-03,
        3.81215941e-05,  1.44836365e-03, -3.36532108e-03, -2.23176787e-03,
        2.20408174e-03,  2.18900316e-03,  1.44213752e-03,  7.59169692e-04,
        2.80126045e-03,  2.26212409e-03, -4.51708387e-04, -3.16560594e-03,
       -1.91427011e-03, -7.35129288e-04,  5.49515360e-04,  3.69246205e-04,
        2.01386312e-04, -2.78596324e-03,  1.56834419e-03,  1.93321100e-03,
       -2.84271361e-03, -1.91952800e-03, -2.23401003e-03, -1.02809549e-03,
        1.27636094e-03, -9.52046190e-04, -1.21822837e-03,  9.85855935e-04,
        1.24202401e-03, -1.62067835e-03, -7.70929386e-04, -3.60589300e-04,
        3.10172071e-03,  6.80996571e-04, -2.40971660e-03, -2.70722015e-03,
       -3.04865011e-04,  

## Логистическая регрессия

In [111]:
from sklearn.linear_model import LogisticRegression

model_logreg = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs'
)

In [115]:
X_train_vectors = [vectorize(sentence, type='mean') for sentence in tqdm(splitted_X_train, desc='Vectorizing train sentences ...')]
X_test_vectors = [vectorize(sentence, type='mean') for sentence in tqdm(splitted_X_test, desc='Vectorizing test sentences ...')]

Vectorizing train sentences ...:   0%|          | 0/75000 [00:00<?, ?it/s]

Vectorizing test sentences ...:   0%|          | 0/25000 [00:00<?, ?it/s]

In [116]:
model_logreg.fit(X_train_vectors, y_train)

In [117]:
print_metrics((y_test, model_logreg.predict(X_test_vectors).tolist()))

Accuracy score:  0.76384
Precision score:  0.7575163524722448
Recall score:  0.76384
f1-score:  0.7572917138196508


## XGBoost

In [34]:
import xgboost as xgb

model_xgb = xgb.XGBClassifier()

In [35]:
model_xgb.fit(X_train_vectors, y_train)

model_xgb.save_model(data_path+'model_xgb.json')

In [36]:
model_xgb = xgb.XGBClassifier()
model_xgb.load_model(data_path+'model_xgb.json')

In [37]:
print_metrics((y_test, model_xgb.predict(X_test_vectors).tolist()))

Accuracy score:  0.7676
Precision score:  0.7631943578226807
Recall score:  0.7676
f1-score:  0.7612159310618639
