In [1]:
import re
import numpy as np
import pandas as pd
import dill

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

from razdel import tokenize

import pymorphy2

from sklearn.model_selection import train_test_split

from ipykernel import kernelapp as app

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/macbook/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stopwords_ru = stopwords.words('russian')

In [3]:
with open('stopwords.txt') as f_sw:
    additional_stopwords = [w.strip() for w in f_sw.readlines() if w]
    
stopwords_ru += additional_stopwords

In [4]:
morph = pymorphy2.MorphAnalyzer()

In [5]:
def clean_text(text):
    """
    Очистка текста
    
    """
    
    if not isinstance(text, str):
        text = str(text)
        
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    
    text = re.sub('-\s\r\n\|-\s\r\n|\r\n', '', str(text))
    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    return text

In [6]:
cache = {}

def lemmatization(text):
    
    if not isinstance(text, str):
        text = str(text)
        
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]
    
    words_lem = []
    
    for w in words:
        if w[0] == '-':
            w = w[1:]
        
        if len(w) > 1:
            if w in cache:
                words_lem.append(cache[w])
                
            else:
                temp_cache = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cache)
                
    words_lem_without_stopwords = [i for i in words_lem if not i in stopwords_ru]
    
    return words_lem_without_stopwords

In [7]:
news = pd.read_csv('lenta_ru_news.csv')
news.head()

Unnamed: 0,doc_id,title,target
0,1,Роспотребнадзор раскрыл пути проникновения нов...,Общество
1,2,Российский инфекционист оценил обнаружение «хо...,Общество
2,3,В Крыму нашли способ обойти водную блокаду В К...,Общество
3,4,В Госдуме ответили на предложение Зюганова вве...,Общество
4,5,Самолет экстренно сел в российском аэропорту и...,Происшествия


In [8]:
news['title'] = news['title'].apply(clean_text)

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


In [9]:
news['title'] = news['title'].apply(lemmatization)

In [10]:
targets = pd.read_csv('target.csv')
targets.head()

Unnamed: 0,Id,target
0,0,Общество
1,1,Происшествия
2,2,Россия
3,3,Мнения
4,4,Мир


In [11]:
# Сформируем список наших текстов, разбив еще и на пробелы
texts = [i for i in news['title'].values]

# Создаем corpus для списка текстов
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

#### Обучаем LDA модель

In [12]:
lda = LdaModel(common_corpus, num_topics=38, id2word=common_dictionary)

In [13]:
# Сохраним модель
temp_file = datapath("model.lda")
lda.save(temp_file)

# Загрузим модель
lda = LdaModel.load(temp_file)

In [14]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(38):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

#### Преобразуем список текста в векторное представление

In [15]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(38)]
topic_matrix['target'] = news['target'].values
topic_matrix = topic_matrix[['topic_{}'.format(i) for i in range(38)] + ['target']]
topic_matrix.head(5)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,target
0,0.0,0.0,0.156265,0.0,0.154004,0.0,0.0,0.0,0.0,0.156274,...,0.0,0.0,0.0,0.0,0.0,0.155345,0.0,0.0,0.0,Общество
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Общество
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.154484,0.0,0.0,0.0,Общество
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.309735,0.0,0.0,0.156184,0.154067,0.0,0.0,0.0,Общество
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.119657,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.628155,Происшествия


In [16]:
X_test = topic_matrix.loc[topic_matrix['target'] == 'Неизвестно', :'topic_37']
# X_test.to_csv('X_test', index=None)

In [17]:
topic_matrix = pd.merge(topic_matrix, targets).drop('target', axis=1)
topic_matrix = topic_matrix.rename(columns={'Id': 'target'})
topic_matrix.head(3)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_29,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,target
0,0.0,0.0,0.156265,0.0,0.154004,0.0,0.0,0.0,0.0,0.156274,...,0.0,0.0,0.0,0.0,0.0,0.155345,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.154484,0.0,0.0,0.0,0


#### Делим датасет на тренировноый и валидационный

In [18]:
X = topic_matrix.drop('target', axis=1)
y = topic_matrix['target']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

#save test
X_valid.to_csv("X_valid.csv", index=None)
y_valid.to_csv("y_valid.csv", index=None)
#save train
X_train.to_csv("X_train.csv", index=None)
y_train.to_csv("y_train.csv", index=None)