In [7]:
import pandas as pd
import numpy as np

import nltk
from nltk import tokenize
from nltk.corpus import stopwords

import re
import pymorphy2

import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV, train_test_split

import matplotlib.pyplot as plt
import seaborn as sns

import pyLDAvis.lda_model

from tqdm import tqdm_notebook
import warnings
import joblib

from utils import *

warnings.filterwarnings('ignore')

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dmitry/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/dmitry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
df = pd.read_csv('data/topic_modeling_task_sample_trainPart.csv') 

In [9]:
df['replicas'] = df['text_employer'].str.split('.')
data = df[['ucid','replicas']].explode('replicas').reset_index(drop=True).dropna()
data = data[data['replicas'] != '']

In [11]:
stop_words = stopwords.words('russian')
extra_stop_words = get_stopwords_from_file('stopwords.txt')
stop_words_extended = set(stop_words + extra_stop_words)

morph = pymorphy2.MorphAnalyzer()

def process_string(input_string, morph=morph, stop_words=stop_words_extended):
    words = re.findall(r'\b[а-яА-Я_]+\b', input_string)
    result_string = ' '.join([word if '_' in word else morph.parse(word)[0].normal_form for word in words if word not in stop_words])
    
    return result_string

data['replicas'] = data['replicas'].apply(process_string)

In [12]:
data['replicas']

0                                               добрый_день
1         клиентский_менеджер виктория сбер бизнес звони...
2                                                     знать
3         понять ваш предложение который хотеть обсудить...
4         возможность подключить бизнес кэшбэк зарабатыв...
                                ...                        
556210                                       ориентировочно
556211                                               удобно
556212                    какой вопрос другой продукт_банка
556213    понять перезвонить следующий среда ориентирово...
556214                                      добрый свидание
Name: replicas, Length: 556215, dtype: object

In [32]:
X_train, X_test = train_test_split(data.replicas, test_size=0.8, random_state=42)

# LDA

In [33]:
vector_ben = CountVectorizer(
    analyzer='word',
    min_df=10,
    ngram_range=(1, 2),
    stop_words=stopwords.words('russian'),
    # max_features=10000,
)
train_vec_ben = vector_ben.fit_transform(X_train)
test_vec_ben = vector_ben.transform(X_test)
train_vec_ben.shape

(111243, 10712)

In [34]:
lda_model_ben = LatentDirichletAllocation(
        n_components=15,
        learning_method='online',
        random_state=42,
        max_iter=10,
        n_jobs=4,
        verbose=1
    )

with joblib.parallel_backend(backend='loky', n_jobs=4):
    lda_model_ben.fit(train_vec_ben)

iteration: 1 of max_iter: 10
iteration: 2 of max_iter: 10
iteration: 3 of max_iter: 10
iteration: 4 of max_iter: 10
iteration: 5 of max_iter: 10
iteration: 6 of max_iter: 10
iteration: 7 of max_iter: 10
iteration: 8 of max_iter: 10
iteration: 9 of max_iter: 10
iteration: 10 of max_iter: 10


In [38]:
# pyLDAvis.enable_notebook()
# panel = pyLDAvis.lda_model.prepare(
#     lda_model_ben,
#     train_vec_ben,
#     vector_ben,
#     mds='tsne'
# )
# panel