In [24]:
import pandas as pd
import nltk
from tqdm.notebook import tqdm
import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from operator import itemgetter

# Number of LDA topics ("clusters") fitted further below.
clusters_num = 20

# Read the news spreadsheet and keep the article bodies as a plain Python list.
df = pd.read_excel('Новости retail ru.xlsx')
texts = df['Текст новости'].tolist()

# Step 1: split every article into tokens.
tokkens_lists = [nltk.word_tokenize(article) for article in tqdm(texts)]

# Step 2: keep only tokens whose first pymorphy2 parse has one of the listed
# part-of-speech tags (OpenCorpora: noun, full adjective, verb, infinitive)
# and replace each kept token by its normal form.
morph = pymorphy2.MorphAnalyzer()
teg_list = ['NOUN','ADJF','VERB','INFN']
tokkens_lem_lists = []
for article_tokens in tqdm(tokkens_lists):
    first_parses = (morph.parse(word)[0] for word in article_tokens)
    tokkens_lem_lists.append(
        [parse.normal_form for parse in first_parses if parse.tag.POS in teg_list]
    )

# Step 3: glue the lemmas back into one space-separated string per article,
# which is the input format the vectorizer expects.
texts_lem_lists = [' '.join(lemmas) for lemmas in tqdm(tokkens_lem_lists)]
    
# TF-IDF features over the lemmatized articles, converted to a dense array
# because X is reused later as the classifier's feature matrix.
tfidfconverter = TfidfVectorizer(max_features=4045)
tfidf_sparse = tfidfconverter.fit_transform(texts_lem_lists)
X = tfidf_sparse.toarray()

# Fit the topic model; document_topics holds one row of topic weights per article.
lda = LatentDirichletAllocation(
    n_components=clusters_num,
    learning_method="batch",
    max_iter=25,
    random_state=0,
)
document_topics = lda.fit_transform(X)

# Per topic: term column indices ordered from the largest weight to the smallest.
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]

# Vocabulary terms arranged by their column index, so feature_names[i] is the
# term behind column i of X.
vocabulary = tfidfconverter.vocabulary_
feature_names = np.array(sorted(vocabulary, key=vocabulary.get))

# One ranked term list per topic; transposed so that each topic is a column.
clusters_tokens = [list(feature_names[term_order]) for term_order in tqdm(sorting)]
df_clust = pd.DataFrame(clusters_tokens).T

df_clust.to_excel('Кластеры.xlsx')

  0%|          | 0/4045 [00:00<?, ?it/s]

  0%|          | 0/4045 [00:00<?, ?it/s]

  0%|          | 0/4045 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [25]:
# Indices of the LDA topics that are kept as target classes.
# NOTE(review): presumably hand-picked after inspecting the exported topic
# terms - confirm; the list must be revised whenever the topic model changes.
clusters_norm_nums = [0,1,3,7,8,12,18,19]

# Restrict every document's topic weights to the kept topics (same order as above).
clusters_solution = [doc_weights[clusters_norm_nums] for doc_weights in tqdm(document_topics)]

# Label = position (within clusters_norm_nums, not the raw topic id) of the
# strongest kept topic; ties resolve to the first occurrence.
y = []
for kept_weights in tqdm(clusters_solution):
    weights = list(kept_weights)
    strongest = max(weights)
    y.append(weights.index(strongest))


  0%|          | 0/4045 [00:00<?, ?it/s]

  0%|          | 0/4045 [00:00<?, ?it/s]

In [11]:
# Echo the derived class labels (as the cell's last expression the whole list is displayed).
# NOTE(review): this cell's execution count (11) is lower than that of the cell
# assigning y (25), so the recorded output may come from an earlier run - re-run to confirm.
y

[2,
 1,
 0,
 0,
 1,
 3,
 3,
 1,
 1,
 0,
 1,
 0,
 2,
 1,
 2,
 0,
 3,
 1,
 1,
 3,
 3,
 1,
 1,
 3,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 3,
 2,
 0,
 1,
 1,
 2,
 1,
 0,
 1,
 1,
 3,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 3,
 3,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 1,
 0,
 0,
 3,
 0,
 1,
 0,
 0,
 1,
 0,
 3,
 1,
 1,
 1,
 2,
 1,
 3,
 0,
 1,
 1,
 3,
 0,
 1,
 1,
 3,
 2,
 2,
 1,
 2,
 3,
 2,
 2,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 3,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 3,
 2,
 0,
 0,
 0,
 0,
 3,
 1,
 1,
 1,
 0,
 1,
 0,
 2,
 0,
 1,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 2,
 1,
 3,
 1,
 3,
 3,
 3,
 1,
 3,
 0,
 3,
 0,
 3,
 0,
 0,
 3,
 1,
 0,
 3,
 0,
 1,
 1,
 2,
 0,
 2,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 3,
 1,
 0,
 1,
 0,
 3,
 0,
 1,
 0,
 3,
 0,
 2,
 3,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 3,
 1,
 1,
 3,
 1,
 3,
 0,
 3,
 1,
 0,
 3,
 2,
 2,
 2,
 1,
 1,
 1,
 0,
 0,
 1,
 3,
 0,
 1,
 1,
 2,
 0,
 2,
 3,
 2,
 0,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 1,
 1,
 2,
 3,
 0,
 3,
 1,
 0,
 2,
 0,
 0,
 1,
 1,
 1,


In [26]:
# Hold out 20% of the documents for evaluation; the seed is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Random forest that learns to predict the LDA-derived label from the TF-IDF features.
forest_params = {'n_estimators': 1000, 'random_state': 0}
classifier = RandomForestClassifier(**forest_params)
# Left as the final expression so the cell still echoes the fitted estimator.
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [27]:
# Score the classifier on the held-out split: share of test documents whose
# predicted label equals the LDA-derived one.
y_pred = classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

0.6093943139678616


In [28]:
# Class-membership probabilities for every article (this includes the rows the
# forest was trained on, since X_train was drawn from the same corpus).
# Use transform(), not fit_transform(): the vectorizer is already fitted, and
# re-fitting it at prediction time would silently rebuild the vocabulary instead
# of reusing the feature columns the classifier was trained on.
scores = classifier.predict_proba(tfidfconverter.transform(texts_lem_lists).toarray())

# Build the export table column-wise so the probabilities stay numeric.
# (Concatenating each text with its score row via np.append yields a string
# array, which made every probability land in Excel as text.)
# The layout is unchanged: column 0 is the article text, columns 1..k hold the
# per-class probabilities in the order of classifier.classes_.
presence = pd.DataFrame(scores)
presence.columns = range(1, scores.shape[1] + 1)
presence.insert(0, 0, texts)
presence.to_excel('Присутствие_кластеров.xlsx')

In [21]:
# Number of lemmatized documents.
# NOTE(review): the recorded output (4035) disagrees with the 4045 items shown
# by the progress bars of the preprocessing cell, and this cell's execution
# count (21) precedes that cell's (24) - the output is likely stale; re-run to confirm.
len(tokkens_lem_lists)

4035