### Тематическое моделирование статей Википедии с использованием библиотеки Gensim

In [None]:
pip install wikipedia
pip install pyLDAvis

In [1]:
import wikipedia
import nltk

#### Скачиваем английские стоп-слова

In [2]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruseel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Создаем корпус текстов из 4 статей Wikipedia

In [3]:
global_warming = wikipedia.page("Climate change")
artificial_intelligence = wikipedia.page("Artificial Intelligence")
eiffel_tower = wikipedia.page("Eiffel Tower")
football = wikipedia.page("Football")

corpus = [global_warming.content, artificial_intelligence.content, eiffel_tower.content, football.content]


In [4]:
corpus

 'Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to natural intelligence displayed by animals including humans. Leading AI textbooks define the field as the study of "intelligent agents": any system that perceives its environment and takes actions that maximize its chance of achieving its goals.Some popular accounts use the term "artificial intelligence" to describe machines that mimic "cognitive" functions that humans associate with the human mind, such as "learning" and "problem solving", however, this definition is rejected by major AI researchers.AI applications include advanced web search engines (e.g., Google), recommendation systems (used by YouTube, Amazon and Netflix), understanding human speech (such as Siri and Alexa), self-driving cars (e.g., Tesla), automated decision-making and competing at the highest level in strategic game systems (such as chess and Go).\nAs machines become increasingly capable, tasks considered to require "intelligen

#### Этап предобработки данных

In [5]:
import re
from nltk.stem import WordNetLemmatizer

stemmer = WordNetLemmatizer()

def preprocess_text(document):
        # Remove all the special characters
        document = re.sub(r'\W', ' ', str(document))

        # remove all single characters
        document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

        # Remove single characters from the start
        document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

        # Substituting multiple spaces with single space
        document = re.sub(r'\s+', ' ', document, flags=re.I)

        # Removing prefixed 'b'
        document = re.sub(r'^b\s+', '', document)

        # Converting to Lowercase
        document = document.lower()

        # Lemmatization
        tokens = document.split()
        tokens = [stemmer.lemmatize(word) for word in tokens]
        tokens = [word for word in tokens if word not in en_stop]
        tokens = [word for word in tokens if len(word)  > 5]

        return tokens


#### Создаем корпус

In [6]:
import nltk
#nltk.download('wordnet')
#nltk.download('omw-1.4')
processed_data = [];
for doc in corpus:
    tokens = preprocess_text(doc)
    processed_data.append(tokens)

#### Создание словаря и соответствующего пакета слов корпуса

In [7]:
from gensim import corpora

gensim_dictionary = corpora.Dictionary(processed_data)
gensim_corpus = [gensim_dictionary.doc2bow(token, allow_update=True) for token in processed_data]

#### Сохранение словаря и корпуса слов

In [8]:
import pickle

pickle.dump(gensim_corpus, open('gensim_corpus_corpus.pkl', 'wb'))
gensim_dictionary.save('gensim_dictionary.gensim')

#### Построение модели LDA и вывод 10 значимых слов для каждой темы

In [9]:
import gensim

lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')

In [10]:
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.072*"football" + 0.016*"played" + 0.016*"school" + 0.013*"player" + 0.011*"association" + 0.008*"australian" + 0.007*"american" + 0.007*"century" + 0.007*"league" + 0.005*"modern"')
(1, '0.020*"intelligence" + 0.016*"artificial" + 0.015*"original" + 0.015*"archived" + 0.013*"retrieved" + 0.011*"machine" + 0.009*"learning" + 0.008*"problem" + 0.007*"october" + 0.006*"network"')
(2, '0.031*"climate" + 0.024*"change" + 0.016*"warming" + 0.014*"emission" + 0.013*"global" + 0.010*"carbon" + 0.009*"energy" + 0.009*"greenhouse" + 0.008*"country" + 0.007*"temperature"')
(3, '0.026*"eiffel" + 0.007*"second" + 0.006*"french" + 0.005*"structure" + 0.005*"exposition" + 0.005*"tallest" + 0.005*"engineer" + 0.004*"design" + 0.004*"million" + 0.004*"france"')


#### Построение модели LDA и выделение 8 тем

In [12]:
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=8, id2word=gensim_dictionary, passes=15)
lda_model.save('gensim_model.gensim')
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.000*"intelligence" + 0.000*"artificial" + 0.000*"original" + 0.000*"retrieved" + 0.000*"football"')
(1, '0.022*"intelligence" + 0.018*"artificial" + 0.016*"original" + 0.016*"archived" + 0.015*"retrieved"')
(2, '0.001*"intelligence" + 0.000*"archived" + 0.000*"artificial" + 0.000*"original" + 0.000*"machine"')
(3, '0.000*"football" + 0.000*"intelligence" + 0.000*"original" + 0.000*"archived" + 0.000*"artificial"')
(4, '0.000*"eiffel" + 0.000*"intelligence" + 0.000*"change" + 0.000*"original" + 0.000*"archived"')
(5, '0.000*"football" + 0.000*"original" + 0.000*"intelligence" + 0.000*"eiffel" + 0.000*"artificial"')
(6, '0.023*"climate" + 0.018*"change" + 0.013*"eiffel" + 0.012*"warming" + 0.011*"emission"')
(7, '0.082*"football" + 0.018*"played" + 0.018*"school" + 0.015*"player" + 0.013*"association"')


#### Повторный запуск и сравнение результатов

In [13]:
lda_model = gensim.models.ldamodel.LdaModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary, passes=20)
lda_model.save('gensim_model.gensim')
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.031*"climate" + 0.024*"change" + 0.016*"warming" + 0.014*"emission" + 0.013*"global" + 0.010*"carbon" + 0.009*"energy" + 0.009*"greenhouse" + 0.008*"country" + 0.007*"temperature"')
(1, '0.072*"football" + 0.016*"played" + 0.016*"school" + 0.013*"player" + 0.011*"association" + 0.008*"australian" + 0.007*"american" + 0.007*"century" + 0.007*"league" + 0.005*"modern"')
(2, '0.026*"eiffel" + 0.007*"second" + 0.006*"french" + 0.005*"structure" + 0.005*"exposition" + 0.005*"tallest" + 0.005*"engineer" + 0.004*"design" + 0.004*"million" + 0.004*"france"')
(3, '0.020*"intelligence" + 0.016*"artificial" + 0.015*"original" + 0.015*"archived" + 0.013*"retrieved" + 0.011*"machine" + 0.009*"learning" + 0.008*"problem" + 0.007*"october" + 0.006*"network"')


#### Построение модели LSA

In [14]:
from gensim.models import LsiModel

lsi_model = LsiModel(gensim_corpus, num_topics=4, id2word=gensim_dictionary)
topics = lsi_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '-0.859*"football" + -0.189*"played" + -0.187*"school" + -0.158*"player" + -0.132*"association" + -0.096*"australian" + -0.089*"american" + -0.085*"century" + -0.080*"league" + -0.064*"modern"')
(1, '0.390*"intelligence" + 0.310*"artificial" + 0.284*"original" + 0.278*"archived" + 0.256*"retrieved" + 0.214*"machine" + 0.164*"learning" + 0.154*"problem" + 0.146*"climate" + 0.132*"october"')
(2, '-0.524*"climate" + -0.388*"change" + -0.271*"warming" + -0.238*"emission" + -0.208*"global" + -0.169*"carbon" + -0.155*"energy" + -0.147*"greenhouse" + 0.145*"intelligence" + -0.120*"country"')
(3, '0.689*"eiffel" + 0.194*"second" + 0.161*"french" + 0.145*"exposition" + 0.143*"structure" + 0.128*"tallest" + 0.119*"engineer" + 0.108*"design" + 0.102*"restaurant" + 0.102*"france"')


#### Оценка модели LDA. Тестирование на новых данных.

In [15]:
test_doc = 'Great structures are build to remember an event happened in the history.'
test_doc = preprocess_text(test_doc)
bow_test_doc = gensim_dictionary.doc2bow(test_doc)

print(lda_model.get_document_topics(bow_test_doc))


[(0, 0.08426886), (1, 0.085419625), (2, 0.74513805), (3, 0.08517347)]


#### Оценка модели LDA. Preplexity и Coherence Score. 

In [16]:
print('\nPerplexity:', lda_model.log_perplexity(gensim_corpus))

from gensim.models import CoherenceModel

coherence_score_lda = CoherenceModel(model=lda_model, texts=processed_data, dictionary=gensim_dictionary, coherence='c_v')
coherence_score = coherence_score_lda.get_coherence()

print('\nCoherence Score:', coherence_score)



Perplexity: -7.414657857681082

Coherence Score: 0.7406677801105871


#### Визуализация LDA

In [17]:
gensim_dictionary = gensim.corpora.Dictionary.load('gensim_dictionary.gensim')
gensim_corpus = pickle.load(open('gensim_corpus_corpus.pkl', 'rb'))
lda_model = gensim.models.ldamodel.LdaModel.load('gensim_model.gensim')

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# pyLDAvis.enable_notebook()
lda_visualization = pyLDAvis.gensim_models.prepare(lda_model, gensim_corpus, gensim_dictionary, sort_topics=False)
pyLDAvis.display(lda_visualization)

  by='saliency', ascending=False).head(R).drop('saliency', 1)
