In [2]:
# Scrape a reduced amount of data from the web using Python client for Wikipedia's API.
import wikipedia

page_titles = ['Python (programming language)', 'Natural language processing', 'Geography', 
               'History', 'Knowledge', 'Science', 'Art', 'Politics', 'War', 'Food', 'Mathematics']

page_summaries = []
for page_title in page_titles:
    page_summaries.append(wikipedia.WikipediaPage(title=page_title).summary)

In [3]:
print('Example of a summary:\n"{}..."'.format(page_summaries[0][0:250]))

Example of a summary:
"Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language co..."


In [4]:
# Tokenize summaries. This includes downloading the relevant pre-proccessing elements for NLTK.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/christo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/christo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

stop_words = set(stopwords.words('english'))
lemma = WordNetLemmatizer()
porter = PorterStemmer()

def summary_preprocessing(summary: str) -> list:
    '''
    Performs preprocessing including lower-casing, dropping non-alphabetic chars,  
    dropping stopwords, stemming and lemmatizing.
    '''
    words = nltk.word_tokenize(summary)
    words = [word for word in words if word.isalpha()]
    words = [w for w in words if not w in stop_words]
    words = [porter.stem(word) for word in words]
    words = " ".join(lemma.lemmatize(word) for word in words)
    return words

tokanized_page_summaries = []
for summary in page_summaries:
    tokanized_page_summaries.append(summary_preprocessing(summary))

In [6]:
print('Example of a tokanized summary:\n{}...'.format(tokanized_page_summaries[0][0:100]))

Example of a tokanized summary:
python interpret program languag creat guido van rossum first releas python design philosophi emphas...


In [7]:
# Transfrom summaries into a summary-word matrix
import gensim
from gensim import corpora

processed_summaries = [summary_preprocessing(summary).split() for summary in page_summaries]        
dictionary = corpora.Dictionary(processed_summaries)
summary_word_matrix = [dictionary.doc2bow(summary) for summary in processed_summaries]

In [8]:
# Use LDA to create summary-topic & topic-word distributions.
lda_model = gensim.models.ldamodel.LdaModel(summary_word_matrix, num_topics=3, id2word = dictionary, passes=50)

In [9]:
# Print some topic-word lists with weightings. 
# Reference to below 2 lines: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
for idx, words in lda_model.print_topics(-1):
    print('Topic {}: {}'.format(idx, words))

Topic 0: 0.028*"art" + 0.028*"food" + 0.011*"right" + 0.009*"human" + 0.009*"the" + 0.009*"skill" + 0.009*"organ" + 0.007*"gener" + 0.007*"intern" + 0.007*"world"
Topic 1: 0.018*"polit" + 0.017*"scienc" + 0.017*"studi" + 0.012*"histori" + 0.011*"natur" + 0.010*"python" + 0.009*"the" + 0.008*"geographi" + 0.007*"disciplin" + 0.007*"use"
Topic 2: 0.028*"mathemat" + 0.015*"natur" + 0.014*"knowledg" + 0.012*"war" + 0.010*"studi" + 0.010*"languag" + 0.007*"understand" + 0.007*"process" + 0.007*"practic" + 0.007*"gener"


In [10]:
# Test a summary of a page is a ~related field to the training data. 
test_summary_page =  wikipedia.WikipediaPage(title='Cluster analysis').summary
# Reference to below 3 lines: https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24
bow_vector = dictionary.doc2bow(summary_preprocessing(test_summary_page).split())
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6019821763038635	 Topic: 0.028*"mathemat" + 0.015*"natur" + 0.014*"knowledg" + 0.012*"war" + 0.010*"studi"
Score: 0.39065396785736084	 Topic: 0.018*"polit" + 0.017*"scienc" + 0.017*"studi" + 0.012*"histori" + 0.011*"natur"


In [None]:
'''
In conclusion, the topic of 'Cluster analysis' is highly related to the suggested topic, classified by words associated with "mathematics".
'''