In [1]:
from gensim.models.word2vec import Word2Vec
from gensim.models import LdaModel, LdaMulticore
from gensim import corpora
from gensim.models import CoherenceModel

import gensim.downloader as api
import gensim

import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re

import pandas as pd
stop_words = stopwords.words('english')
stop_words = stop_words + ['com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could']
punctuations="?:!.,;"

In [2]:
dataset = api.load('text8')
data = [d for d in dataset]

print("dataset contains n {} documents".format(len(data)))


dataset contains n 1701 documents


In [4]:
wordnet_lemmatizer = WordNetLemmatizer()

def tokenize(doc): 
    return word_tokenize(doc)

def stemSentence(token_words):
    stem_sentence=[]
    for word in token_words:
        if word not in stop_words and word not in punctuations: 
            stem_sentence.append(wordnet_lemmatizer.lemmatize(word))
        else: 
            continue
    return stem_sentence

def preprocessing(data): 
    data_processed = []

    for i, doc in enumerate(data):
        data_processed.append(stemSentence(doc))
    return data_processed

In [5]:
processed_data = preprocessing(data)

In [6]:
print(processed_data[0][:100]) 
print("processed dataset contains n {} documents".format(len(processed_data)))

['anarchism', 'originated', 'term', 'abuse', 'first', 'used', 'early', 'working', 'class', 'radical', 'including', 'digger', 'english', 'revolution', 'sans', 'culotte', 'french', 'revolution', 'whilst', 'term', 'still', 'used', 'pejorative', 'way', 'describe', 'act', 'used', 'violent', 'mean', 'destroy', 'society', 'also', 'taken', 'positive', 'label', 'self', 'defined', 'anarchist', 'word', 'anarchism', 'derived', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'political', 'philosophy', 'belief', 'ruler', 'unnecessary', 'abolished', 'although', 'differing', 'interpretation', 'mean', 'anarchism', 'also', 'refers', 'related', 'social', 'movement', 'advocate', 'elimination', 'authoritarian', 'institution', 'particularly', 'state', 'word', 'anarchy', 'anarchist', 'use', 'imply', 'chaos', 'nihilism', 'anomie', 'rather', 'harmonious', 'anti', 'authoritarian', 'society', 'place', 'regarded', 'authoritarian', 'political', 'structure', 'coercive', 'economic', 'institutio

In [7]:
# Create Dictionary
id2word = corpora.Dictionary(processed_data)

# Create Corpus
texts = processed_data

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 4), (2, 7), (3, 1), (4, 1), (5, 1), (6, 2), (7, 2), (8, 1), (9, 1), (10, 1), (11, 2), (12, 1), (13, 2), (14, 2), (15, 2), (16, 1), (17, 1), (18, 1), (19, 3), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 2), (26, 6), (27, 2), (28, 12), (29, 4), (30, 1), (31, 1), (32, 7), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 4), (40, 2), (41, 9), (42, 3), (43, 1), (44, 2), (45, 1), (46, 1), (47, 1), (48, 1), (49, 7), (50, 2), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 5), (58, 3), (59, 2), (60, 2), (61, 1), (62, 1), (63, 1), (64, 1), (65, 3), (66, 2), (67, 1), (68, 2), (69, 2), (70, 2), (71, 2), (72, 3), (73, 1), (74, 32), (75, 1), (76, 1), (77, 2), (78, 13), (79, 1), (80, 2), (81, 1), (82, 6), (83, 1), (84, 1), (85, 3), (86, 1), (87, 2), (88, 4), (89, 9), (90, 102), (91, 145), (92, 1), (93, 3), (94, 33), (95, 1), (96, 16), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 2), (103, 1), (104, 1), (105, 1), (106, 1), (107, 6), (108, 2), (109, 1),

In [12]:
# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = './mallet-2.0.8/bin/mallet' # update this path
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=32, id2word=id2word)

In [14]:
# Show Topics
print(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=processed_data, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(5, [('greek', 0.01096525176218046), ('bc', 0.01025315541244827), ('roman', 0.007425410487424791), ('death', 0.005841254115194485), ('ancient', 0.005748371982620721), ('century', 0.005335562504515104), ('work', 0.005268480964322941), ('god', 0.005129157765462295), ('rome', 0.005103357173080694), ('alexander', 0.004489303074398588)]), (29, [('film', 0.019323195835051293), ('story', 0.0077723555221878285), ('series', 0.007669790788456134), ('character', 0.007304660336371301), ('movie', 0.006488245055867011), ('book', 0.005218493652268629), ('show', 0.005144647043981809), ('time', 0.004882081325628671), ('comic', 0.004750798466452101), ('television', 0.0038831008190819637)]), (17, [('india', 0.022188514180894597), ('indian', 0.012134894877945534), ('afghanistan', 0.007099266262170171), ('state', 0.006914068011852688), ('hindu', 0.006446662903908565), ('east', 0.006349654296599407), ('british', 0.006040990546070269), ('people', 0.006032171581769437), ('asia', 0.0058469733314519545), ('pak

In [26]:
import random 
import string 

def format_topics_sentences(ldamodel, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                highest_score_topic =  sorted(wp,key=lambda x: (x[0]), reverse=True)[0][0]
                topic_keywords = ", ".join([word for word, prop in wp])
                random_id =''.join([random.choice(string.ascii_letters + string.digits) for n in range(32)]) 
  
                sent_topics_df = sent_topics_df.append(pd.Series([random_id, int(topic_num),highest_score_topic, round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ["Document_ID",'Dominant_Topic', 'Highest_Score_Topic','Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No',"Document_ID",'Dominant_Topic','Highest_Score_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Document_ID,Dominant_Topic,Highest_Score_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,INsJ77pas5StclLP6ruSWC4IGQpDS0ix,20.0,union,0.4424,"party, government, political, economic, libera...","[anarchism, originated, as, a, term, of, abuse..."
1,1,sXr6VPR1ySrJeF6h6qyVmlMl9TTsbu2j,27.0,treatment,0.299,"drug, medical, patient, treatment, effect, bra...","[reciprocity, qualitative, impairments, in, co..."
2,2,ZpvrAZTmAh6ZPmTRlBqKWRcNcLVejyan,28.0,year,0.5952,"war, state, president, year, american, united,...","[with, the, aegis, of, zeus, when, he, goes, t..."
3,3,VGlvjd4DFMGdyU9Ilq16Kx0KeATqTNk2,6.0,world,0.441,"work, science, life, theory, world, human, boo...","[despite, his, injury, booth, managed, to, lim..."
4,4,YLtwR3UlcCGWrB2UZPs9oFuH5LzrXrVm,6.0,world,0.5492,"work, science, life, theory, world, human, boo...","[present, best, sound, editing, one, nine, six..."
5,5,4mFtk2FRD9XZOTLrwrNR7iYlUXvaBkQJ,30.0,water,0.1937,"specie, animal, human, horse, plant, bird, tre...","[murray, rothbard, written, in, one, nine, sev..."
6,6,NKZ95b851jwCwb0Rw8Oy6mMfuv9ubEwq,6.0,world,0.3216,"work, science, life, theory, world, human, boo...","[believes, man, is, nothing, but, a, collectio..."
7,7,uVGchcAiLG4X6r2xSk9cN1EldktHaCBA,6.0,world,0.6253,"work, science, life, theory, world, human, boo...","[that, focused, on, analyzing, how, societies,..."
8,8,AQRmBXHHFwrCxkkiYTIJdAiUr5m4N2Ju,6.0,world,0.3564,"work, science, life, theory, world, human, boo...","[century, when, physicists, were, able, to, co..."
9,9,huTKwfvMZjmVB1R0KGwILQyvGKUYzHXd,9.0,year,0.3232,"island, country, year, government, est, popula...","[mountainous, country, due, to, its, location,..."


In [27]:
#Save as csv 
df_dominant_topic.to_csv(r'./dominant_topics.csv')

In [28]:
# Group top 5 sentences under each topic
def most_repsentative_document(df_topic_sents_keywords): 
    sent_topics_sorteddf_mallet = pd.DataFrame()

    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                                 grp.sort_values(['Perc_Contribution'], ascending=[0]).head(5)], 
                                                axis=0)

    # Reset Index    
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

    # Format
    sent_topics_sorteddf_mallet.columns = ["Document_ID",'Topic_Num', "Highest_Score_Topic","Topic_Perc_Contrib", "Keywords", "Text"]
    return sent_topics_sorteddf_mallet


# Show
sent_topics_sorteddf_mallet = most_repsentative_document(df_topic_sents_keywords)
sent_topics_sorteddf_mallet

Unnamed: 0,Document_ID,Topic_Num,Highest_Score_Topic,Topic_Perc_Contrib,Keywords,Text
0,hb6dM8173j0wwBuG7T8F29D04eQ5amQD,0.0,year,0.8659,"team, league, season, game, football, year, pl...","[nfl, took, that, action, as, a, consequence, ..."
1,58txyoKRvLp68bejpiG66Woxcjn9gnyb,0.0,year,0.8124,"team, league, season, game, football, year, pl...","[have, been, celtic, such, as, the, ambrones, ..."
2,2ZQaVlPpNrcV3gypALnsF8WdejObqjvw,0.0,year,0.8072,"team, league, season, game, football, year, pl...","[games, six, and, seven, to, win, the, america..."
3,fDx2P04fwhRSkHCfpzPIrHRf99rEDN9D,0.0,year,0.7260,"team, league, season, game, football, year, pl...","[from, opening, day, until, august, three, one..."
4,NGvpNUz9B2vISsMKjS2ZAMTRbjdhflZD,0.0,year,0.7086,"team, league, season, game, football, year, pl...","[by, chad, hansen, on, line, tao, te, ching, b..."
...,...,...,...,...,...,...
155,u2X43mhF6opDG6nJXHkOKm4MNam6nSOy,31.0,people,0.7964,"jewish, jew, god, book, hebrew, king, judaism,...","[existing, like, solomon, jehoshaphat, sent, s..."
156,Uy8hil6p9ELYTieptAgjO63jwKVGvwHW,31.0,people,0.7709,"jewish, jew, god, book, hebrew, king, judaism,...","[eight, home, runs, to, break, hank, aaron, s,..."
157,aFE5IemWpZWzDScej5R0PEfpIMqcnTHa,31.0,people,0.6701,"jewish, jew, god, book, hebrew, king, judaism,...","[to, amnon, and, so, joab, hatches, a, plan, j..."
158,L7Pp139GFA8F7Hb14Q6O6wEDx9diUUsQ,31.0,people,0.6115,"jewish, jew, god, book, hebrew, king, judaism,...","[move, of, two, four, two, three, one, three, ..."


In [29]:
sent_topics_sorteddf_mallet.to_csv(r'./most_representative_documents.csv')

### Resouces
* https://github.com/RaRe-Technologies/gensim-data
* https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
* https://www.datacamp.com/community/tutorials/stemming-lemmatization-python?utm_source=adwords_ppc&utm_campaignid=10267161064&utm_adgroupid=102842301792&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=278443377095&utm_targetid=dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=9051476&gclid=EAIaIQobChMI9aaZkoap6gIVVImyCh24nQsqEAAYASAAEgKWg_D_BwE