LDA (LatentDirichletAllocation): [link](https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0)

In [1]:
import pandas as pd
import re

from sklearn.datasets import fetch_20newsgroups

In [2]:
seed = 94487

In [12]:
df = pd.read_csv('./data/water_problem_nlp_en_for_Kaggle_100.csv',sep=';')
df.sample(5, random_state=seed)

Unnamed: 0,text,env_problems,pollution,treatment,climate,biomonitoring
47,Main part (86%) of phosphate ions in wastewate...,1,1.0,,,
36,The order of values of its organic forms estab...,1,1.0,,,1.0
62,Southern Bug basin is located in the right-ban...,0,,,,
90,Ladyzhyn water storage reservoir due to its bi...,1,1.0,,,1.0
65,At Podilska and Dnieper Elevations relief is f...,0,,,,


In [28]:
# train = fetch_20newsgroups(subset='train')
# test = fetch_20newsgroups(subset='test')

In [13]:
# df = pd.DataFrame({'text':train.data})
# df.sample(5, random_state=seed)

In [14]:
# Remove punctuation
df['text_processed'] = \
df['text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
df['text_processed'] = \
df['text_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
df.sample(5, random_state=seed)

Unnamed: 0,text,env_problems,pollution,treatment,climate,biomonitoring,text_processed
47,Main part (86%) of phosphate ions in wastewate...,1,1.0,,,,main part (86%) of phosphate ions in wastewate...
36,The order of values of its organic forms estab...,1,1.0,,,1.0,the order of values of its organic forms estab...
62,Southern Bug basin is located in the right-ban...,0,,,,,southern bug basin is located in the right-ban...
90,Ladyzhyn water storage reservoir due to its bi...,1,1.0,,,1.0,ladyzhyn water storage reservoir due to its bi...
65,At Podilska and Dnieper Elevations relief is f...,0,,,,,at podilska and dnieper elevations relief is f...


In [15]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
data = df.text_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['southern', 'bug', 'basin', 'wastewater', 'discharged', 'organized', 'sources', 'map', 'reference', 'year', 'mln', 'discharged', 'total']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carlosmorote/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1)]


In [23]:
from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# Print the Keyword in the 5 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.019*"oc" + 0.016*"observed" + 0.015*"temperature" + 0.012*"river" + '
  '0.012*"vinnitsya" + 0.012*"khmelnitsky" + 0.012*"wwtps" + '
  '0.012*"downstream" + 0.012*"volumes" + 0.012*"wastewater"'),
 (1,
  '0.024*"basin" + 0.021*"water" + 0.021*"bodies" + 0.014*"southern" + '
  '0.014*"bug" + 0.014*"organic" + 0.014*"phosphorus" + 0.014*"within" + '
  '0.014*"discharged" + 0.014*"kg"'),
 (2,
  '0.022*"nitrogen" + 0.021*"part" + 0.015*"waters" + 0.012*"also" + '
  '0.012*"lower" + 0.012*"basin" + 0.011*"surface" + 0.011*"average" + '
  '0.009*"vinnistya" + 0.009*"inorganic"'),
 (3,
  '0.042*"nitrogen" + 0.024*"part" + 0.019*"basin" + 0.014*"main" + '
  '0.014*"tons" + 0.012*"forms" + 0.010*"compounds" + 0.010*"wastewater" + '
  '0.010*"khmelnitsky" + 0.010*"kirovograd"'),
 (4,
  '0.020*"wastewater" + 0.014*"days" + 0.014*"values" + 0.014*"temperature" + '
  '0.014*"settlements" + 0.011*"water" + 0.010*"tons" + 0.010*"oc" + '
  '0.010*"many" + 0.010*"treatment"'),
 (5,
  '0.045*"

In [24]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))

# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './results/ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

  default_term_info = default_term_info.sort_values(


In [34]:
from gensim.test.utils import common_corpus, common_dictionary

# TODO: No usar el common_dictionary. Hay que extrapolar o generalizar exactamente el mismo que para los modelos. Tal vez con eso lo que devuelva tenga algo de sentido también.

interest_themes = [['human']]
interest_themes_corpus = [common_dictionary.doc2bow(text) for text in interest_themes]
sorted(lda_model[interest_themes_corpus[0]], key=lambda x: x[1], reverse=True)

[(8, 0.5499279),
 (6, 0.050020903),
 (5, 0.05001715),
 (1, 0.050012782),
 (3, 0.05000857),
 (7, 0.05000403),
 (2, 0.050003275),
 (4, 0.05000265),
 (9, 0.050002478),
 (0, 0.050000336)]