In [None]:
import json
import re
import gensim
import spacy
import tqdm
import logging

# Visualize LDA
import pyLDAvis.gensim
import pickle 
import pyLDAvis

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import gensim.corpora as corpora
import multiprocessing as mp

from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from pprint import pprint

In [None]:
# Emit INFO-level log records as "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format="%(levelname)s - %(asctime)s: %(message)s",
)
# CPU count reported by multiprocessing; used below to size the LDA worker pool.
cores = mp.cpu_count()

In [None]:
# `depth` selects which dump file is read (it is interpolated into the path).
depth = 12
df = pd.read_json(f"data/wikipedia_depth_{depth}.json")
# Keep the original 'Category' values under 'Path' and expose only the last
# element of each one as the new 'Category'.
df = df.rename(columns={'Category': 'Path'})
df['Category'] = df['Path'].map(lambda path: path[-1])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

# Data cleaning and preparation

In [None]:
def flatten(l):
    """Concatenate an iterable of iterables into a single flat list."""
    return [item for sublist in l for item in sublist]


# One row per article title; every other column becomes the list of values
# collected from the rows that share that title.
df_unique = df.groupby('Title').agg(list).reset_index()
# Merge the per-row link lists and de-duplicate them (np.unique also sorts).
df_unique['Links'] = df_unique['Links'].apply(flatten).apply(np.unique)
# Keep the first text seen for each title. Going through set() made the pick
# depend on string hash ordering, i.e. it could differ between kernel runs
# whenever the duplicate rows carried different texts.
df_unique['Text'] = df_unique['Text'].apply(lambda texts: texts[0])

In [None]:
# Preview the de-duplicated (one row per title) frame.
df_unique.head()

In [None]:
# Report how many rows remain after grouping by title.
print("Number of unique articles:", len(df_unique))

In [None]:
# Category names to keep, lower-cased so they can be compared
# case-insensitively against the lower-cased 'Category' column.
subset = list(map(str.lower, [
    'Methane',
    'Extinction Rebellion',
    'Fuel taxes',
    'Hydraulic fracturing',
    'Exxonmobil',
    'Gazprom',
    'Self-sustainability',
    'Industrial ecology',
    'Ecovillages',
    'Eco-towns',
    'Wildlife smuggling',
    'Urban forestry',
    'Biofuels',
    'Sustainable gardening',
    'Animal waste products',
    'Oil platform disasters',
    'Coal phase-out',
    'Climate change denial',
    'Building energy rating',
    'Active fire protection',
    'Industrial minerals',
    'Composting',
    'Reforestation',
]))

In [None]:
# Keep only the rows whose category (case-insensitively) is in `subset`.
# reset_index(drop=True) renumbers the rows without first materialising the
# old index as a column, so a genuine 'index' column could never be dropped
# by accident.
df_subset = df[df['Category'].map(lambda x: x.lower()).isin(subset)].reset_index(drop=True)
df_subset.shape

In [None]:
# Preview the rows selected for the chosen categories.
df_subset.head()

In [None]:
# Turn every line break into a sentence boundary ('. ') so the text can later
# be split into sentences on that separator.
df_subset['Text'] = df_subset['Text'].map(lambda text: text.replace('\n', '. '))

In [None]:
# Preview the text column after newline replacement.
df_subset['Text'].head()

### Remove punctuation and lowercase

In [None]:
# Remove punctuation
# Strip basic punctuation, then lowercase. The two steps are chained on the
# same series: previously the lowercase step started again from the raw
# 'Text' column and silently discarded the punctuation removal.
df_subset['Text_processed'] = (
    df_subset['Text']
    .map(lambda text: re.sub(r'[,.!?]', '', text))
    .map(lambda text: text.lower())
)
df_subset['Text_processed'].head()

### Tokenize words and further clean-up text

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each item of `sentences`.

    Every item is converted with str() and passed through
    gensim.utils.simple_preprocess with deacc=True; one token list is
    yielded per input item.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# Pull the cleaned texts out of the frame and tokenize all of them.
data = df_subset['Text_processed'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]
# Show the tokens of the first document as a sanity check.
print(data_words[:1])

### Phrase Modeling: Bi-grams and Tri-grams

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=50)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

### Remove Stopwords, Make Bigrams and Lemmatize

In [None]:
# NLTK's English stop words plus a few extra tokens to discard.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

def remove_stopwords(texts):
    """Return `texts` with every stop word removed.

    Each document is converted with str() and re-tokenized by
    simple_preprocess before filtering, so `texts` may hold raw strings or
    token lists. The result is one list of kept tokens per document.
    """
    # Snapshot the module-level list into a set once per call so membership
    # tests are O(1) instead of scanning the list for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the frozen bigram model to every document in `texts`."""
    with_bigrams = []
    for tokens in texts:
        with_bigrams.append(bigram_mod[tokens])
    return with_bigrams

def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every document."""
    with_trigrams = []
    for tokens in texts:
        bigrammed = bigram_mod[tokens]
        with_trigrams.append(trigram_mod[bigrammed])
    return with_trigrams

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents with the module-level spaCy pipeline `nlp`.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each is joined with spaces before parsing.
    allowed_postags : collection of str, optional
        Coarse POS tags (token.pos_) to keep. An empty collection or None
        keeps every token.

    Returns
    -------
    list of list of str
        The lemmas kept for each input document, in input order.
    """
    # Immutable default plus set membership: no shared mutable default, and
    # O(1) tag checks. None is treated the same as an empty collection.
    keep_tags = set(allowed_postags) if allowed_postags else None

    texts_out = []
    # nlp.pipe processes the documents as a stream instead of paying the
    # per-call overhead of nlp() once per document.
    for doc in nlp.pipe(" ".join(sent) for sent in texts):
        if keep_tags is None:
            texts_out.append([token.lemma_ for token in doc])
        else:
            texts_out.append([token.lemma_ for token in doc if token.pos_ in keep_tags])
    return texts_out

In [None]:
# spaCy pipeline used by lemmatization(); the parser and NER components are
# disabled when loading.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Stop-word removal -> bigram merging -> lemmatization restricted to
# nouns, adjectives, verbs and adverbs.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)
print(data_lemmatized[:1])

### Data transformation: Corpus and Dictionary

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[:1])

# LDA

## Baseline

In [None]:
# Baseline LDA with 50 topics. One core is left free for the main process,
# but the worker count is clamped to at least 1 so a single-core machine
# does not request a pool of zero workers.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=50,
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       workers=max(1, cores - 1),
                                       per_word_topics=True)

In [None]:
# Pretty-print the topics returned by the baseline model.
pprint(lda_model.print_topics())
# Apply the model to the corpus and keep the result for later use.
doc_lda = lda_model[corpus]

In [None]:
# c_v coherence of the current model, computed against the lemmatized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

## Hyperparameter tuning

In [None]:
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Train an LDA model for one hyperparameter setting and return its c_v coherence.

    Parameters
    ----------
    corpus : bag-of-words corpus to train on.
    dictionary : id-to-word mapping; used for training AND for scoring.
    k : number of topics.
    a : value passed as `alpha` to the model.
    b : value passed as `eta` to the model.
    texts : tokenized documents used for the coherence estimate. Defaults to
        the module-level `data_lemmatized` when omitted.

    Returns
    -------
    The value returned by CoherenceModel.get_coherence().
    """
    if texts is None:
        texts = data_lemmatized

    model = gensim.models.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=k,
                                   random_state=100,
                                   chunksize=100,
                                   passes=10,
                                   alpha=a,
                                   eta=b,
                                   per_word_topics=True)

    # Score with the dictionary that was passed in; the previous version
    # ignored the parameter and always read the global `id2word`.
    coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

    return coherence_model.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}

# Topic counts to try: min_topics, min_topics + step_size, ... (max_topics excluded).
min_topics = 20
max_topics = 100
step_size = 10
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter: a few numeric values plus gensim's named priors.
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter (passed to the model as `eta`).
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets: only the full corpus is evaluated.
num_of_docs = len(corpus)
corpus_sets = [corpus]
corpus_title = ['Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Size the progress bar from the grid itself rather than a hard-coded total,
# so it stays correct whenever any of the ranges above changes.
total_runs = len(corpus_sets) * len(topics_range) * len(alpha) * len(beta)

# The context manager closes the bar even if a training run raises.
with tqdm.tqdm(total=total_runs) as pbar:
    # iterate through validation corpora
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterate through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word,
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)

                    pbar.update(1)

# Persist the full grid so the analysis cells can reload it without retraining.
pd.DataFrame(model_results).to_csv('data/lda_tuning_results.csv', index=False)

In [None]:
# Reload the grid-search results written by the tuning cell.
df_tuning = pd.read_csv('data/lda_tuning_results.csv')

In [None]:
# Row with the highest coherence across the whole grid.
df_tuning.iloc[df_tuning['Coherence'].argmax()]

In [None]:
# Restrict to the 60-topic runs and show the row(s) reaching their best coherence.
test = df_tuning.loc[df_tuning['Topics'] == 60]
test[test['Coherence'].eq(test['Coherence'].max())]

## Final model

In [None]:
# Final model with the selected hyperparameters.
# NOTE(review): num_topics=100 is outside the range(20, 100, 10) grid that was
# tuned above — confirm this value is intended.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=100,
    random_state=100,
    chunksize=100,
    passes=10,
    alpha='symmetric',
    eta=0.91,
    per_word_topics=True,
)

In [None]:
# Pretty-print the topics returned by the final model.
pprint(lda_model.print_topics())
# Apply the final model to the corpus; the result is stored on df_subset below.
doc_lda = lda_model[corpus]

In [None]:
# c_v coherence of the final model, computed against the lemmatized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

## Visualize LDA

In [None]:
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()

# Build the interactive visualisation from the model, corpus and dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
# Store the model output for each document next to its row.
# NOTE(review): doc_lda is whatever `lda_model[corpus]` returned; presumably a
# lazy gensim wrapper that pandas materialises on assignment — verify, and
# consider wrapping it in list() to make that explicit.
df_subset['LDA'] = doc_lda

In [None]:
# Preview the frame with the new 'LDA' column.
df_subset.head()

# Sklearn

In [None]:
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

def plot_10_most_common_words(count_data, count_vectorizer):
    """Draw a bar chart of the ten most frequent terms.

    Parameters
    ----------
    count_data : sparse document-term matrix returned by the vectorizer.
    count_vectorizer : the fitted vectorizer that produced `count_data`.
    """
    # Recent scikit-learn only provides get_feature_names_out(); older
    # releases only provide get_feature_names(). Support both.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    # Column sums give the total count per term without densifying the
    # matrix one row at a time.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel()

    top_terms = sorted(zip(words, total_counts), key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in top_terms]
    counts = [w[1] for w in top_terms]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # Keyword arguments: recent seaborn no longer accepts x/y positionally.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# CountVectorizer expects one string per document, so join the lemmas that
# the preprocessing cells already produced. The previous code re-lemmatized
# the ENTIRE corpus once per row (the lambda ignored its argument), handed
# the vectorizer nested lists it cannot consume, and overwrote the
# `data_lemmatized` that the gensim cells depend on.
lemmatized_docs = [" ".join(tokens) for tokens in data_lemmatized]

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 3))
# Fit and transform the lemmatized documents
count_data = count_vectorizer.fit_transform(lemmatized_docs)
# Visualise the 10 most common words
#plot_10_most_common_words(count_data, count_vectorizer)

In [None]:
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the `n_top_words` highest-weighted terms of every topic.

    Parameters
    ----------
    model : fitted topic model exposing `components_` (one weight row per topic).
    count_vectorizer : fitted vectorizer that supplies the vocabulary.
    n_top_words : number of terms to print per topic.
    """
    # Recent scikit-learn only provides get_feature_names_out(); older
    # releases only provide get_feature_names(). Support both.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = get_names()

    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending; reverse it and keep the first n indices.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print(" ".join([words[i] for i in top_indices]))
        
# Tweak the two parameters below
number_topics = 30
number_words = 10

# Create and fit the LDA model. The seed is fixed (same value as the gensim
# models above) so the topics are reproducible across kernel restarts.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=100)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
from pyLDAvis import sklearn as sklearn_lda

# Get cause-effect relations

In [None]:
# Connectives that signal a cause-effect statement (stored lower-cased).
causal_terms = list(map(str.lower, [
    'consequently',
    'as a result',
    'therefore',
    'as a consequence',
    'for this reason',
    'for all these reasons',
    'thus',
    'because',
    'since',
    'because of',
    'on account of',
    'due to',
    'for the reason',
    'so, that',
]))
# Verbs treated as causal; compared against token lemmas (stored lower-cased).
causal_verbs = list(map(str.lower, [
    'Permit', 'Make', 'Cause', 'Generate',
    'Allow', 'Wash', 'Affect', 'Cut',
    'Change', 'Encourage', 'Develop', 'Force',
    'Let', 'Convert', 'Derive', 'Arouse',
    'Criticize', 'Alter', 'Create', 'Extend',
]))

In [None]:
def causal_relation(text):
    """Return the sentences of `text` that contain a causal marker.

    `text` is split on ". ". A sentence is kept when it contains one of the
    module-level `causal_terms` (matched case-insensitively) or a token that
    spaCy tags as a VERB whose lemma is in `causal_verbs`.

    Returns
    -------
    list of str
        The matching sentences, in their original order and casing.
    """
    # Multi-word terms ("as a result", "due to", ...) can never equal a single
    # spaCy token, so they are matched as whole phrases on the lower-cased
    # sentence; single-word terms are compared token by token.
    phrase_patterns = [re.compile(r'\b' + re.escape(term) + r'\b')
                       for term in causal_terms if ' ' in term]
    single_terms = {term for term in causal_terms if ' ' not in term}
    verbs = set(causal_verbs)

    result = []
    for sent in text.split(". "):
        lowered = sent.lower()
        if any(pattern.search(lowered) for pattern in phrase_patterns):
            result.append(sent)
            continue
        doc = nlp(sent)
        # Lower-case the token text so sentence-initial "Therefore"/"Because"
        # match the lower-cased term list.
        if any(token.text.lower() in single_terms
               or (token.pos_ == 'VERB' and token.lemma_.lower() in verbs)
               for token in doc):
            result.append(sent)
    return result

In [None]:
# Extract the causal sentences of every article text.
df_subset['Causal_relations'] = df_subset['Text'].apply(causal_relation)

In [None]:
# Spot-check the extracted sentences for the row at position 8.
df_subset['Causal_relations'].iloc[8]