# Session setup

In [25]:
# basic operations
import os
import re
import logging
import time
import pickle

# utilities
from pprint import pprint as pp

# data analysis/management/manipulation
import numpy as np
import pandas as pd
import sklearn

# text analysis
import gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel, ldamodel
from gensim.models import LsiModel, lsimodel
from gensim.models import Phrases
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary
from gensim.similarities import MatrixSimilarity
'''
Uncomment the next two lines of code if you don't have a copy of the
Mallet software installed in your machine
'''
# ! wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip # * updated if needed *
# ! unzip mallet-2.0.8.zip
MALLET_PATH = '/Users/omoi/Documents/SMM694-NLP/mallet-2.0.8/bin/mallet'
import spacy
import en_core_web_lg

# multivariate analysis
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from sklearn.decomposition import PCA

# distance metrics
from scipy.spatial.distance import cosine

# visualization
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pyLDAvis
import pyLDAvis.gensim # directly import the module `gensim'!

# Topic Modelling

## Model building

### Text transformation

In [26]:
# read the dictionary and text corpus
PATH = os.getcwd()
FILE = 'corpus'

# dictionary
in_f = os.path.join(PATH, FILE, 'pr_dictionary.dict')
DICT = Dictionary.load(in_f)

# corpus
in_f = os.path.join(PATH, FILE, 'pr_corpus.mm')
CORPUS = MmCorpus(in_f)

# docs_phrased
in_f = os.path.join(PATH, FILE, 'pr_docs_phrased.pickle')
with open(in_f, 'rb') as pipe:
    docs_phrased = pickle.load(pipe)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/omoi/Documents/SMM694-NLP/corpus/pr_docs_phrased.pickle'

In [19]:
FILE = 'brexit'

# company list
in_f = os.path.join(PATH, FILE, 'companies.csv')
companies = pd.read_csv(in_f)
companies.describe()

Unnamed: 0,country,name
count,182,182
unique,3,182
top,united kingdom,groupe casino
freq,88,1


### Mallet's implementation of LDA algorithm (using Gibbs sampling)

In [20]:
'''
Based on Gardner et al's LQ paper, we expect to see
29 distinctive topics circa
'''
N_TOPICS = 182
LDA_MALLET = gensim.models.wrappers.LdaMallet(MALLET_PATH,
                                              corpus=CORPUS,
                                              num_topics=N_TOPICS,
                                              id2word=DICT,
                                              random_seed=123)
LDA_MALLET.print_topics(num_topics=N_TOPICS, num_words=5)

[(0,
  '0.048*"sea" + 0.033*"island" + 0.031*"coast" + 0.026*"art" + 0.025*"part"'),
 (1,
  '0.034*"july" + 0.024*"carney" + 0.018*"bank_of_england" + 0.018*"good" + 0.016*"george_osborne"'),
 (2,
  '0.087*"reef" + 0.057*"australia" + 0.044*"-PRON-" + 0.034*"barrier" + 0.034*"great"'),
 (3, '0.229*"xc_x" + 0.116*"year" + 0.062*"pay" + 0.031*"uk" + 0.028*"\'s"'),
 (4,
  '0.080*"school" + 0.043*"family" + 0.034*"parent" + 0.033*"trust" + 0.027*"academy"'),
 (5,
  '0.143*"-PRON-" + 0.046*"real" + 0.034*"game" + 0.027*"estate" + 0.027*"feature"'),
 (6,
  '0.028*"continue" + 0.028*"plan" + 0.027*"team" + 0.027*"give" + 0.025*"itv"'),
 (7,
  '0.127*"charity" + 0.047*"good" + 0.037*"people" + 0.033*"surplus" + 0.030*"raise"'),
 (8,
  '0.071*"school" + 0.048*"student" + 0.045*"learn" + 0.027*"environment" + 0.021*"find"'),
 (9,
  '0.051*"valley" + 0.039*"silicon" + 0.033*"start" + 0.033*"tech" + 0.019*"man"'),
 (10,
  '0.049*"year" + 0.049*"low" + 0.045*"bond" + 0.043*"yield" + 0.034*"market"'

## Model evaluation

### Get range of coherence values as the number of retained topics change

In [21]:
# define function
def compute_coherence_values(dictionary, corpus, texts, limit, start, step):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    -----------
    dictionary : Gensim dictionary
    corpus     : Gensim corpus
    texts      : List of input texts
    limit      : Max number of topics

    Returns:
    --------
    model_list       : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model
                       with respective number of topics
    """
    coherence_values = []
    model_list = []
    mallet_path = MALLET_PATH
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=corpus,
                                                 num_topics=num_topics,
                                                 id2word=dictionary,
                                                 random_seed=123)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model,
                                        texts=texts,
                                        dictionary=dictionary,
                                        coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [22]:
# collect coherence scores as the number of retained topics change
LIMIT, START, STEP = 30, 10, 1
MODEL_LIST, COHER_VALS = compute_coherence_values(dictionary=DICT,
                                                  corpus=CORPUS,
                                                  texts=DOCS_PHRASED,
                                                  start=START,
                                                  limit=LIMIT,
                                                  step=STEP)

NameError: name 'DOCS_PHRASED' is not defined