In [59]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# To make the notebook look more clean
import warnings
warnings.filterwarnings("ignore")

In [18]:
# Importing Dataset
with open(r"C:\Users\Bara Elshaer\Documents\Python\Data Science\Capstone Project\Main Capstone Project Two\USAID-Mexico-CDCS_updated2020.txt") as m:
    mexico = (m.read())

In [85]:
mexico[:70]

' \n\n \n\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\nCentral America and Mexico (CAM) \n\nC'

### Clean up the data 

As you can see there are alot of ne line characters and weird spaces. I do my best to clean that up in the next code of block. The fist thing I do is to convert the data into a python list. 

In [60]:
# Convert to list 
mexico_corpus_list = mexico.split("\n")

# As you can see there is alot of newline and extra spaces that is quite distracting. I get rid of it using regular expressions

# Remove new line characters
mexico_clean_corpus = [re.sub('\s+', ' ', sent) for sent in mexico_corpus_list]

# Remove distracting single quotes
mexico_clean_corpus = [re.sub("\'", "", sent) for sent in mexico_corpus_list]

pprint(mexico_clean_corpus[:100])



[' ',
 '',
 ' ',
 '',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 'Central America and Mexico (CAM) ',
 '',
 'C:\\Users\\sthomasarrigo\\Downloads\\Lockup_CA_RGB_HIGH.bmp',
 'U:\\El Sal\\RDCS\\Phase III\\unnamed.png',
 'Regional Development Cooperation Strategy ',
 '',
 '2015-2019 ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 ' ',
 '',
 '\x0c',
 'Table of Contents ',
 '',
 ' ',
 'EXECUTIVE SUMMARY '
 '................................................................................................................................. '
 '3 ',
 'DEVELOPMENT CONTEXT, CHALLENGES, AND OPPORTUNITIES '
 '................................................................ 5 ',
 'Alignment with U.S. Foreign Policy Considerations, Regional Policies and '
 'Priorities ........................... 9 ',
 'Critical Assumptions and Risks '
 '...........................................................................................

### Tokenize words and Clean-up text
I tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.

Gensim’s simple_preprocess() is great for this. Additionally I have set deacc=True to remove the punctuations.

In [37]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

tokenized_clean_corpus = list(sent_to_words(mexico_clean_corpus))

# To remove the weird brakets 
for element in tokenized_clean_corpus:
    if element == []:
        tokenized_clean_corpus.remove(element)

Creating bigram and trigram models:

In [91]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(tokenized_clean_corpus, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[tokenized_clean_corpus], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Sanity check
print(trigram_mod[bigram_mod[tokenized_clean_corpus[115]]])



['cost', 'effective', 'implementation', 'mechanisms', 'available', 'for', 'buy', 'in', 'importantly', 'usaid', 'will', 'leverage', 'its']


The bigrams model is ready. Let’s define the functions to remove the stopwords, make bigrams and lemmatization and call them sequentially.

In [61]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])


# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [51]:
# Remove Stop Words
data_words_nostops = remove_stopwords(tokenized_clean_corpus)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:20])

[['cam'], ['download'], ['unnamed', 'png'], ['strategy'], ['table', 'content'], ['executive', 'summary'], ['challenge', 'opportunity'], ['alignment', 'foreign', 'policy', 'consideration', 'regional', 'policy', 'priority'], ['critical', 'assumption', 'risk'], ['regional', 'criterion'], ['development', 'objective'], ['development'], ['regional', 'economic', 'integration', 'increase'], ['regional', 'climate', 'economic_growth', 'enhance'], ['regional', 'citizen_security', 'improve'], ['contain'], ['analysis', 'evaluation', 'evidence', 'inform', 'strategy', 'process'], ['forward'], ['local', 'solution'], ['science', 'technology', 'innovation', 'partnership']]


### Create the Dictionary and BoW needed for Topic Modeling

The two main inputs to the LDA topic model are the dictionary(id2word) and the term document frequency (aka bag-of-words). Let’s create them.

In [55]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency (BoW)
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[0:20])

[[(0, 1)], [(1, 1)], [(2, 1), (3, 1)], [(4, 1)], [(5, 1), (6, 1)], [(7, 1), (8, 1)], [(9, 1), (10, 1)], [(11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1)], [(17, 1), (18, 1), (19, 1)], [(16, 1), (20, 1)], [(21, 1), (22, 1)], [(21, 1)], [(16, 1), (23, 1), (24, 1), (25, 1)], [(16, 1), (26, 1), (27, 1), (28, 1)], [(16, 1), (29, 1), (30, 1)], [(31, 1)], [(4, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)], [(37, 1)], [(38, 1), (39, 1)], [(40, 1), (41, 1), (42, 1), (43, 1)]]


In [57]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           passes=10)

In [62]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis