In [1]:
from datetime import datetime
import numpy as np
import json
import glob

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
from nltk.corpus import stopwords

#vis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import regex as re

nlp = spacy.load("en_core_web_sm")

In [2]:
with open("constitution.txt","r", encoding="utf-8") as f:
    corpus = f.read()

corpus=re.sub(r"\n"," ",corpus)

# stops = stopwords.words("english")
print(len(corpus.split(' ')))

#word ~ 7505 words
# python split() ~ 7641 

7641


# <font color = 'limegreen'> General cleaning & Remove Stops</font>

In [3]:
def clean_stops(doc,stops):
    #regex scrubber:
    final=[]
    clean=re.sub(r"AC\/\d{1,4}\/\d{1,4}", "",doc)
    # clean=clean.translate(str.maketrans("","",string.punctuation))
    for token in clean.split():
        if token not in stops:
            final.append(token)

    final=" ".join(final)
    return final

def clean_docs(docs):
    stops = stopwords.words("english")
    final=[]
    for doc in docs:
        final.append(clean_stops(doc,stops))
    return final


In [4]:
clean = clean_docs(corpus.split())

In [5]:
len(clean_docs(corpus.split(".")))

154

In [6]:
clean_sents = clean_docs(corpus.split("."))

In [7]:
clean_joined = " ".join(clean)

In [8]:
clean1=list(filter(None,clean))
len(list(clean1))

4101

In [9]:
len(clean) - len(clean1)

3396

In [10]:
3447/len(clean)  #% of stopwords

0.4597839135654262

# <font color = 'limegreen'> Tokenize Sentences: [IMP]

In [11]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(clean_sents))
print(data_words[:2])

[['we', 'people', 'united', 'states', 'order', 'form', 'perfect', 'union', 'establish', 'justice', 'insure', 'domestic', 'tranquility', 'provide', 'common', 'defence', 'promote', 'general', 'welfare', 'secure', 'blessings', 'liberty', 'posterity', 'ordain', 'establish', 'constitution', 'united', 'states', 'america'], ['the', 'constitutional', 'convention', 'article', 'section', 'congress', 'all', 'legislative', 'powers', 'herein', 'granted', 'shall', 'vested', 'congress', 'united', 'states', 'shall', 'consist', 'senate', 'house', 'representatives']]


# <font color = 'limegreen'> Lemmatization

In [12]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV','NER']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
data_words=lemmatization(data_words)

In [14]:
%%time


# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  


bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


# Faster way to get a sentence clubbed as a trigram/bigram


# See trigram example
# print(trigram_mod[bigram_mod[data_words]])

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


# data_words_bigrams = make_bigrams(data_words)
data_bigrams_trigrams = make_trigrams(data_words)
bi=[]
tri=[]
for layer1 in data_bigrams_trigrams:
    for layer2 in layer1:
        if layer2.count("_") == 1 and layer2 not in bi:
            bi.append(layer2)
        elif layer2.count("_") == 2 and layer2 not in tri:
            tri.append(layer2)
        else:
            pass

len(bi), len(tri)


Wall time: 37.9 ms


(1, 1)

In [15]:
bi, tri

(['appropriate_legislation'], ['article_appropriate_legislation'])

In [16]:
%%time
from gensim.models import TfidfModel
id2word = corpora.Dictionary(data_bigrams_trigrams)
texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]
print(corpus[0][0:20])

tfidf  = TfidfModel(corpus, id2word=id2word)

LOW_VALUE = 0.03
words = []
words_missing_in_tfidf=[]
for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = [] #reinitialize to be safe. You can skip this.
    tfidf_ids = [id for id, value in tfidf[bow]]
    bow_ids = [id for id, value in bow]
    low_value_words = [id for id, value in tfidf[bow] if value < LOW_VALUE]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [id for id in bow_ids if id not in tfidf_ids] # The words with tf-idf socre 0 will be missing

    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]
Wall time: 38.9 ms


In [17]:
%%time
import pyLDAvis

HP_NUM_TOPICS = [5,10,15,20]
HP_NUM_TERMS = [10,20,25,30]

for topic, term in zip(HP_NUM_TOPICS,HP_NUM_TERMS):
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=topic,
                                               random_state=100,
                                               update_every=1,
                                               chunksize=100,
                                               passes=10,
                                               alpha="auto")
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim_models.prepare(lda_model, corpus=corpus,dictionary= id2word, mds="mmds", R=term)

    pyLDAvis.save_html(vis,fr'LDA_Constitution_{topic}topics_{term}terms.html')

  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(
  default_term_info = default_term_info.sort_values(


Wall time: 13.7 s
