In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy
!python -m spacy download en_core_web_sm

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB 9.9 MB/s eta 0:00:02
     ---- ----------------------------------- 1.4/12.8 MB 17.5 MB/s eta 0:00:01
     --------- ------------------------------ 3.0/12.8 MB 24.1 MB/s eta 0:00:01
     -------------- ------------------------- 4.5/12.8 MB 26.4 MB/s eta 0:00:01
     ------------------- -------------------- 6.1/12.8 MB 27.8 MB/s eta 0:00:01
     ----------------------- ---------------- 7.4/12.8 MB 26.3 MB/s eta 0:00:01
     ---------------------------- ----------- 9.0/12.8 MB 28.7 MB/s eta 0:00:01
     ------------------------------- ------- 10.3/12.8 MB 28.5 MB/s eta 0:00:01
     ---------------------------------- ---- 11.5/12.8 MB 31.2 MB/s eta 0:00:01
     ----------------------------

In [2]:
import pandas as pd
df = pd.read_csv(
    "../rawData/sampler_10ktexts_perdecade.ALL2.tsv",
    sep='\t',
    names=["timePeriod", "index", "content"]
)

df.head()

Unnamed: 0,timePeriod,index,content
0,180X.POS.rand,1,The_DT hon_NN ._SENT
1,180X.POS.rand,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,180X.POS.rand,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,180X.POS.rand,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,180X.POS.rand,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [3]:
df['timePeriod'] = df['timePeriod'].map(lambda x: x.rstrip('X.POS.rand'))
df['timePeriod'] = df['timePeriod'].astype(str) + '0'

In [4]:
df['timePeriod'] = pd.to_datetime(df['timePeriod'], format='%Y')
df['timePeriod'] = pd.DatetimeIndex(df['timePeriod'])
df = df[df["timePeriod"].isin(pd.date_range("1800-01-01", "1820-01-01"))]
df.tail()

Unnamed: 0,timePeriod,index,content
21802,1820-01-01,9981,The_DT governor-general_NN of_IN India_NP had_...
21803,1820-01-01,9982,We_PP have_VBP disdained_VBN to_TO run_VB a_DT...
21804,1820-01-01,9983,Mr._NP Philips_NP thought_VBD it_PP quite_RB u...
21805,1820-01-01,9984,The_DT chief_JJ justice_NN sent_VBN into_IN th...
21806,1820-01-01,9985,_'' <lb/> Lord_NP Bacon_NP adds_VBZ this_DT br...


In [5]:
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update({"hon"})


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

data = df.content.values.tolist()
data_words = list(sent_to_words(data))

pprint(data_words[:3])

[['the_dt', 'hon_nn'],
 ['the_dt',
  'gallant_jj',
  'general_nn',
  'who_wp',
  'commanded_vbd',
  'well_rb',
  'knew_vbd',
  'that_in',
  'would_md',
  'soon_rb',
  'arrive_vb'],
 ['but_cc',
  'mr',
  'pitt_np',
  'said_vbd',
  'he_pp',
  'doubted_vbd',
  'whether_in',
  'it_pp',
  'would_md',
  'not_rb',
  'be_vb',
  'necessary_jj',
  'to_to',
  'insert_vb',
  'a_dt',
  'clause_nn',
  'in_in',
  'the_dt',
  'bill_nn',
  'i_nn',
  'for_in',
  'that_dt',
  'purpose_nn',
  'in_in',
  'order_nn',
  'to_to',
  'have_vb',
  'the_dt',
  'thing_nn',
  'done_vbn',
  'as_in',
  'effectually_rb',
  'and_cc',
  'distinctly_rb',
  'as_in',
  'it_pp',
  'should_md',
  'be_vb']]


In [7]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[3]]])

['and_cc', 'dr', 'hussey_np', 'who_wp', 'informs_vbz', 'us_pp', 'that_dt', 'romish_jj', 'bishop_nn', 'of_in', 'waterford_np', 'appointed_vbn', 'by_in', 'the_dt', 'pope_nn', 'in_in', 'a_dt', 'a_dt', 'pastoral_jj', 'letter_nn', 'published_vbn', 'by_in', 'him_pp_in_in', 'london_np', 'and_cc', 'dublin_np', 'in_in', 'not_rb', 'only_rb', 'holds_vbz', 'the_dt', 'same_jj', 'doctrine_nn', 'but_cc', 'forbids_vbz', 'all_dt', 'romanists_nns', 'under_in', 'pain_nn', 'of_in_to_to', 'permit_vb', 'any_dt_of_in', 'their_pp', 'children_nns', 'under_in', 'any_dt', 'pretence_nn', 'to_to', 'resort_vb', 'to_to', 'a_dt', 'protestant_jj', 'school_nn', 'in_in', 'the_dt', 'same_jj', 'pamphlet_nn', 'he_pp', 'addresses_vbz', 'the_dt', 'romish_jj', 'soldiery_nn', 'and_cc', 'exhorts_vbz', 'them_pp_by_in', 'no_dt', 'means_vbz', 'to_to', 'obey_vb', 'their_pp', 'officers_nns', 'in_in', 'any_dt', 'orders_nns', 'relating_vbg', 'to_to', 'spiritual_jj', 'concerns_nns', 'without_in', 'particularly_rb', 'specifying_vbg', 'w

In [12]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word.split('_')[0] for word in simple_preprocess(str(doc)) if word.split('_')[0] not in stop_words] for doc in texts]


def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# Remove Stop Words
print(data_words[0])
data_words_nostops = remove_stopwords(data_words)
print(data_words_nostops[:3])
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=["NN", "NNS", "NNP", "NNPS", "JJ", "JJR", "JJS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "RB", "RBR", "RBS"])

print(data_lemmatized[:1])

['the_dt', 'hon_nn']
[[], ['gallant', 'general', 'commanded', 'well', 'knew', 'would', 'soon', 'arrive'], ['mr', 'pitt', 'said', 'doubted', 'whether', 'would', 'necessary', 'insert', 'clause', 'bill', 'purpose', 'order', 'thing', 'done', 'effectually', 'distinctly'], ['dr', 'hussey', 'informs', 'us', 'romish', 'bishop', 'waterford', 'appointed', 'pope', 'pastoral', 'letter', 'published', 'london', 'dublin', 'holds', 'doctrine', 'forbids', 'romanists', 'pain', 'permit', 'children', 'pretence', 'resort', 'protestant', 'school', 'pamphlet', 'addresses', 'romish', 'soldiery', 'exhorts', 'means', 'obey', 'officers', 'orders', 'relating', 'spiritual', 'concerns', 'without', 'particularly', 'specifying', 'means', 'spiritual', 'concerns', 'reserving', 'romish', 'priesthood', 'stating', 'officer', 'enforce', 'obedience', 'orders', 'relating', 'spiritual', 'concerns', 'officer', 'might', 'feel', 'effects', 'conduct', 'day', 'battle', 'romish', 'soldier', 'might', 'turn', 'upon', 'assassinate', '

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

In [None]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [6]:
papers['timePeriod'] = papers['timePeriod'].map(lambda x: x.rstrip('X.POS.rand'))
papers['timePeriod'] = papers['timePeriod'].astype(str) + '0'
papers['timePeriod'] = pd.to_datetime(papers['timePeriod'], format='%Y')
papers['timePeriod'] = pd.DatetimeIndex(papers['timePeriod']) #.year
papers.head()

Unnamed: 0,timePeriod,index,rawText
0,1800-01-01,1,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [50]:
#reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1830-01-01"))]
#reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1890-01-01"))]
reducedPapers = papers[papers["timePeriod"].isin(pd.date_range("1800-01-01", "1820-01-01"))]
#reducedPapers = papers
reducedPapers.head()
#reducedPapers.tail()

Unnamed: 0,timePeriod,index,rawText
0,1800-01-01,1,The_DT hon_NN ._SENT
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d..."
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ..."
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...


In [38]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(nltk.corpus.stopwords.words('english'))
wn = WordNetLemmatizer()

def preprocess_text(text):
    tokens = [word.split('_')[0] for word in nltk.word_tokenize(text.lower())]
    words = [word for word in tokens if word.isalpha()]
    lemmas = [wn.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(lemmas)

reducedPapers = df

reducedPapers.loc[:, 'processedText'] = reducedPapers['content'].apply(preprocess_text)
reducedPapers.loc[:, 'processedText'] = reducedPapers['processedText'].astype("str")



reducedPapers.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sandr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,timePeriod,index,content,processedText
0,1800-01-01,1,The_DT hon_NN ._SENT,hon
1,1800-01-01,2,The_DT gallant_JJ general_NN who_WP commanded_...,gallant general commanded well knew reinforcem...
2,1800-01-01,3,"But_CC ,_, Mr._NP Pitt_NP said_VBD ,_, he_PP d...",pitt said doubted whether would necessary inse...
3,1800-01-01,4,"And_CC Dr._NP Hussey_NP ,_, who_WP informs_VBZ...",hussey informs u romish bishop waterford appoi...
4,1800-01-01,5,In_IN former_JJ times_NNS and_CC in_IN former_...,former time former war invasion often threaten...


In [52]:
reducedPapers.to_csv('./processedData/processedData.csv', sep=',')

In [53]:
test = pd.read_csv(
    "./processedData/processedData.csv",
    sep=','
)
test.tail()

Unnamed: 0.1,Unnamed: 0,timePeriod,index,rawText,processedText
21802,21802,1820-01-01,9981,The_DT governor-general_NN of_IN India_NP had_...,india applied individual punishment prescribed...
21803,21803,1820-01-01,9982,We_PP have_VBP disdained_VBN to_TO run_VB a_DT...,disdained run race popularity nation order sec...
21804,21804,1820-01-01,9983,Mr._NP Philips_NP thought_VBD it_PP quite_RB u...,philip thought quite unworthy house time occup...
21805,21805,1820-01-01,9984,The_DT chief_JJ justice_NN sent_VBN into_IN th...,chief justice sent court common plea ask opini...
21806,21806,1820-01-01,9985,_'' <lb/> Lord_NP Bacon_NP adds_VBZ this_DT br...,lord bacon add brief sentence pregnant truth o...
