In [0]:
import json
import requests
import urllib

# 0. Preliminary step to get sample data

This preliminary step is reproducing Lorella's workflow Python file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
I just added a folder "data_1" to keep all files in one folder

In [0]:
mkdir 'data1'

In [0]:
# Base URL
chronam = 'https://chroniclingamerica.loc.gov/'

# Chronicling America search results
results = 'https://chroniclingamerica.loc.gov/search/pages/results/?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310&proxdistance=5&state=California&state=District+of+Columbia&state=Massachusetts&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia&rows=100&ortext=&proxtext=&phrasetext=&andtext=&dateFilterType=yearRange&page=11&sort=date'

# Count to keep track of downloaded files
count = 0

# Gets search results in JSON format
results_json = results + '&format=json'


In [0]:
# Returns JSON 
def get_json(url):
    data = requests.get(url)
    return(json.loads(data.content))
    
data = get_json(results_json)

In [0]:
# Cycle through JSON results
for page in data['items']:
    # Create URL
    hit = str(page['id'])
    seed = hit + 'ocr.txt'
    download_url = chronam + seed
 
    # Create file name
    file_name = download_url.replace('/', '_')
    file_name = 'data1/' + file_name[41:]
    
    # Download .txt of the page
    urllib.request.urlretrieve(download_url, str(file_name))
    count += 1

# 1. Data preparation

## 1.1. Grouping all texts files
A dataframe is first created to keep individual files at their initial state, and the name of each file

In [0]:
import os
import pandas as pd

In [0]:
#list of the file names
files_list = os.listdir('data1')

In [0]:
#insert file names into a df
sources = pd.DataFrame(files_list, columns=['file_name'])

In [0]:
#function to read the content of the text files
def readTxtContent(fileName):
  with open('data1/' + fileName, 'r') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# adding a column to the dataframe containing file content
sources['file_content'] = sources['file_name'].apply(lambda x: readTxtContent(x))

In [12]:
# for verification purposes later, count the nr of characters for each content
sources['file_len'] = sources['file_content'].apply(lambda x: len(x))
sources['file_len'].sum()

1779770

In [0]:
# variable containing all texts together
corpus = ''
for i in range(len(sources)):
  corpus += sources['file_content'][i]

In [14]:
# check length
len(corpus)

1779770


## 1.2 Removing stop words, punctuation, short words

In [15]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# remove punctuation and lower case (then depending on user input, leave the possibility to do one or the other)
tokens = nltk.word_tokenize(corpus)
tokenized_corpus = [w.lower() for w in tokens if w.isalnum()]
# lower case, remove punctuation and only keep words that have more than 3 letters
tokenized_corpus = [w.lower() for w in tokens if (w.isalnum() and len(w) > 3 )]

In [0]:
# show list of default NLTK Italian stopwords
# stopwords.words('italian')

In [0]:
# add custom stop word
ital_stopwords = stopwords.words('italian')
# to append list of words added by user: ital_stopwords.extend(user_input)
# to remove words: ital_stopwords.remove(user_input)

In [0]:
# spacy list of Stop words (seems to be more complete)
import spacy
from spacy.lang.it.stop_words import STOP_WORDS

In [0]:
spacy_it_sw = STOP_WORDS

In [0]:
# "stopwords.words('italian')" can be replaced by a custom list input by the user
tokenized_corpus_without_sw = [w for w in tokenized_corpus if not w in spacy_it_sw]

## 1.3 Stem

In [0]:
from nltk.stem.snowball import SnowballStemmer

In [0]:
#initialize with needed language
stemmer = SnowballStemmer("italian")

In [0]:
stemmed_corpus = [stemmer.stem(w) for w in tokenized_corpus_without_sw]

## 1.4 Lemmatize

In [0]:
# Lemmatize is available in multiple languages in Spacy and not in NLTK (only English)
# With Spacy, lemmatization is available for 10 languages. There's also a multi-language option that
# should be tested if additional languages are needed

In [0]:

!python -m spacy download it_core_news_sm

Collecting it_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-2.2.5/it_core_news_sm-2.2.5.tar.gz (14.5MB)
[K     |████████████████████████████████| 14.5MB 674kB/s 
Building wheels for collected packages: it-core-news-sm
  Building wheel for it-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for it-core-news-sm: filename=it_core_news_sm-2.2.5-cp36-none-any.whl size=14471130 sha256=2573639ffcae6abd5f2952a66713f35eaea2b6651b9a2b4c05266cd0b7037719
  Stored in directory: /tmp/pip-ephem-wheel-cache-5kpha7u0/wheels/a1/01/c2/127ab92cc5e3c7f36b5cd4bff28d1c29c313962a2ba913e720
Successfully built it-core-news-sm
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('it_core_news_sm')


In [0]:
import it_core_news_sm
it_nlp = it_core_news_sm.load(disable=['tagger', 'parser', 'ner'])

In [0]:
# create dataframe with corpus
corpus_df = pd.DataFrame(tokenized_corpus_without_sw, columns=['tokens'])
# to test only on 30 lines, use: corpus_df = pd.DataFrame(tokenized_corpus_without_sw[0:30], columns=['tokens'])

In [0]:
# add column with lemmatized version
corpus_df['lemmatized_token'] = corpus_df['tokens'].apply(lambda x: [token.lemma_ for token in it_nlp(x)] )

# 2. Topics with LDA

In [22]:
# installation setup that works for Mallet: https://github.com/polsci/colab-gensim-mallet/blob/master/topic-modeling-with-colab-gensim-mallet.ipynb
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.7" 2020-04-14
OpenJDK Runtime Environment (build 11.0.7+10-post-Ubuntu-2ubuntu218.04)
OpenJDK 64-Bit Server VM (build 11.0.7+10-post-Ubuntu-2ubuntu218.04, mixed mode, sharing)


In [23]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

--2020-05-07 12:35:29--  http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Resolving mallet.cs.umass.edu (mallet.cs.umass.edu)... 128.119.246.70
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16184794 (15M) [application/zip]
Saving to: ‘mallet-2.0.8.zip’


2020-05-07 12:35:30 (10.8 MB/s) - ‘mallet-2.0.8.zip’ saved [16184794/16184794]

Archive:  mallet-2.0.8.zip
   creating: mallet-2.0.8/
   creating: mallet-2.0.8/bin/
  inflating: mallet-2.0.8/bin/classifier2info  
  inflating: mallet-2.0.8/bin/csv2classify  
  inflating: mallet-2.0.8/bin/csv2vectors  
  inflating: mallet-2.0.8/bin/mallet  
  inflating: mallet-2.0.8/bin/mallet.bat  
  inflating: mallet-2.0.8/bin/mallethon  
  inflating: mallet-2.0.8/bin/prepend-license.sh  
  inflating: mallet-2.0.8/bin/svmlight2vectors  
  inflating: mallet-2.0.8/bin/text2classify  
  inflating: mallet-2.0.8/bin/text2vectors  
  inflating: mallet-2.0.8/bin/

In [0]:
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet'

In [0]:
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim import corpora, models
from gensim.models.wrappers import LdaMallet

In [0]:
dataset = [d.split() for d in tokenized_corpus_without_sw]

# Create Dictionary
id2word = corpora.Dictionary(dataset)

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in dataset]

In [0]:
# optional
# TFIDF for better performance
# https://en.wikipedia.org/wiki/Tf–idf / 
# https://rare-technologies.com/pivoted-document-length-normalisation/

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [28]:
# LDA without TFIDF
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=5, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [29]:
ldamallet.print_topics()

[(0,
  '0.010*"papa" + 0.010*"gran" + 0.008*"cardinale" + 0.007*"italiana" + 0.007*"states" + 0.007*"mente" + 0.006*"parigi" + 0.006*"patrizi" + 0.005*"largest" + 0.005*"visita"'),
 (1,
  '0.020*"dispaccio" + 0.017*"telegrafico" + 0.010*"york" + 0.007*"italian" + 0.007*"presidente" + 0.006*"uniti" + 0.006*"giugno" + 0.005*"zione" + 0.005*"altre" + 0.005*"grandi"'),
 (2,
  '0.018*"roma" + 0.016*"italiano" + 0.008*"nome" + 0.005*"russia" + 0.005*"genova" + 0.004*"camera" + 0.004*"xiii" + 0.004*"giustizia" + 0.004*"conclave" + 0.004*"essa"'),
 (3,
  '0.023*"italia" + 0.011*"italiani" + 0.006*"costa" + 0.006*"marconi" + 0.005*"america" + 0.005*"pietro" + 0.005*"mano" + 0.005*"popolo" + 0.004*"napoli" + 0.004*"navigazione"'),
 (4,
  '0.016*"daily" + 0.009*"giornale" + 0.009*"quotidiano" + 0.008*"numero" + 0.008*"italiana" + 0.007*"maggio" + 0.006*"patria" + 0.006*"perchè" + 0.006*"francisco" + 0.004*"luglio"')]

In [30]:
# LDA with TFIDF
ldamallet_tfidf = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_tfidf, num_topics=5, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [31]:
ldamallet_tfidf.print_topics()

[(0,
  '0.021*"italia" + 0.008*"numero" + 0.007*"maggio" + 0.005*"morte" + 0.005*"popolo" + 0.005*"giustizia" + 0.004*"napoli" + 0.004*"fatti" + 0.004*"operai" + 0.004*"xiii"'),
 (1,
  '0.013*"telegrafico" + 0.011*"italiani" + 0.008*"quotidiano" + 0.008*"nome" + 0.007*"presidente" + 0.007*"states" + 0.006*"uniti" + 0.006*"costa" + 0.006*"perchè" + 0.005*"visita"'),
 (2,
  '0.016*"daily" + 0.015*"italiana" + 0.011*"giornale" + 0.010*"york" + 0.008*"papa" + 0.007*"patria" + 0.005*"lotta" + 0.005*"russia" + 0.004*"zione" + 0.004*"francia"'),
 (3,
  '0.020*"roma" + 0.010*"gran" + 0.008*"parigi" + 0.007*"mente" + 0.005*"cardinali" + 0.005*"grandi" + 0.005*"macchine" + 0.004*"camera" + 0.004*"società" + 0.004*"chiesa"'),
 (4,
  '0.020*"dispaccio" + 0.016*"italiano" + 0.008*"cardinale" + 0.007*"italian" + 0.006*"largest" + 0.006*"giugno" + 0.006*"francisco" + 0.006*"patrizi" + 0.005*"marconi" + 0.005*"pietro"')]

In [0]:
from gensim.models.coherencemodel import CoherenceModel

In [47]:
# coherence score LDA
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=dataset, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.8471767131173209


In [51]:
# coherence score LDA with TFIDF
coherence_model_ldamallet_tfidf = CoherenceModel(model=ldamallet_tfidf, texts=dataset, dictionary=id2word, coherence='c_v')
coherence_ldamallet_tfidf = coherence_model_ldamallet_tfidf.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet_tfidf)


Coherence Score:  0.8467828507195788


In [0]:
# Function to calculate optimal number of topics
# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus_tfidf, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=dataset, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [70]:
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus_tfidf, texts=dataset, start=2, limit=40, step=1)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [71]:
# Print the coherence scores
limit=20; start=2; step=1;
x = range(start, limit, step)

for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.8623
Num Topics = 3  has Coherence Value of 0.8554
Num Topics = 4  has Coherence Value of 0.8513
Num Topics = 5  has Coherence Value of 0.8467
Num Topics = 6  has Coherence Value of 0.8441
Num Topics = 7  has Coherence Value of 0.84
Num Topics = 8  has Coherence Value of 0.8396
Num Topics = 9  has Coherence Value of 0.8355
Num Topics = 10  has Coherence Value of 0.8331
Num Topics = 11  has Coherence Value of 0.8322
Num Topics = 12  has Coherence Value of 0.8293
Num Topics = 13  has Coherence Value of 0.828
Num Topics = 14  has Coherence Value of 0.8271
Num Topics = 15  has Coherence Value of 0.8247
Num Topics = 16  has Coherence Value of 0.8232
Num Topics = 17  has Coherence Value of 0.8228
Num Topics = 18  has Coherence Value of 0.8214
Num Topics = 19  has Coherence Value of 0.8187
