In [0]:
import json
import requests
import urllib

# 0. Preliminary step to get sample data

This preliminary step is reproducing Lorella's workflow Python file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
I just added a folder "data_1" to keep all files in one folder

In [0]:
mkdir 'data1'

In [0]:
# Base URL
chronam = 'https://chroniclingamerica.loc.gov/'

# Chronicling America search results
results = 'https://chroniclingamerica.loc.gov/search/pages/results/?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310&proxdistance=5&state=California&state=District+of+Columbia&state=Massachusetts&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia&rows=100&ortext=&proxtext=&phrasetext=&andtext=&dateFilterType=yearRange&page=11&sort=date'

# Count to keep track of downloaded files
count = 0

# Gets search results in JSON format
results_json = results + '&format=json'


In [0]:
# Returns JSON 
def get_json(url):
    data = requests.get(url)
    return(json.loads(data.content))
    
data = get_json(results_json)

In [0]:
# Cycle through JSON results
for page in data['items']:
    # Create URL
    hit = str(page['id'])
    seed = hit + 'ocr.txt'
    download_url = chronam + seed
 
    # Create file name
    file_name = download_url.replace('/', '_')
    file_name = 'data1/' + file_name[41:]
    
    # Download .txt of the page
    urllib.request.urlretrieve(download_url, str(file_name))
    count += 1

# 1. Data preparation

## 1.1. Grouping all texts files
A dataframe is first created to keep individual files at their initial state, and the name of each file

In [0]:
import os
import pandas as pd

In [0]:
#list of the file names
files_list = os.listdir('data1')

In [0]:
#insert file names into a df
sources = pd.DataFrame(files_list, columns=['file_name'])

In [0]:
#function to read the content of the text files
def readTxtContent(fileName):
  with open('data1/' + fileName, 'r') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# adding a column to the dataframe containing file content
sources['file_content'] = sources['file_name'].apply(lambda x: readTxtContent(x))

In [11]:
# for verification purposes later, count the nr of characters for each content
sources['file_len'] = sources['file_content'].apply(lambda x: len(x))
sources['file_len'].sum()

1779770

In [0]:
# variable containing all texts together
corpus = ''
for i in range(len(sources)):
  corpus += sources['file_content'][i]

In [13]:
# check length
len(corpus)

1779770


## 1.2 Removing stop words, punctuation, short words

In [14]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
# remove punctuation and lower case (then depending on user input, leave the possibility to do one or the other)
tokens = nltk.word_tokenize(corpus)
tokenized_corpus = [w.lower() for w in tokens if w.isalnum()]
# lower case, remove punctuation and only keep words that have more than 3 letters
tokenized_corpus = [w.lower() for w in tokens if (w.isalnum() and len(w) > 3 )]

In [0]:
# show list of default Italian stopwords
# stopwords.words('italian')

In [0]:
# add custom stop word
ital_stopwords = stopwords.words('italian')
# to append list of words added by user: ital_stopwords.extend(user_input)
# to remove words: ital_stopwords.remove(user_input)

In [0]:
import spacy
from spacy.lang.it.stop_words import STOP_WORDS

In [0]:
spacy_it_sw = STOP_WORDS

In [0]:
# "stopwords.words('italian')" can be replaced by a custom list input by the user
tokenized_corpus_without_sw = [w for w in tokenized_corpus if not w in spacy_it_sw]

## 1.3 Stem

In [0]:
from nltk.stem.snowball import SnowballStemmer

In [0]:
#initialize with needed language
stemmer = SnowballStemmer("italian")

In [0]:
stemmed_corpus = [stemmer.stem(w) for w in tokenized_corpus_without_sw]

## 1.4 Lemmatize

In [0]:
# Lemmatize is available in multiple languages in Spacy and not in NLTK (only English)
# With Spacy, lemmatization is available for 10 languages. There's also a multi-language option that
# should be tested if additional languages are needed

In [0]:

!python -m spacy download it_core_news_sm

Collecting it_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-2.2.5/it_core_news_sm-2.2.5.tar.gz (14.5MB)
[K     |████████████████████████████████| 14.5MB 674kB/s 
Building wheels for collected packages: it-core-news-sm
  Building wheel for it-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for it-core-news-sm: filename=it_core_news_sm-2.2.5-cp36-none-any.whl size=14471130 sha256=2573639ffcae6abd5f2952a66713f35eaea2b6651b9a2b4c05266cd0b7037719
  Stored in directory: /tmp/pip-ephem-wheel-cache-5kpha7u0/wheels/a1/01/c2/127ab92cc5e3c7f36b5cd4bff28d1c29c313962a2ba913e720
Successfully built it-core-news-sm
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('it_core_news_sm')


In [0]:
import it_core_news_sm
it_nlp = it_core_news_sm.load(disable=['tagger', 'parser', 'ner'])

In [0]:
# create dataframe with corpus
corpus_df = pd.DataFrame(tokenized_corpus_without_sw, columns=['tokens'])
# to test only on 30 lines, use: corpus_df = pd.DataFrame(tokenized_corpus_without_sw[0:30], columns=['tokens'])

In [0]:
# add column with lemmatized version
corpus_df['lemmatized_token'] = corpus_df['tokens'].apply(lambda x: [token.lemma_ for token in it_nlp(x)] )

# 2. Models

In [0]:
# installation setup that works for Mallet: https://github.com/polsci/colab-gensim-mallet/blob/master/topic-modeling-with-colab-gensim-mallet.ipynb
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

In [0]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

In [0]:
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet'

In [0]:
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim import corpora, models
from gensim.models.wrappers import LdaMallet

In [0]:
dataset = [d.split() for d in tokenized_corpus_without_sw]

# Create Dictionary
id2word = corpora.Dictionary(dataset)

# Create Corpus
texts = dataset

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [64]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [65]:
ldamallet.print_topics()

[(0,
  '0.026*"york" + 0.017*"giornale" + 0.017*"società" + 0.014*"francia" + 0.013*"viaggio" + 0.011*"capo" + 0.011*"colla" + 0.011*"parole" + 0.010*"strada" + 0.008*"altra"'),
 (1,
  '0.058*"italiana" + 0.027*"mente" + 0.011*"sera" + 0.010*"paesi" + 0.008*"edoardo" + 0.008*"perchè" + 0.008*"vuol" + 0.008*"ministero" + 0.007*"membri" + 0.007*"marina"'),
 (2,
  '0.020*"parigi" + 0.019*"mano" + 0.013*"francesi" + 0.013*"cose" + 0.012*"disse" + 0.012*"chiesa" + 0.011*"regina" + 0.010*"roosevelt" + 0.010*"presso" + 0.010*"politica"'),
 (3,
  '0.027*"patria" + 0.015*"patrizi" + 0.014*"presidente" + 0.013*"giustizia" + 0.012*"amici" + 0.011*"mento" + 0.010*"circulation" + 0.010*"venezia" + 0.009*"montgomery" + 0.008*"operai"'),
 (4,
  '0.057*"roma" + 0.011*"delia" + 0.011*"leone" + 0.008*"lire" + 0.008*"londra" + 0.008*"socialisti" + 0.007*"viene" + 0.006*"inglese" + 0.006*"inglesi" + 0.006*"porti"'),
 (5,
  '0.056*"italia" + 0.021*"quotidiano" + 0.015*"genova" + 0.010*"milano" + 0.010*"zio