In [0]:
import json
import requests
import urllib

# 0. Preliminary step to get sample data

This preliminary step is reproducing Lorella's workflow Python file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
I just added a folder "data_1" to keep all files in one folder

In [0]:
mkdir 'data1'

In [0]:
# Base URL
chronam = 'https://chroniclingamerica.loc.gov/'

# Chronicling America search results
results = 'https://chroniclingamerica.loc.gov/search/pages/results/?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310&proxdistance=5&state=California&state=District+of+Columbia&state=Massachusetts&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia&rows=100&ortext=&proxtext=&phrasetext=&andtext=&dateFilterType=yearRange&page=11&sort=date'

# Count to keep track of downloaded files
count = 0

# Gets search results in JSON format
results_json = results + '&format=json'


In [0]:
# Returns JSON 
def get_json(url):
    data = requests.get(url)
    return(json.loads(data.content))
    
data = get_json(results_json)

In [0]:
# Cycle through JSON results
for page in data['items']:
    # Create URL
    hit = str(page['id'])
    seed = hit + 'ocr.txt'
    download_url = chronam + seed
 
    # Create file name
    file_name = download_url.replace('/', '_')
    file_name = 'data1/' + file_name[41:]
    
    # Download .txt of the page
    urllib.request.urlretrieve(download_url, str(file_name))
    count += 1

# 1. Data preparation

## 1.1. Grouping all texts files
A dataframe is first created to keep individual files at their initial state, and the name of each file

In [0]:
import os
import pandas as pd

In [0]:
#list of the file names
files_list = os.listdir('data1')

In [0]:
#insert file names into a df
sources = pd.DataFrame(files_list, columns=['file_name'])

In [0]:
#function to read the content of the text files
def readTxtContent(fileName):
  with open('data1/' + fileName, 'r') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# adding a column to the dataframe containing file content
sources['file_content'] = sources['file_name'].apply(lambda x: readTxtContent(x))

In [167]:
# for verification purposes later, count the nr of characters for each content
sources['file_len'] = sources['file_content'].apply(lambda x: len(x))
sources['file_len'].sum()

1779770

In [0]:
# variable containing all texts together
corpus = ''
for i in range(len(sources)):
  corpus += sources['file_content'][i]

In [169]:
# check length
len(corpus)

1779770


## 1.2 Removing stop words

In [170]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
# remove punctuation and lower case (then depending on user input, leave the possibility to do one or the other)
tokens = nltk.word_tokenize(corpus)
tokenized_corpus = [w.lower() for w in tokens if w.isalnum()]
# lower case, remove punctuation and only keep words that have more than 3 letters
tokenized_corpus = [w.lower() for w in tokens if (w.isalnum() and len(w) > 3 )]

In [0]:
# show list of default Italian stopwords
# stopwords.words('italian')

In [0]:
# add custom stop word
ital_stopwords = stopwords.words('italian')
# to append list of words added by user: ital_stopwords.extend(user_input)
# to remove words: ital_stopwords.remove(user_input)

In [0]:
# "stopwords.words('italian')" can be replaced by a custom list input by the user
tokenized_corpus_without_sw = [w for w in tokenized_corpus if not w in stopwords.words('italian')]

## 1.3 Stem

In [0]:
from nltk.stem.snowball import SnowballStemmer

In [0]:
#initialize with needed language
stemmer = SnowballStemmer("italian")

In [0]:
stemmed_corpus = [stemmer.stem(w) for w in tokenized_corpus_without_sw]

## 1.4 Lemmatize

In [0]:
#Lemmatize is available in multiple languages in Spacy and not in NLTK (only English)

In [0]:
from spacy.lang.it import Italian
it_nlp = Italian()

In [0]:
#I tested spacy Italian lemmatization functionality below, and I'm not very convinced by the result
# test = it_nlp('Salvini: "Resteremo in aula finché vedremo i fatti". Il pd Orlando: "Le banche non stanno facendo il loro lavoro"')
# print([token.lemma_ for token in test])

In [0]:
# create dataframe with corpus
corpus_df = pd.DataFrame(tokenized_corpus_without_sw, columns=['tokens'])

In [0]:
# add column with lemmatized version
corpus_df['lemmatized_token'] = corpus_df['tokens'].apply(lambda x: [token.lemma_ for token in it_nlp(x)] )

# 2. Models

In [0]:
from collections import defaultdict
from gensim import corpora

In [0]:
dataset = [d.split() for d in tokenized_corpus_without_sw]

In [0]:
dictionary = corpora.Dictionary(dataset)

In [0]:
corpus = [dictionary.doc2bow(text) for text in dataset]

In [0]:
from gensim import models
model = models.LdaModel(corpus, id2word=dictionary, num_topics=5)

In [183]:
model.print_topics()

[(0,
  '0.011*"roma" + 0.009*"gran" + 0.009*"governo" + 0.007*"voti" + 0.007*"giornale" + 0.007*"parigi" + 0.006*"tempo" + 0.006*"generale" + 0.006*"essere" + 0.006*"ogni"'),
 (1,
  '0.014*"dispaccio" + 0.009*"egli" + 0.009*"dopo" + 0.008*"fatto" + 0.008*"anni" + 0.007*"quali" + 0.007*"anno" + 0.007*"luglio" + 0.007*"giorni" + 0.007*"francisco"'),
 (2,
  '0.017*"italia" + 0.014*"stato" + 0.009*"italiano" + 0.009*"oggi" + 0.009*"tutte" + 0.007*"altri" + 0.007*"quando" + 0.007*"avere" + 0.006*"città" + 0.006*"nulla"'),
 (3,
  '0.010*"ministro" + 0.009*"york" + 0.007*"italiani" + 0.006*"parte" + 0.006*"stati" + 0.006*"grande" + 0.005*"direttore" + 0.005*"pubblica" + 0.005*"napoli" + 0.004*"numero"'),
 (4,
  '0.012*"telegrafico" + 0.008*"daily" + 0.007*"italiana" + 0.007*"sempre" + 0.006*"dice" + 0.005*"quotidiano" + 0.005*"malgrado" + 0.004*"grandi" + 0.004*"chiesa" + 0.004*"membro"')]