In [0]:
import json
import requests
import urllib

# 0. Preliminary step to get sample data

This preliminary step is reproducing Lorella's workflow Python file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
I just added a folder "data_1" to keep all files in one folder

In [0]:
mkdir 'data1'

In [0]:
# Base URL
chronam = 'https://chroniclingamerica.loc.gov/'

# Chronicling America search results
results = 'https://chroniclingamerica.loc.gov/search/pages/results/?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310&proxdistance=5&state=California&state=District+of+Columbia&state=Massachusetts&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia&rows=100&ortext=&proxtext=&phrasetext=&andtext=&dateFilterType=yearRange&page=11&sort=date'

# Count to keep track of downloaded files
count = 0

# Gets search results in JSON format
results_json = results + '&format=json'


In [0]:
# Returns JSON 
def get_json(url):
    data = requests.get(url)
    return(json.loads(data.content))
    
data = get_json(results_json)

In [0]:
# Cycle through JSON results
for page in data['items']:
    # Create URL
    hit = str(page['id'])
    seed = hit + 'ocr.txt'
    download_url = chronam + seed
 
    # Create file name
    file_name = download_url.replace('/', '_')
    file_name = 'data1/' + file_name[41:]
    
    # Download .txt of the page
    urllib.request.urlretrieve(download_url, str(file_name))
    count += 1

# 1. Data preparation

## 1.1. Grouping all texts files
A dataframe is first created to keep individual files at their initial state, and the name of each file

In [0]:
import os
import pandas as pd

In [0]:
#list of the file names
files_list = os.listdir('data1')

In [0]:
#insert file names into a df
sources = pd.DataFrame(files_list, columns=['file_name'])

In [0]:
#function to read the content of the text files
def readTxtContent(fileName):
  with open('data1/' + fileName, 'r') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# adding a column to the dataframe containing file content
sources['file_content'] = sources['file_name'].apply(lambda x: readTxtContent(x))

In [90]:
# for verification purposes later, count the nr of characters for each content
sources['file_len'] = sources['file_content'].apply(lambda x: len(x))
sources['file_len'].sum()

1779770

In [0]:
# variable containing all texts together
corpus = ''
for i in range(len(sources)):
  corpus += sources['file_content'][i]

In [122]:
# check length
len(corpus)

1779770

## 1.2 Pre-processing options

Options for the user to work on lower cased version, exclude short words, remove punctuation, remove stop words

In [0]:
import spacy
from spacy.tokenizer import Tokenizer

import re


In [0]:
#adding punctuation rules (splitting punctuation that is before or after a word with no whitespace) as
# it is not included by default within the tokenizer

custom_nlp = Italian() # language

prefix_re = spacy.util.compile_prefix_regex(custom_nlp.Defaults.prefixes)
suffix_re = spacy.util.compile_suffix_regex(custom_nlp.Defaults.suffixes)

def customize_tokenizer(nlp):
    # Adds support to use `-` as the delimiter for tokenization
    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     token_match=None
                     )


custom_nlp.tokenizer = customize_tokenizer(custom_nlp)


In [0]:
# not using directly the "nlp(corpus)" from spacy as the length of the 
# corpus variable exceeds the 1000000 length limitation from spacy. I'm using a workaround with "tokenizer()"
# and it's not really good because the tokenization results are not satisfactory with this. It makes pre-processing
# more complex than with NLTK and losing some of Spacy's functionalities. 
# this opens a question about switching back to NLTK for the pre processing steps until tokenization
# and only use some of Spacy functionalities (for ex. lemmatization). it depends on how much we want to use Spacy
# later in the process. If the modelling/categorization parts only happen with Gensim, then I think it's ok to switch back
# to NLTK for some pre-processing steps.
# Another option would be to divide the corpus in smaller chunks, but I don't know how this impacts (or not) further
# steps if Spacy is used at a later stage (would the models be accurate?). Then, if for the more advanced steps
# Gensim is used exclusively and not Spacy, I think that NLTK could mostly be used for pre-processing

In [0]:
tokens = custom_nlp.tokenizer(corpus)

In [0]:
#uncomment to check results
# ! I see that Spacy defaults prefix/suffix removers don't handle the cases of capital letters + punctuation without whitespaces
# for token in tokens[0:70]:
#    print(token.text, token.lemma_, token.is_alpha, token.is_stop)

In [0]:
# lower case tokens
tokens_low = [token.lower_ for token in tokens]

In [0]:
# tokens excluding punctuation, white spaces, and words smaller than 3 letters
tokens_punct_size = [token.orth_ for token in tokens if not token.is_punct | token.is_space | len(token.text) < 4]

In [0]:
# uncomment last line of this cell to show list of default Italian stopwords
from spacy.lang.it.stop_words import STOP_WORDS
# STOP_WORDS

In [0]:
# combining all of the above and excluding default stop words list
tokens_nostop = [token.lower_ for token in tokens if not (token.is_stop or token.is_punct or token.is_space or len(token.text) < 4)]

In [0]:
# to add/remove stop words depending on user input
# nlp.Defaults.stop_words |= {"my_new_stopword1","my_new_stopword2",}
# nlp.Defaults.stop_words -= {"whatever", "whenever"}

## 1.3 Stem 

In [0]:
# Stemming is available via NLTK and not Spacy

In [0]:
import nltk
from nltk.stem.snowball import SnowballStemmer

In [0]:
#initialize with needed language
stemmer = SnowballStemmer("italian")

In [0]:
stemmed_corpus = [stemmer.stem(w) for w in tokens_nostop]

## 1.4 Lemmatize

In [0]:
tokens_lemma = [token.lemma_ for token in tokens if not (token.is_stop or token.is_punct or token.is_space or len(token.text) < 4)]

# 2. Models

In [0]:
from collections import defaultdict
from gensim import corpora

In [0]:
dataset = [d.split() for d in tokens_nostop]

In [0]:
dictionary = corpora.Dictionary(dataset)

In [0]:
corpus = [dictionary.doc2bow(text) for text in dataset]

In [0]:
from gensim import models

tfidf = models.TfidfModel(corpus) #basic gensim model

In [0]:
corpus_tfidf = tfidf[corpus]

In [0]:
# initialize transformation with 5 topics (need to find how to calculate the optimum number of topics with gensim)
lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)  
corpus_lsi = lsi_model[corpus_tfidf] 

In [143]:
# see results
# ! There seems to be words such as "daily" "with"...
lsi_model.print_topics()

[(0,
  '1.000*"roma" + -0.002*"dispaccio" + -0.001*"united" + -0.001*"camera" + -0.001*"napoli" + -0.001*"condizioni" + 0.001*"genova" + 0.001*"western" + 0.001*"giustizia" + -0.001*"terni"'),
 (1,
  '-0.960*"dispaccio" + -0.277*"daily" + 0.038*"italiano" + 0.027*"italiani" + -0.006*"telegrafico" + 0.005*"cardinale" + -0.002*"roma" + 0.001*"politica" + -0.001*"piroscafi" + 0.001*"with"'),
 (2,
  '0.995*"italiano" + -0.071*"daily" + 0.060*"dispaccio" + -0.010*"cardinale" + 0.010*"telegrafico" + 0.004*"italiana" + 0.002*"popolo" + 0.002*"società" + 0.001*"napoli" + -0.001*"italiani"'),
 (3,
  '-0.883*"telegrafico" + 0.348*"cardinale" + 0.275*"daily" + 0.129*"papa" + -0.070*"dispaccio" + 0.036*"italiano" + -0.005*"italiani" + 0.003*"italiana" + 0.002*"difesa" + -0.002*"dare"'),
 (4,
  '0.990*"papa" + 0.100*"telegrafico" + -0.088*"cardinale" + -0.028*"daily" + 0.007*"dispaccio" + -0.003*"specialmente" + -0.003*"italiano" + -0.003*"amici" + 0.003*"sindaco" + 0.003*"vive"')]