In [0]:
import json
import requests
import urllib

# 0. Preliminary step to get sample data

This preliminary step is reproducing Lorella's workflow Python file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
I just added a folder "data_1" to keep all files in one folder

In [66]:
mkdir 'data1'

mkdir: cannot create directory ‘data1’: File exists


In [0]:
# Base URL
chronam = 'https://chroniclingamerica.loc.gov/'

# Chronicling America search results
results = 'https://chroniclingamerica.loc.gov/search/pages/results/?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310&proxdistance=5&state=California&state=District+of+Columbia&state=Massachusetts&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia&rows=100&ortext=&proxtext=&phrasetext=&andtext=&dateFilterType=yearRange&page=11&sort=date'

# Count to keep track of downloaded files
count = 0

# Gets search results in JSON format
results_json = results + '&format=json'


In [0]:
# Returns JSON 
def get_json(url):
    data = requests.get(url)
    return(json.loads(data.content))
    
data = get_json(results_json)

In [0]:
# Cycle through JSON results
for page in data['items']:
    # Create URL
    hit = str(page['id'])
    seed = hit + 'ocr.txt'
    download_url = chronam + seed
 
    # Create file name
    file_name = download_url.replace('/', '_')
    file_name = 'data1/' + file_name[41:]
    
    # Download .txt of the page
    urllib.request.urlretrieve(download_url, str(file_name))
    count += 1

# 1. Data preparation

## 1.1. Grouping all texts files
A dataframe is first created to keep individual files at their initial state, and the name of each file

In [0]:
import os
import pandas as pd

In [0]:
#list of the file names
files_list = os.listdir('data1')

In [0]:
#insert file names into a df
sources = pd.DataFrame(files_list, columns=['file_name'])

In [0]:
#function to read the content of the text files
def readTxtContent(fileName):
  with open('data1/' + fileName, 'r') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# adding a column to the dataframe containing file content
sources['file_content'] = sources['file_name'].apply(lambda x: readTxtContent(x))

In [0]:
# for verification purposes later, count the nr of characters for each content
sources['file_len'] = sources['file_content'].apply(lambda x: len(x))
sources['file_len'].sum()

1779770

In [0]:
# variable containing all texts together
corpus = ''
for i in range(len(sources)):
  corpus += sources['file_content'][i]

In [0]:
# check length
len(corpus)

1779770

## 1.2 Pre-processing options

Options for the user to work on lower cased version, exclude short words, remove punctuation, remove stop words

In [0]:
import spacy
from spacy.tokenizer import Tokenizer

#setting up for the language
from spacy.lang.it import Italian
nlp = Italian()

In [0]:
#tokenize with the language defaults

tokenizer = Tokenizer(nlp.vocab)
tokens = tokenizer(corpus)

In [0]:
# lower case tokens
tokens_low = [token.lower_ for token in tokens]

In [0]:
# tokens excluding punctuation, white spaces, and words smaller than 3 letters
tokens_punct_size = [token.orth_ for token in tokens if not token.is_punct | token.is_space | len(token.text) < 4]

In [0]:
# uncomment last line of this cell to show list of default Italian stopwords
from spacy.lang.it.stop_words import STOP_WORDS
# STOP_WORDS

In [0]:
# combining all of the above and excluding default stop words list
tokens_nostop = [token.lower_ for token in tokens if not (token.is_stop or token.is_punct or token.is_space or len(token.text) < 4)]

In [0]:
# note: spacy seems not to always remove punctuation within tokens, the reasons/modalities should be further looked into 
# a workaround would be to strip the corpus from punctuation before tokenizing with spacy, but as spacy uses
# punctuation to determine some functions of words in sentences, one might lose some of spacy's functionalities.
# I will look into this more in details at a later stage and start with the default functionalities

In [0]:
# uncomment last line of this cell to show list of default Italian stopwords
from spacy.lang.it.stop_words import STOP_WORDS
# STOP_WORDS

In [0]:
# to add/remove stop words depending on user input
# nlp.Defaults.stop_words |= {"my_new_stopword1","my_new_stopword2",}
# nlp.Defaults.stop_words -= {"whatever", "whenever"}

## 1.3 Stem 

In [0]:
# Stemming is available via NLTK and not Spacy

In [0]:
import nltk
from nltk.stem.snowball import SnowballStemmer

In [0]:
#initialize with needed language
stemmer = SnowballStemmer("italian")

In [0]:
stemmed_corpus = [stemmer.stem(w) for w in tokens_nostop]

## 1.4 Lemmatize

In [0]:
# the following returns a non lower cased form of the token, I need to find how to combine lower case form 
# and lemmatized form

In [0]:
tokens_lemma = [token.lemma_ for token in tokens if not (token.is_stop or token.is_punct or token.is_space or len(token.text) < 4)]