In [0]:
import json
import os
import urllib
import urllib.request

import requests

# 0. Preliminary step to get sample data



This preliminary step reproduces Lorella's Python workflow file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
The only change is an added folder, "data1", which keeps all downloaded files in one place.

In [0]:
# Create the download folder with plain Python instead of the shell automagic;
# exist_ok makes the cell safe to re-run when the folder is already there.
os.makedirs('data1', exist_ok=True)

In [0]:
# Root of the Chronicling America site; page ids from the search are appended to it
chronam = 'https://chroniclingamerica.loc.gov/'

# Advanced-search query (Italian-language pages, 1880-1920, a fixed list of LCCNs),
# written as adjacent literals that concatenate to a single URL
results = (
    'https://chroniclingamerica.loc.gov/search/pages/results/'
    '?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1'
    '&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967'
    '&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310'
    '&proxdistance=5'
    '&state=California&state=District+of+Columbia&state=Massachusetts'
    '&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia'
    '&rows=100&ortext=&proxtext=&phrasetext=&andtext='
    '&dateFilterType=yearRange&page=11&sort=date'
)

# Same query, asking for the results in JSON format
results_json = '{}&format=json'.format(results)

# Running total of downloaded files
count = 0


In [0]:
def get_json(url):
    """Fetch `url` and return its body decoded from JSON.

    Raises `requests.HTTPError` on a 4xx/5xx response, so an error page is
    never handed to the JSON parser, and gives up after 60 seconds instead of
    waiting indefinitely on a stalled connection.
    """
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    return json.loads(response.content)
    
# Run the search once; the 'items' list of the response is iterated in the next cell
data = get_json(results_json)

In [0]:
files_list = []
# Walk through every page returned by the search
for page in data['items']:
    # URL of the page's OCR text: site root + page id + 'ocr.txt'
    download_url = ''.join([chronam, str(page['id']), 'ocr.txt'])

    # Flatten the URL into a file name and drop its first 41 characters
    # NOTE(review): 41 presumably strips the site/'lccn' prefix - confirm against real ids
    local_name = download_url.replace('/', '_')[41:]
    files_list.append(local_name)

    # Save the .txt of the page in the data folder
    urllib.request.urlretrieve(download_url, 'data1/' + local_name)
    count += 1

# 1. Data preparation

## 1.1. Creating data frame
A dataframe is created first to hold each document in its original state, together with the name of its file.

In [0]:
import os
import pandas as pd

In [0]:
# one-column dataframe listing the names of the downloaded files
sources = pd.DataFrame(data=files_list, columns=['file_name'])

In [0]:
def readTxtContent(fileName):
  """Return the text of 'data1/<fileName>' on one line, padded with one space on each side.

  Newlines are replaced by spaces. The file is decoded as UTF-8 explicitly so
  the result does not depend on the platform's default encoding.
  NOTE(review): assumes the downloaded OCR files are UTF-8 - confirm.
  """
  with open('data1/' + fileName, 'r', encoding='utf-8') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# new 'file_content' column: the text of every listed file
sources['file_content'] = sources['file_name'].apply(readTxtContent)

In [0]:
# variable containing the documents separately (one text per file)
# Note: the name `corpus` is rebound to the bag-of-words list in section 2.1.1
corpus = sources['file_content']

## 1.2 Removing stop words, punctuation, short words

In [0]:
%%capture
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [0]:
# add tokenized documents in dataframe; the corpus is Italian, so the Italian
# Punkt sentence model is requested instead of the English default
sources['tokens'] = sources['file_content'].apply(lambda x: nltk.word_tokenize(x, language='italian'))

In [0]:
# new 'tokens_prep' column: alphanumeric tokens longer than 3 characters, lower-cased
sources['tokens_prep'] = sources['tokens'].apply(
    lambda tokens: [tok.lower() for tok in tokens if len(tok) > 3 and tok.isalnum()])

In [0]:
# these lines are useful if we want to provide alternate stop words lists (NLTK)
# show list of default NLTK Italian stopwords
# stopwords.words('italian')
# ital_stopwords = stopwords.words('italian')
# to append list of words added by user: ital_stopwords.extend(user_input)
# to remove words: ital_stopwords.remove(user_input)

In [0]:
# spacy list of Stop words (seems to be more complete than NLTK)
import spacy
from spacy.lang.it.stop_words import STOP_WORDS

In [0]:
# alias for spaCy's Italian stop-word collection, used to filter the tokens below
spacy_it_sw = STOP_WORDS

In [0]:
# new 'tokens_prep_nostop' column: prepared tokens that are not stop words
sources['tokens_prep_nostop'] = sources['tokens_prep'].apply(
    lambda tokens: [tok for tok in tokens if tok not in spacy_it_sw])

## 1.3 Stem

In [0]:
from nltk.stem.snowball import SnowballStemmer

In [0]:
# Snowball stemmer initialised for Italian
stemmer = SnowballStemmer("italian")

In [0]:
# new 'tokens_stemmed' column: the stem of every stop-word-free token
sources['tokens_stemmed'] = sources['tokens_prep_nostop'].apply(
    lambda tokens: list(map(stemmer.stem, tokens)))

## 1.4 Lemmatize

In [0]:
# Lemmatization is available in multiple languages in Spacy and not in NLTK (only English)
# With Spacy, lemmatization is available for 10 languages. There's also a multi-language option that
# should be tested if additional languages are needed

In [0]:
%%capture
!python -m spacy download it_core_news_sm

In [0]:
import it_core_news_sm
# Load the small Italian spaCy model with the tagger, parser and NER components switched off.
# NOTE(review): which components lemmatization depends on varies between spaCy
# versions - confirm lemmas are still produced with 'tagger' disabled.
it_nlp = it_core_news_sm.load(disable=['tagger', 'parser', 'ner'])

In [0]:
def lemmatize(doc):
  """Return the lemma of every token in `doc`, in the same order.

  Each token is run through the spaCy pipeline `it_nlp` on its own. When spaCy
  splits a token into several pieces, only the lemma of the first piece is
  kept. When it yields no piece at all (e.g. for an empty string) the token is
  kept unchanged instead of raising IndexError.
  """
  lemmatized_doc = []
  for w in doc:
    pieces = it_nlp(w)
    lemmatized_doc.append(pieces[0].lemma_ if len(pieces) else w)
  return lemmatized_doc

In [0]:
# new 'tokens_lemmatized' column: the lemmas of the stop-word-free tokens
sources['tokens_lemmatized'] = sources['tokens_prep_nostop'].apply(lemmatize)

In [0]:
# variable with lemmatized tokens (one token list per document); this is the
# input of the dictionary and bag-of-words step in section 2.1.1
lemmatized_corpus = sources['tokens_lemmatized']

# 2. Topics with LDA

In [0]:
#Gensim installation
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim import corpora, models
from gensim.models.wrappers import LdaMallet

## 2.1 Preliminary steps to run LDA



### 2.1.1 Bag of words

In [0]:
# Build the gensim dictionary (token <-> integer id) from the lemmatized documents.
# To work with another version of the text, pass the stemmed or the
# stop-word-free token column instead of `lemmatized_corpus` in both lines.
dictionary = corpora.Dictionary(lemmatized_corpus)
# Bag-of-words representation of every document
corpus = list(map(dictionary.doc2bow, lemmatized_corpus))

## 2.2 LDA

In [0]:
# set the number of topics here
numtopics = 6
# id2word ties the model to the dictionary, so its topic-term matrix has one
# column per dictionary id and topics can be shown as words;
# random_state fixes the seed so that re-running the cell gives the same topics.
ldamodel = models.LdaModel(corpus, num_topics=numtopics, id2word=dictionary, random_state=0)

## 2.3. Statistics 

In [0]:
# converting the corpus into a SciPy sparse (CSC) matrix for efficient arithmetic operations;
# the cells below sum it along axis 1 per term and along axis 0 per document
corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))

### 2.3.1. Terms frequency and documents length

In [0]:
import numpy as np

In [0]:
# integer ids of all dictionary terms, as an array and as a Series
# NOTE(review): the Series is named 'vocab' but holds the ids, not the term strings
terms_id = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
sr_terms_id = pd.Series(terms_id, name='vocab')

In [0]:
# total count of every term over the whole corpus (row sums of the sparse
# matrix), reordered to follow terms_id
term_frequency = corpus_csc.sum(axis=1).A.ravel()[terms_id]
sr_term_frequency = pd.Series(term_frequency,name='term_frequency')

In [0]:
# number of counted tokens in every document (column sums of the sparse matrix)
doc_length = corpus_csc.sum(axis=0).A.ravel()
sr_doc_length = pd.Series(doc_length, name='doc_length')

### 2.3.2. Documents topic distribution

In [0]:
# unnormalised topic weights, one row per document of the corpus
doc_topic_weights = ldamodel.inference(corpus)[0]
# rescale every row so that it sums to 1
row_totals = doc_topic_weights.sum(axis=1, keepdims=True)
doc_topic_dists = doc_topic_weights / row_totals
# wrap in a dataframe with labelled axes
df_doc_topic_dists = pd.DataFrame(doc_topic_dists)
df_doc_topic_dists.index.name = 'doc'
df_doc_topic_dists.columns.name = 'topic'

### 2.3.3. Topic terms distribution

In [0]:
# topics term matrix (one row per topic): https://stackoverflow.com/questions/42289858/extract-topic-word-probability-matrix-in-gensim-ldamodel
topic_terms = ldamodel.state.get_lambda()
# normalise every topic row so that it sums to 1
topic_terms_dist_norm = topic_terms / topic_terms.sum(axis=1)[:, None]
# reorder the columns to follow terms_id
topic_term_dists = topic_terms_dist_norm[:, terms_id]
topic_term_dists_df = pd.DataFrame(topic_term_dists)
# Label the axes of the topic-term frame itself. These two lines used to
# rename the axes of df_doc_topic_dists, overwriting its 'doc'/'topic' labels
# and leaving this frame unlabelled.
topic_term_dists_df.index.name = 'topic'
topic_term_dists_df.columns.name = 'term'

### 2.3.4. Topic proportion

In [0]:
# weight of every topic: each document's topic distribution multiplied by the
# document's length, then summed over all documents
topic_freq = (df_doc_topic_dists.T * sr_doc_length).T.sum()

In [0]:
# share of every topic in the total weight, largest first
topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)

### 2.3.5. Sorting all data according to topic proportion

In [0]:
# topic labels in order of decreasing proportion
topic_order = topic_proportion.index

In [0]:
# topic weights rearranged into that order
topic_freq_ordered = topic_freq[topic_order]

In [0]:
# rows (topics) of the topic-term table rearranged into that order
topic_term_dists_ordered = topic_term_dists_df.iloc[topic_order]

In [0]:
# columns (topics) of the document-topic table rearranged into that order
doc_topic_dists_ordered = df_doc_topic_dists[topic_order]

### 2.3.6. Marginal distribution over terms

In [0]:
# relative frequency of every term over the whole corpus (sums to 1)
term_proportion = sr_term_frequency / sr_term_frequency.sum()

### 2.3.7. Saliency

In [0]:
# every term column divided by its sum over topics
topic_given_term = topic_term_dists_ordered / topic_term_dists_ordered.sum()
# per topic and term: that share times the log of (share / topic proportion)
kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
# summed over topics: one distinctiveness value per term
distinctiveness = kernel.sum()
# saliency = corpus-wide term proportion times distinctiveness
saliency = term_proportion * distinctiveness

In [0]:
# Number of top terms kept per table (the default view below takes head(R))
R = 30
# note for interface: use R = min(R, len(sr_terms_id)), i.e. fall back to the
# number of terms when the vocabulary holds fewer than R terms

In [0]:
# stats for all topics (the "Default" category): Freq and Total both carry the
# corpus-wide term frequency
# NOTE(review): 'Term' is filled with sr_terms_id, i.e. integer ids rather than
# the term strings - confirm this is intended
default_term_info = pd.DataFrame({
  'saliency': saliency,
  'Term': sr_terms_id,
  'Freq': sr_term_frequency,
  'Total': sr_term_frequency,
  'Category': 'Default'})

In [0]:
# Sort terms for the "default" view by decreasing saliency and keep only the R first lines.
# `columns=` replaces the positional axis argument of drop(), which pandas 2.0
# no longer accepts.
default_term_info = default_term_info.sort_values(
  by='saliency', ascending=False).head(R).drop(columns='saliency')
# Rounding Freq and Total down to integer values
default_term_info['Freq'] = np.floor(default_term_info['Freq'])
default_term_info['Total'] = np.floor(default_term_info['Total'])
# Descending rank placeholders for logprob/loglift. Sized from the frame itself,
# because head(R) returns fewer than R rows when there are fewer than R terms.
ranks = np.arange(len(default_term_info), 0, -1)
default_term_info['logprob'] = default_term_info['loglift'] = ranks

### 2.3.8. Relevance and top terms for each topic

In [0]:
# log of (topic-term probability / corpus-wide term proportion)
log_lift = np.log(topic_term_dists_ordered / term_proportion)
# log of the topic-term probability
log_ttd = np.log(topic_term_dists_ordered)
# lambda values from 0 up to about 1 in steps of 0.01, used to weight the two logs
lambda_seq = np.arange(0, 1 + 0.01, 0.01) # lambda_step=0.01

In [0]:
def topic_top_term_df(tup):
    """Build the term table of one topic.

    `tup` is `(new_topic_id, (original_topic_id, topic_terms))`, i.e. one item
    of `enumerate(<frame>.iterrows(), 1)`. The function reads the notebook-level
    names `vocab`, `term_topic_freq`, `term_frequency`, `log_ttd` and `log_lift`.
    NOTE(review): `vocab` and `term_topic_freq` are not assigned in the visible
    notebook cells.
    """
    new_topic_id, row = tup
    original_topic_id, topic_terms = row
    # distinct term ids listed for this topic
    term_ix = topic_terms.unique()
    columns = {
        'Term': vocab[term_ix],
        'Freq': term_topic_freq.loc[original_topic_id, term_ix],
        'Total': term_frequency[term_ix],
        'logprob': log_ttd.loc[original_topic_id, term_ix].round(4),
        'loglift': log_lift.loc[original_topic_id, term_ix].round(4),
        'Category': 'Topic%d' % new_topic_id,
    }
    return pd.DataFrame(columns)

In [0]:
from joblib import Parallel, delayed, cpu_count

In [167]:
# Number of joblib workers (-1 = all CPUs). `n_jobs` used to be referenced in
# this cell without ever being defined, which raised a NameError.
n_jobs = -1
# NOTE: `_find_relevance_chunks` and `_job_chunks` are defined in a cell further
# down the notebook; that cell has to be run before this one.
# Rank the terms of every topic for each lambda, one chunk of lambdas per job.
top_terms = pd.concat(Parallel(n_jobs=n_jobs)
                          (delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls)
                          for ls in _job_chunks(lambda_seq, n_jobs)))
# one term table per topic, numbered from 1 (lazy: consumed by the concat below)
topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))

NameError: ignored

In [0]:
# Stack the default (all-topics) table on top of the per-topic tables.
# NOTE(review): the result is only displayed, not assigned - the `topic_info`
# used further down is not set by this cell.
pd.concat([default_term_info] + list(topic_dfs), sort=True)

In [0]:
def _token_table(topic_info, term_topic_freq, vocab, term_frequency):
    # last, to compute the areas of the circles when a term is highlighted
    # we must gather all unique terms that could show up (for every combination
    # of topic and value of lambda) and compute its distribution over topics.

    # term-topic frequency table of unique terms across all topics and all values of lambda
    term_ix = topic_info.index.unique()
    term_ix = np.sort(term_ix)

    top_topic_terms_freq = term_topic_freq[term_ix]
    # use the new ordering for the topics
    K = len(term_topic_freq)
    top_topic_terms_freq.index = range(1, K + 1)
    top_topic_terms_freq.index.name = 'Topic'

    # we filter to Freq >= 0.5 to avoid sending too much data to the browser
    token_table = pd.DataFrame({'Freq': top_topic_terms_freq.unstack()})\
        .reset_index().set_index('term').query('Freq >= 0.5')

    token_table['Freq'] = token_table['Freq'].round()
    token_table['Term'] = vocab[token_table.index.values].values
    # Normalize token frequencies:
    token_table['Freq'] = token_table.Freq / term_frequency[token_table.index]
    return token_table.sort_values(by=['Term', 'Topic'])

In [0]:
def _chunks(l, n):
    """ Yield successive n-sized chunks from l.
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]


def _job_chunks(l, n_jobs):
    """Split `l` into consecutive chunks, at most one per parallel worker.

    `n_jobs` follows joblib's convention: a positive value is the number of
    workers, a negative value means `cpu_count() + 1 + n_jobs` workers
    (-1 = all CPUs). Returns the generator produced by `_chunks`.

    The previous version computed `cpu_count() + 1 - n_jobs` (sign error) and
    passed the worker count to `_chunks` as a chunk *size*, so the number of
    chunks did not match the number of workers.
    """
    n_workers = n_jobs
    if n_jobs < 0:
        # joblib convention: -1 -> every CPU, -2 -> all but one, ...
        n_workers = cpu_count() + 1 + n_jobs
    # never fewer than one worker (covers n_jobs == 0 and very negative values)
    n_workers = max(1, n_workers)

    # ceiling division: the chunk size that yields at most n_workers chunks;
    # at least 1 so that an empty `l` does not produce a zero step
    chunk_size = max(1, -(-len(l) // n_workers))
    return _chunks(l, chunk_size)
    
def _find_relevance(log_ttd, log_lift, R, lambda_):
    relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
    return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)


def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    """Run `_find_relevance` for every lambda in `lambda_seq` and stack the results in that order."""
    per_lambda = []
    for lambda_ in lambda_seq:
        per_lambda.append(_find_relevance(log_ttd, log_lift, R, lambda_))
    return pd.concat(per_lambda)

## 3.3. Result (data files)

In [0]:
# NOTE(review): `_topic_info`, `term_topic_freq`, `vocab` and `lambda_step` are
# not defined in the visible notebook cells, which matches the NameError
# recorded for this cell.
topic_info = _topic_info(topic_term_dists, topic_proportion,
                             term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)

NameError: ignored

In [0]:
topic_info

Unnamed: 0,Category,Freq,Term,Total,loglift,logprob
458,Default,1001.000000,italiano,1001.000000,30.0000,30.0000
278,Default,457.000000,dispaccio,457.000000,29.0000,29.0000
995,Default,341.000000,venire,341.000000,28.0000,28.0000
456,Default,504.000000,italia,504.000000,27.0000,27.0000
384,Default,311.000000,giornale,311.000000,26.0000,26.0000
...,...,...,...,...,...,...
1463,Topic6,6.049451,leggere,165.590347,-0.0748,-6.7254
638,Topic6,6.078680,parigi,168.948654,-0.0900,-6.7206
531,Topic6,6.484509,mentire,223.855637,-0.3068,-6.6560
295,Topic6,6.376319,dovere,219.120758,-0.3023,-6.6728


In [0]:
import csv
# write the term table to 'topic_info.csv' in the working directory
topic_info.to_csv('topic_info.csv')

In [0]:
# write the topic proportions to 'topic_proportion.csv' in the working directory
topic_proportion.to_csv('topic_proportion.csv')

In [0]:
topic_proportion

topic
1    0.319186
3    0.196606
2    0.180878
0    0.143365
4    0.120596
5    0.039370
dtype: float32