In [267]:
# dataset can be obtained via https://zenodo.org/record/4596345#.Yk2flG5Bz0o
# folder containing one or more sub-folders 
# sub-folders are publications
# within each sub-folder, collection of text files that each constitute a document
# for now, supported naming pattern for text files is as such: '/sn86069873/1900-01-05/'
# LCCN title information and publication date (yyyy-mm-dd)
# users will need to provide publication names that match title information

In [302]:
# indicate name of the folder containing data for ex 'data_tm_workflow'
folder_path = '/Users/mariellacc/Documents/TM/CI_newspaper_subcorpora'

In [303]:
# create a folder for ex 'output_tm_workflow' to store the output files
output_folder = '/Users/mariellacc/Documents/TM/output_tm_workflow'

# 1. Data preparation

In [304]:
import os
import pandas as pd
import re
import datetime
from datetime import date # timedelta, 

## 1.1. Creating data frame
A dataframe is first created to keep the documents at their initial state, and the name of each file

In [305]:
publications_list = os.listdir(folder_path) 

In [306]:
#insert file names into a df
publications = pd.DataFrame(publications_list, columns=['pub_name'])

In [307]:
publications.head()

Unnamed: 0,pub_name
0,L'Italia
1,La_Tribuna_del_Connecticut
2,Il_Patriota
3,La_Rassegna
4,Cronaca_Sovversiva


In [308]:
files_list = []

for pub in publications_list:
    files = os.listdir(f"{folder_path}/{pub}/")
    files_list.append(files)

In [309]:
files_list_flat = [item for sublist in files_list for item in sublist]

In [310]:
#insert file names into a df
sources = pd.DataFrame(files_list_flat, columns=['file_name'])

In [311]:
sources.head()

Unnamed: 0,file_name
0,sn85066408_1913-03-10_ed-1_seq-1_ocr.txt
1,sn85066408_1910-06-16_ed-1_seq-1_ocr.txt
2,sn85066408_1915-05-09_ed-1_seq-1_ocr.txt
3,sn85066408_1898-03-22_ed-1_seq-1_ocr.txt
4,sn85066408_1911-09-01_ed-1_seq-1_ocr.txt


## 1.2. Adding columns for dates, publications and filtering dataset

In [312]:
#get metadata file

metadata_file_path = '/Users/mariellacc/Documents/TM/Metadata_ChroniclItaly 3.0.xlsx'

In [313]:
metadata_df = pd.read_excel(metadata_file_path, header=1)

In [314]:
metadata_df

Unnamed: 0,Title,Place of publication,Publisher,Issues,Number of issues,Total words before intervention,Total words after intervention,Description,Frequency,LCCN,OCLC,ISSN,Related Links,Holdings,History,Notes
0,La sentinella del West Virginia,"Thomas, W. Va.",Sentinel Publ. Co.,From 18/02/1911 to 11/05/1912,53,124988,86839,"Began in Feb., 1905 - Ceased in 1913?",Weekly,sn 86092310,13261125,2473-4918,http://chroniclingamerica.loc.gov/lccn/sn86092...,https://chroniclingamerica.loc.gov/lccn/sn8609...,"West Virginia’s only Italian periodical, La Se...","""Giornale independente."" ""Primo ed unico perio..."
1,L'Italia,"San Francisco, Calif.",Italian Pub. Co.,From 25/01/1897 to 31/12/1919,6489,24584681,17123478,"58mo anno, n. 27 (genn. 31, 1943)\nBegan in 1887",Daily (except Sun. and Mon.),sn 85066408,13073199,/,/,https://chroniclingamerica.loc.gov/lccn/sn8506...,Alternative Titles:\nDaily news\nItalia di Dom...,"Description based on: Vol. 3, no. 12 (genn. 28..."
2,"Cronaca Sovversiva (Barre, Vt.)","Barre, Vt.",Cronaca Sovversiva,From 06/06/1903 to 01/05/1919,771,1825758,1289239,"Ceased in 1920.\nVol. 1, no. 1 (June 6, 1903)-","Weekly Jan. 17, 1920-<Oct. 20, 1920>",2012271201,30383424,2376-3701,http://chroniclingamerica.loc.gov/lccn/2012271...,https://chroniclingamerica.loc.gov/lccn/201227...,Subjects covered: Anarchism--Italy--Periodical...,Archived issues are available in digital forma...
3,La Libera Parola,"Philadelphia, Pa.",A.G. di Silvestro,From 20/04/1918 to 23/12/1922,241,1079122,752309,"20 apr., 1918-July 19, 1969",Weekly,sn 85055164,12632841,2373-373X,http://www.loc.gov/chroniclingamerica/lccn/sn8...,https://chroniclingamerica.loc.gov/lccn/sn8505...,In 1899 Arpino Di Silvestro began publishing a...,"""Italian weekly newspaper.""\n""Published every ..."
4,Il Patriota,"Indiana, Pa.",Patriot Pub. Co.,From 08/08/1914 to 22/10/1921,227,786762,547361,"Ceased in Dec. 1955?\nVol. 1, no. 1 (Aug. 8, 1...",Weekly,sn 85054967,2265753,2373-3713,http://www.loc.gov/chroniclingamerica/lccn/sn8...,https://chroniclingamerica.loc.gov/lccn/sn8505...,Westmoreland and Lycoming Counties provided la...,"""Settimanale indipente bilingue,"" <Aug. 2, 191..."
5,La Ragione,"Philadelphia, Pa.",F. Silvagni,From 25/04/1917 to 23/08/1917,40,126937,86588,"Anno 1, no. 1 (25 apr. 1917)-",Irregular,sn 84037024,10916544,2372-9872,http://www.loc.gov/chroniclingamerica/lccn/sn8...,https://chroniclingamerica.loc.gov/lccn/sn8403...,"Founded in 1917, La Ragione (“The Reason”) was...","""Organo di difesa della italianita, contro i v..."
6,La Rassegna,"Philadelphia, Pa.",La Rassegna Pub. Co.,From 07/04/1917 to 25/08/1917,25,86401,60116,"Anno 1, no. 1 (7 apr. 1917)-",Weekly (irregular),sn 84037025,10916483,2372-9910,http://www.loc.gov/chroniclingamerica/lccn/sn8...,https://chroniclingamerica.loc.gov/lccn/sn8403...,La Rassegna (“The Review”) was a short-lived w...,"""Italian weekly newspaper devoted to welfare a..."
7,L'Indipendente,"New Haven, Conn.",L'indipendente Pub. Co.,From 01/01/1907 to 23/05/1936,48,133872,90758,Began in 1903,Weekly,sn 93053873,27774408,2643-3095,https://chroniclingamerica.loc.gov/lccn/sn9305...,https://chroniclingamerica.loc.gov/lccn/sn9305...,"Headquartered in Wooster Square, one of two ne...","""Established 1903""--Genn. 22, 1922 issue. Also..."
8,La Sentinella,"Bridgeport, Conn.",P. Alteri,From 17/04/1920 to 27/12/1930,518,1674336,1193015,"-anno. 35, no. 36 (10 sett. 1948)",Weekly,sn 84020351,10652874,2643-5098,http://www.loc.gov/chroniclingamerica/lccn/sn8...,https://chroniclingamerica.loc.gov/lccn/sn8402...,"In November 1977, the Bridgeport Post noted th...",Archived issues are available in digital forma...
9,La Tribuna del Connecticut,"Bridgeport, Conn.",Tribuna Pub. Co.,From 03/03/1906 to 09/12/1908,130,330085,224752,"Anno 1, no. 1 (mar. 3, 1906)-anno 2, no. 44 (d...",Weekly,sn 92051386,26498347,2643-5306,http://www.loc.gov/chroniclingamerica/lccn/sn9...,https://chroniclingamerica.loc.gov/lccn/sn9205...,"In the inaugural March, 1906 issue of La Tribu...",Also issued on microfilm from Connecticut Stat...


In [315]:
#LCCN column contains spaces that need to be removed
pub_refs = list(metadata_df['LCCN'].str.replace(" ", ""))
pub_names = list(metadata_df['Title'])

In [316]:
# Alternatively add refs and names manually

# users need to input refs and pub names in same order
# pub_refs = ["sn85066408","2012271201","sn84020351","sn85054967","sn84037024","sn84037025","sn85055164","sn86092310","sn92051386","sn93053873"]
# pub_names = ["L\'Italia","Cronaca Sovversiva","La Sentinella","Il Patriota","La Ragione","La Rassegna","La Libera Parola","La Sentinella del West","La Tribuna del Connecticut","L\'Indipendente"]   

### 1.2.1. dates, publications


In [317]:
# get publication ref from file name
def get_ref(file):
  ref_match = re.findall(r'(\w+\d+)_\d{4}-\d{2}-\d{2}_',file)
  return ref_match[0]

# get date from file name
def get_date(file):
  date_match = re.findall(r'_(\d{4}-\d{2}-\d{2})_',file)
  return date_match[0]


#this part probably not needed
'''
# get year from file name
def get_year(file):
  year_match = re.findall(r'_(\d{4})-\d{2}-\d{2}_',file)
  return year_match[0]

# get month from file name
def get_month(file):
  month_match = re.findall(r'_\d{4}-(\d{2})-\d{2}_',file)
  return month_match[0]

# get day from file name
def get_day(file):
  month_match = re.findall(r'_\d{4}-\d{2}-(\d{2})_',file)
  return month_match[0]
'''

"\n# get year from file name\ndef get_year(file):\n  year_match = re.findall(r'_(\\d{4})-\\d{2}-\\d{2}_',file)\n  return year_match[0]\n\n# get month from file name\ndef get_month(file):\n  month_match = re.findall(r'_\\d{4}-(\\d{2})-\\d{2}_',file)\n  return month_match[0]\n\n# get day from file name\ndef get_day(file):\n  month_match = re.findall(r'_\\d{4}-\\d{2}-(\\d{2})_',file)\n  return month_match[0]\n"

In [318]:
sources['date'] = sources['file_name'].apply(lambda x: get_date(x))
sources['publication'] = sources['file_name'].apply(lambda x: get_ref(x))
#sources['year'] = sources['file_name'].apply(lambda x: get_year(x))
#sources['month'] = sources['file_name'].apply(lambda x: get_month(x))
#sources['day'] = sources['file_name'].apply(lambda x: get_day(x))

In [319]:
sources['publication_name'] = sources['publication'].replace(pub_refs, pub_names)


In [353]:
sources["date"] = pd.to_datetime(sources["date"])

In [354]:
sources.head()

Unnamed: 0,file_name,date,publication,publication_name
0,sn85066408_1913-03-10_ed-1_seq-1_ocr.txt,1913-03-10,sn85066408,L'Italia
1,sn85066408_1910-06-16_ed-1_seq-1_ocr.txt,1910-06-16,sn85066408,L'Italia
2,sn85066408_1915-05-09_ed-1_seq-1_ocr.txt,1915-05-09,sn85066408,L'Italia
3,sn85066408_1898-03-22_ed-1_seq-1_ocr.txt,1898-03-22,sn85066408,L'Italia
4,sn85066408_1911-09-01_ed-1_seq-1_ocr.txt,1911-09-01,sn85066408,L'Italia


In [355]:
sources.tail()

Unnamed: 0,file_name,date,publication,publication_name
8648,sn84037024_1917-07-24_ed-5_seq-1_ocr.txt,1917-07-24,sn84037024,La Ragione
8649,sn84037024_1917-05-05_ed-3_seq-1_ocr.txt,1917-05-05,sn84037024,La Ragione
8650,sn84037024_1917-06-30_ed-2_seq-1_ocr.txt,1917-06-30,sn84037024,La Ragione
8651,sn84037024_1917-07-24_ed-1_seq-1_ocr.txt,1917-07-24,sn84037024,La Ragione
8652,sn84037024_1917-05-16_ed-4_seq-1_ocr.txt,1917-05-16,sn84037024,La Ragione


In [356]:
# checking that there is no empty values
sources.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8653 entries, 0 to 8652
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   file_name         8653 non-null   object        
 1   date              8653 non-null   datetime64[ns]
 2   publication       8653 non-null   object        
 3   publication_name  8653 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 270.5+ KB


### 1.2.2. Filtering / Creating a subset of the dataset

- Setting-up a visualization to assist in choosing the dates by showing the number of documents available by collection and by date 

In [324]:
import duckdb
from observable_jupyter import embed
import ipywidgets as widgets
from ipywidgets import HBox, VBox

In [325]:
# Setting up inputs widgets
# The "agg" input has an impact on data preparation, as it will determine how the collection is aggregated
# All other inputs are for display purpose only and they don't have an impact on data 
color = widgets.RadioButtons(
    description='Color',
    options= ['blue', 'green', 'grey', 'orange'],
    value='blue',
)

agg = widgets.RadioButtons(
    description='Count articles by:',
    options= ['day', 'month', 'year'],
    value='month',
)

scale = widgets.RadioButtons(
    description='Scale by:',
    options= ['height', 'color'],
    value='color',
)

label = widgets.RadioButtons(
    description='Set-up ticks scale',
    options= ['5 year', 'year', 'month', 'day'],
    value='year',
)

In [326]:
def prepare_viz_data(agg, df, col):

    if agg == 'month':
        query = f"SELECT strptime(concat(month, '/', year), '%m/%Y') as date, {col} as publication_name, count FROM (SELECT YEAR(date) as year, MONTH(date) as month, {col}, count(*) as count FROM df GROUP BY {col}, YEAR(date), MONTH(date))"

    elif agg == 'year':
        query = f"SELECT strptime(year, '%Y') as date, {col} as publication_name, count FROM (SELECT YEAR(date) as year, {col}, count(*) as count FROM df GROUP BY {col}, YEAR(date))"

    elif agg == 'day':
        query = f"SELECT strptime(concat('01/', month, '/', year), '%d/%m/%Y') as date, {col} as publication_name, count FROM (SELECT YEAR(date) as year, MONTH(date) as month, {col}, count(*) as count FROM df GROUP BY {col}, YEAR(date), MONTH(date), DAY(date))"

    queried_df = duckdb.query(query).df()

    queried_df = queried_df.astype(str)

    viz_data = queried_df.to_dict('records')

    return viz_data

    

In [327]:
# user inputs
left_box = VBox([color, agg])
right_box = VBox([scale, label])
HBox([left_box, right_box])

HBox(children=(VBox(children=(RadioButtons(description='Color', options=('blue', 'green', 'grey', 'orange'), v…

In [328]:
# the column on which the aggregation will be performed, here we count the articles for each publication accross time
col = "publication_name"

In [329]:
# rerun this cell after modifying inputs
embed('@dharpa-project/timestamped-corpus', cells=['viewof chart', 'style'], inputs={'data':prepare_viz_data(agg.value, sources, col), "timeSelected":agg.value, "userColor":color.value, "scaleType":scale.value})


- Apply filter to dataset

In [381]:
# Start from 06.06.1903 and finish 01.05.1919
date_ref_1 = "1903-6-6"
date_ref_2 = "1919-5-1"

In [383]:
query = f"SELECT * FROM sources WHERE date <= DATE '{date_ref_2}' AND date >= DATE '{date_ref_1}'"

In [384]:
filtered_df = duckdb.query(query).df()

In [385]:
filtered_df

Unnamed: 0,file_name,date,publication,publication_name
0,sn85066408_1913-03-10_ed-1_seq-1_ocr.txt,1913-03-10,sn85066408,L'Italia
1,sn85066408_1910-06-16_ed-1_seq-1_ocr.txt,1910-06-16,sn85066408,L'Italia
2,sn85066408_1915-05-09_ed-1_seq-1_ocr.txt,1915-05-09,sn85066408,L'Italia
3,sn85066408_1911-09-01_ed-1_seq-1_ocr.txt,1911-09-01,sn85066408,L'Italia
4,sn85066408_1904-11-30_ed-1_seq-1_ocr.txt,1904-11-30,sn85066408,L'Italia
...,...,...,...,...
6522,sn84037024_1917-07-24_ed-5_seq-1_ocr.txt,1917-07-24,sn84037024,La Ragione
6523,sn84037024_1917-05-05_ed-3_seq-1_ocr.txt,1917-05-05,sn84037024,La Ragione
6524,sn84037024_1917-06-30_ed-2_seq-1_ocr.txt,1917-06-30,sn84037024,La Ragione
6525,sn84037024_1917-07-24_ed-1_seq-1_ocr.txt,1917-07-24,sn84037024,La Ragione


- Visual check that the query went ok

In [392]:
embed('@dharpa-project/timestamped-corpus', cells=['viewof chart', 'style'], inputs={'data':prepare_viz_data(agg.value, filtered_df, col), "timeSelected":agg.value, "userColor":color.value, "scaleType":scale.value})


### 1.2.3 Adding text content

In [14]:
# read the content of the text files
def readTxtContent(fileName):
  with open(folder_path + fileName, 'r') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [15]:
# add a column to the dataframe containing file content
corpus_df['file_content'] = corpus_df['file_name'].apply(lambda x: readTxtContent(x))

In [16]:
corpus_df.to_csv(output_folder + '/documents_list.csv')

## 1.3 Removing stop words, punctuation, short words

In [17]:
import nltk

In [18]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [19]:
# add tokenized documents in dataframe
corpus_df['tokens'] = corpus_df['file_content'].apply(lambda x: nltk.word_tokenize(x))

In [20]:
# possible user options:
# .isalnum() to removes tokens that include numbers
# .isalpha() to remove all tokens that contain more than letters (punctuation and numbers)
# .isdecimal() to remove tokens that contain only decimals
# .isdigit() to remove tokens that contain only digits

# add new column in df with processed tokens (here: keeping only alpha tokens longer than 3 characters + lowercasing)
corpus_df['doc_prep'] = corpus_df['tokens'].apply(lambda x: [w.lower() for w in x if (w.isalpha() and len(w) > 2 )])

In [21]:
# link to custom stop words: https://drive.google.com/file/d/1VVfW6AKPbb7_fICOG73lEgkXmmZ6BkpC/view?usp=sharing
# Upload stop words list into Colab files before proceeding with the next cells

In [22]:
from nltk.corpus import stopwords
ital_stopwords = stopwords.words('italian')
en_stopwords = stopwords.words('english')

In [23]:
stop_words = pd.read_csv('stop_words.csv')

In [24]:
stopwords = stop_words['stopword'].values.tolist()

In [25]:
# add english stop words list to custom stopwords 
stopwords.extend(en_stopwords)

In [26]:
# to append list of words added by user: ital_stopwords.extend(user_input)
# to remove words: ital_stopwords.remove(user_input)

In [27]:
# add column with tokenized documents without sw
corpus_df['doc_prep_nostop'] = corpus_df['doc_prep'].apply(lambda x: [w for w in x if not w in stopwords])

In [28]:
corpus_df['doc_prep_nostop']

0      [numero, ebdomadario, anàrchico, propaganda, r...
1      [ebdomadario, anarchico, propaganda, rivoluzio...
2      [numero, vver, ebdomadario, anarchico, propaga...
3      [ebdomadàrio, anarchico, propaganda, rivoluzio...
4      [numero, ebdomadario, anarchico, propaganda, r...
                             ...                        
766    [nefcaper, fcas, jsc, octoberth, tbcrcfore, en...
767    [xvi, nxn, jvi, affogano, stringono, laccio, g...
768    [xvi, lynn, july, ith, num, corrispondenze, le...
769    [xvn, und, heute, geht, eine, neue, epoche, de...
770    [xvn, cose, francia, parigi, rao, villain, gil...
Name: doc_prep_nostop, Length: 771, dtype: object

In [29]:
# set the variable to use for topic modelling (if no further options are used)
corpus_model = corpus_df['doc_prep_nostop']

In [30]:
# save the data after pre-processing in the output folder for verification of pre-processing steps
# corpus_df.to_csv(output_folder + '/corpus_df.csv')
corpus_df.to_csv(output_folder + '/corpus_df.csv')

## 1.4 Lemmatize

In [31]:
# Lemmatization is available in multiple languages in Spacy and not in NLTK (only English)
# With Spacy, lemmatization is available for 10 languages. There's also a multi-language option that
# should be tested if additional languages are needed

In [38]:
!python3 -m spacy download it_core_news_sm

/bin/bash: spacy: command not found


In [39]:
import it_core_news_sm
it_nlp = it_core_news_sm.load(disable=['tagger', 'parser', 'ner'])

In [40]:
# lemmatization function
def lemmatize(doc):
  lemmatized_doc = []
  for w in doc:
    w_lemma = [token.lemma_ for token in it_nlp(w)]
    lemmatized_doc.append(w_lemma[0])
  return lemmatized_doc

In [41]:
# add column with lemmatized tokens - directly from the tokens as preprocessing has already been done
corpus_df['doc_lemmatized'] = corpus_df['doc_prep_nostop'].apply(lambda x: lemmatize(x))

In [42]:
# variable with lemmatized tokens
lemmatized_corpus = corpus_df['doc_lemmatized']
# the lemmatized version is not used in this example

# 2. Topics with LDA

In [43]:
#Gensim installation
! pip install --user gensim

Collecting gensim
  Downloading gensim-3.8.3-cp38-cp38-manylinux1_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 7.9 MB/s eta 0:00:01     |████████████████████████▊       | 18.7 MB 7.9 MB/s eta 0:00:01
Collecting scipy>=0.18.1
  Downloading scipy-1.5.4-cp38-cp38-manylinux1_x86_64.whl (25.8 MB)
[K     |████████████████████████████████| 25.8 MB 391 kB/s  eta 0:00:01
[?25hCollecting smart-open>=1.8.1
  Downloading smart_open-4.0.1.tar.gz (117 kB)
[K     |████████████████████████████████| 117 kB 46.6 MB/s eta 0:00:01
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py) ... [?25ldone
[?25h  Created wheel for smart-open: filename=smart_open-4.0.1-py3-none-any.whl size=108247 sha256=2ba362e708a3cb1cf07c93f3cea9f55db86e94ae7efdf19a58b493666d354fed
  Stored in directory: /home/jovyan/.cache/pip/wheels/8c/f9/f4/4ddd9ddee3488f48be20e9bf3108961f03ae23da29b7ed26d1
Successfully built smart-open
Installing collected packages: scip

In [45]:
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim import corpora, models

## 2.1 Preliminary steps to run LDA



### 2.1.1 Creating the dictionary, optional filtering of extreme values

In [46]:
# if you use lemmatized version replace "corpus_model" by lemmatized_corpus
id2word = corpora.Dictionary(corpus_model)

In [47]:
id2word.filter_extremes(no_below=5)

### 2.1.2 Bag of words

In [48]:
corpus = [id2word.doc2bow(text) for text in corpus_model]

## 2.2 LDA

### 2.2.1. Option 1: Train a model without coherence calculation



In [49]:
# set the nr of topics
num_topics = 7

In [50]:
# faster preset (hypothetically less accurate)
model = gensim.models.ldamulticore.LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, eval_every = None)
# slower preset (hypothetically more accurate)
# model = gensim.models.ldamulticore.LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, chunksize=1000, iterations = 200, passes = 10, eval_every = None)
# slowest preset (hypothetically even more accurate)
# model = gensim.models.ldamulticore.LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, chunksize=2000, iterations = 400, passes = 20, eval_every = None)


In [51]:
topic_print_model = model.print_topics(num_words=30)

In [52]:
df_topic_print_model = pd.DataFrame(topic_print_model, columns=['topic_id','words'])

In [53]:
df_topic_print_model.to_csv(output_folder +'/topics_without_coherence.csv')

### 2.2.2. Option 2: Train models and compute coherence

Topic Coherence measures score of a single topic by measuring the degree of semantic similarity between high scoring words in the topic. https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [54]:
from gensim.models.coherencemodel import CoherenceModel

In [None]:
# is it possible to output the models and save them somewhere?

In [56]:
%%capture
topics_nr = []
coherence_values_gensim = []
models = []
models_idx = [x for x in range(3,20)]
for num_topics in range(3, 20):
  # fastest processing time preset (hypothetically less accurate)
  model = gensim.models.ldamulticore.LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, eval_every = None)
  # slower processing time preset (hypothetically more accurate) approx 20min for 700 short docs
  # model = gensim.models.ldamulticore.LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, chunksize=1000, iterations = 200, passes = 10, eval_every = None)
  # slowest processing time preset approx 35min for 700 short docs (hypothetically even more accurate)
  # model = gensim.models.ldamulticore.LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, chunksize=2000, iterations = 400, passes = 20, eval_every = None)
  models.append(model)
  coherencemodel = CoherenceModel(model=model, texts=corpus_model, dictionary=id2word, coherence='c_v')
  coherence_value = coherencemodel.get_coherence()
  coherence_values_gensim.append(coherence_value)
  topics_nr.append(str(num_topics))

In [57]:
df_coherence = pd.DataFrame(topics_nr, columns=['Number of topics'])

In [58]:
df_coherence['Coherence'] = coherence_values_gensim

In [60]:
# save into outputs drive folder
df_coherence.to_csv(output_folder + '/coherence_values.csv')

In [61]:
# Create list with topics and topic words for each number of topics
num_topics_list = []
topics_list = []
for i in range(len(models_idx)):
  numtopics = models_idx[i]
  num_topics_list.append(numtopics)
  model = models[i]
  topic_print = model.print_topics(num_words=30)
  topics_list.append(topic_print)
  df_topic_print = pd.DataFrame(topic_print, columns=['topic_id','words'])

In [62]:
df_coherence_table = pd.DataFrame(columns=['Number of topics', 'Topic', 'Topic words'])

In [63]:
idx = 0
for i in range(len(topics_list)):
  for j in range(len(topics_list[i])):
      df_coherence_table.loc[idx] = ''
      df_coherence_table['Number of topics'].loc[idx] = num_topics_list[i]
      df_coherence_table['Topic'].loc[idx] = j + 1
      df_coherence_table['Topic words'].loc[idx] = ', '.join(re.findall(r'"(\w+)"',topics_list[i][j][1]))
      idx += 1

In [65]:
# save file to output folder
df_coherence_table.to_csv(output_folder +'/coherence_table.csv')

### 2.2.3. Choose the number of topics for further processing 

In [66]:
# select the model to display the rest of the steps by typing the desired number of topics below
numtopics = 6

In [67]:
idx = models_idx.index(numtopics)
model = models[idx]

# Topic visualisation

## Data Preparation

In [76]:
# the file import below is code reproduced from pyLDAvis module to compute calculations for terms relevance
# result of computations is then displayed in visualisation (not displayed in this notebook as will be
# handled by interface)

In [68]:
!wget https://raw.githubusercontent.com/DHARPA-Project/TopicModelling-/master/vis-files/tm_1/tm1_data_prepare_gensim.py

--2020-12-03 09:37:43--  https://raw.githubusercontent.com/DHARPA-Project/TopicModelling-/master/vis-files/tm_1/jscode-test-2.js
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 219774 (215K) [text/plain]
Saving to: ‘jscode-test-2.js’


2020-12-03 09:37:43 (1.66 MB/s) - ‘jscode-test-2.js’ saved [219774/219774]

--2020-12-03 09:37:44--  https://raw.githubusercontent.com/DHARPA-Project/TopicModelling-/master/vis-files/tm_1/tm1_data_prepare_gensim.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.112.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22824 (22K) [text/plain]
Saving to: ‘tm1_data_prepare_gensim.py’


2020-12-03 09:37:45 (2.04 MB/s) - ‘tm1_data_prep

In [74]:
%run -i tm1_data_prepare_gensim

In [77]:
data_vis = prepare(model, corpus, id2word, doc_topic_dist=None)

In [78]:
topic_info = data_vis[1][data_vis[1]['Category'] == 'Default'].copy()

In [79]:
topic_info['relevance'] = ''

In [80]:
for i in range(numtopics):
  topic_num = i+1
  data = data_vis.sorted_terms(topic=topic_num, _lambda=.6).copy()
  topic_info = topic_info.append(data)

In [81]:
# uncomment to show the full output of a topic, replace 'Topic8' by the topic to display
# topic_info[topic_info['Category'] == 'Topic8']

In [82]:
topic_info.to_csv(output_folder + '/topic_info.csv')

In [83]:
topic_proportion = data_vis[0].copy()

In [84]:
topic_proportion['proportion'] = topic_proportion['Freq'] / 100

In [85]:
topic_proportion.index.name = 'topic_id'

In [86]:
topic_proportion.to_csv(output_folder + '/topic_proportion.csv')

## Visualisation

# Topic distribution

## 3.1. Distribution per topic and per document

In [87]:
ldamodel = model

In [88]:
doc_topic_weights = ldamodel.inference(corpus)[0]
doc_topic_dists = doc_topic_weights / doc_topic_weights.sum(axis=1)[:, None]

In [89]:
distribution = pd.DataFrame(doc_topic_dists,columns=[f'Topic {i+1}' for i in range(numtopics)])

In [90]:
distribution.index.name = 'document'

In [91]:
# add file names and publication name
distribution_wfiles = distribution.copy()
distribution_wfiles.insert(0, 'document_path', corpus_df['file_name'])
distribution_wfiles.insert(1, 'publication_name', corpus_df['publication_name'])
distribution_wfiles

Unnamed: 0_level_0,document_path,publication_name,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
document,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2012271201_1903-06-06_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.176089,0.041301,0.398105,0.257326,0.027687,0.099492
1,2012271201_1903-06-13_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.000375,0.274867,0.579776,0.010779,0.000375,0.133829
2,2012271201_1903-06-20_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.150134,0.060628,0.292415,0.091584,0.304601,0.100637
3,2012271201_1903-06-27_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.055123,0.000292,0.056557,0.715716,0.000292,0.172020
4,2012271201_1903-07-04_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.020901,0.082106,0.009186,0.420806,0.001505,0.465496
...,...,...,...,...,...,...,...,...
766,2012271201_1918-04-20_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.113515,0.013215,0.658471,0.174967,0.000585,0.039247
767,2012271201_1918-06-06_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.586583,0.008119,0.078928,0.050203,0.172351,0.103817
768,2012271201_1918-07-18_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.564316,0.047220,0.286974,0.063797,0.004725,0.032969
769,2012271201_1919-03-20_ed-1_seq-1_ocr.txt,Cronaca Sovversiva,0.531253,0.068512,0.060455,0.064128,0.072977,0.202675


## 3.2. Distribution per topic for the whole corpus

In [92]:
corpus_df['doc_length'] = corpus_df['doc_prep_nostop'].apply(lambda x: len(x))

In [93]:
topic_frequency = distribution.mul(corpus_df['doc_length'], axis=0).sum()

In [94]:
topic_proportion = (topic_frequency / topic_frequency.sum()*100).sort_values(ascending=False)

In [95]:
# dataframe with values to sort them by order of importance
distribution_df = pd.DataFrame(topic_proportion, columns=['weight'])

In [96]:
distribution_df.index.name = "topics"

In [97]:
distribution_df.to_csv(output_folder + '/topic_distribution.csv')

### 3.3. Distribution per publication


In [98]:
groupby2 = distribution_wfiles.copy()

In [99]:
distribution_title = groupby2.groupby(['publication_name']).mean().reset_index()
#distribution_title['publication'] = distribution_title['publication'].apply(lambda x: get_pub_name(x))

In [101]:
distribution_title.to_csv(output_folder + '/distribution_per_publication.csv')

# Topics weight over time

## 4.1 Aggregate document weights

In [None]:
# grouping data per month

In [102]:
distribution_wfiles['date'] = distribution_wfiles['document_path'].apply(lambda x: get_date(x))
distribution_wfiles['year'] = distribution_wfiles['document_path'].apply(lambda x: get_year(x))
distribution_wfiles['month'] = distribution_wfiles['document_path'].apply(lambda x: get_month(x))

In [103]:
groupby = distribution_wfiles.copy()

In [104]:
# drop columns that won't be needed at this stage
groupby = groupby.drop(['document_path', 'date'], axis=1)

In [105]:
# group by year 
distribution_year = groupby.groupby(['year']).mean().reset_index()

In [106]:
distribution_year

Unnamed: 0,year,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
0,1903,0.153074,0.095651,0.269898,0.163101,0.072754,0.245523
1,1904,0.170084,0.110531,0.299678,0.162599,0.086505,0.170604
2,1905,0.177552,0.096007,0.348418,0.118845,0.063774,0.195403
3,1906,0.114664,0.138756,0.302318,0.091972,0.067197,0.285093
4,1907,0.238096,0.091207,0.258298,0.098967,0.056151,0.257281
5,1908,0.226032,0.110558,0.277202,0.103002,0.05906,0.224146
6,1909,0.139784,0.083245,0.219822,0.225718,0.147174,0.184257
7,1910,0.125533,0.140624,0.278238,0.149218,0.118446,0.187942
8,1911,0.0962,0.149219,0.256739,0.185565,0.05096,0.261317
9,1912,0.205848,0.074879,0.332184,0.152758,0.062711,0.171621


In [107]:
distribution_year.to_csv(output_folder + '/distribution_per_year.csv')


## 4.2 Yearly average of normalized weight per topic

### 4.2.1. Line chart view 

### 4.2.2. Bar chart view

### 4.2.3. Topic weights per title over time



In [108]:
groupby3 = distribution_wfiles.copy()

In [109]:
distribution_title_time = groupby3.groupby(['year','publication_name']).mean().reset_index()

In [110]:
distribution_title_time

Unnamed: 0,year,publication_name,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
0,1903,Cronaca Sovversiva,0.153074,0.095651,0.269898,0.163101,0.072754,0.245523
1,1904,Cronaca Sovversiva,0.170084,0.110531,0.299678,0.162599,0.086505,0.170604
2,1905,Cronaca Sovversiva,0.177552,0.096007,0.348418,0.118845,0.063774,0.195403
3,1906,Cronaca Sovversiva,0.114664,0.138756,0.302318,0.091972,0.067197,0.285093
4,1907,Cronaca Sovversiva,0.238096,0.091207,0.258298,0.098967,0.056151,0.257281
5,1908,Cronaca Sovversiva,0.226032,0.110558,0.277202,0.103002,0.05906,0.224146
6,1909,Cronaca Sovversiva,0.139784,0.083245,0.219822,0.225718,0.147174,0.184257
7,1910,Cronaca Sovversiva,0.125533,0.140624,0.278238,0.149218,0.118446,0.187942
8,1911,Cronaca Sovversiva,0.0962,0.149219,0.256739,0.185565,0.05096,0.261317
9,1912,Cronaca Sovversiva,0.205848,0.074879,0.332184,0.152758,0.062711,0.171621


In [111]:
distribution_title_time.to_csv(output_folder + 'distribution_per_publication_over_time.csv')

### 4.2.4. Rolling average

#### 4.2.4.1. Rolling average by topic

In [112]:
rolling_mean = distribution_year.rolling(2).mean()

In [113]:
rolling_mean = rolling_mean[1:].copy()

In [114]:
def formatRolling(year):
  year1 = round(year - .5)
  year2 = round(year + .5)
  return "{}/{}".format(year1,year2)

In [115]:
rolling_mean['year'] = rolling_mean['year'].apply(lambda x: formatRolling(x))

In [116]:
rolling_mean

Unnamed: 0,year,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6
1,1903/1904,0.161579,0.103091,0.284788,0.16285,0.079629,0.208063
2,1904/1905,0.173818,0.103269,0.324048,0.140722,0.07514,0.183003
3,1905/1906,0.146108,0.117382,0.325368,0.105408,0.065486,0.240248
4,1906/1907,0.17638,0.114982,0.280308,0.095469,0.061674,0.271187
5,1907/1908,0.232064,0.100883,0.26775,0.100984,0.057606,0.240713
6,1908/1909,0.182908,0.096902,0.248512,0.16436,0.103117,0.204202
7,1909/1910,0.132658,0.111935,0.24903,0.187468,0.13281,0.186099
8,1910/1911,0.110867,0.144921,0.267489,0.167391,0.084703,0.224629
9,1911/1912,0.151024,0.112049,0.294461,0.169161,0.056835,0.216469
10,1912/1913,0.23087,0.089384,0.324001,0.119202,0.074429,0.162116


In [117]:
rolling_mean.to_csv(output_folder + '/rolling_mean.csv')

#### 4.2.4.1. Rolling average by publication

In [None]:
# Filtering the distribution over time by publication
# in the cell below replace 'Cronaca Sovversiva' by the publication to display
# and for the output file, each time the filter is applied on a publication, change the output file name

In [118]:
distribution_publication_time = distribution_title_time[distribution_title_time['publication_name'] == 'Cronaca Sovversiva']

In [119]:
rolling_mean_publication = distribution_publication_time.rolling(2).mean()

In [120]:
rolling_mean_publication.to_csv(output_folder + 'rolling_mean_cronaca_sovversiva.csv')