In [3]:
import json
import urllib
import os
import csv
import pandas as pd

# 0. Preliminary step to get the dataset




In [5]:
# If the filtered documents file (documents_list.csv) was already generated by a previous run of this notebook:
# 1. upload the file to the notebook environment and load it with the commented line at the bottom of this cell
# 2. run the cell that mounts Google Drive if you want to save the output there
# 3. set the output folder by running the output folder cell
# 4. once Drive is mounted and the output folder is set, go directly to step 1.3 without running the cells in between
# corpus_df = pd.read_csv('documents_list.csv')

Getting the data from a shareable Google Drive folder

In [None]:
# link to folder: https://drive.google.com/drive/folders/18TjiltRr8CFlx0aPcLsnKBr5iyeiQxWc?usp=sharing
# upload data folder to your drive root folder 'My Drive' (It is the default folder).

In [6]:
# connect your drive to Google Colab
from google.colab import drive
drive.mount('/content/drive')
# after running this cell, open the url that displays below from your gmail account
# copy the code that is displayed 
# paste the code into the cell below when prompted and then press enter

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [7]:
folder_path = '/content/drive/My Drive/data_tm_workflow/'

In [8]:
# set the output folder in the drive (the csv files with the results of the process will be downloaded there)
# first create the empty folder in the drive
output_folder = '/content/drive/My Drive/tests2_mallet'

# 1. Data preparation

## 1.1. Creating data frame
A dataframe is first created to hold the name of each file; it will later also keep the documents in their initial state.

In [None]:
# os.listdir returns the entries in arbitrary order; sorting keeps the row order
# of the dataframe built from this list stable from one run to the next
files_list = sorted(os.listdir(folder_path))

In [None]:
#insert file names into a df
sources = pd.DataFrame(files_list, columns=['file_name'])

## 1.2. Adding columns for dates, publications and filtering dataset

### 1.2.1. dates, publications


In [None]:
import re

In [9]:
# get publication ref from file name
def get_ref(file):
  """Return the publication reference that precedes the `_YYYY-MM-DD_` date in a file name.

  Raises IndexError when the file name does not contain that pattern.
  """
  ref_pattern = re.compile(r'(\w+\d+)_\d{4}-\d{2}-\d{2}_')
  found_refs = ref_pattern.findall(file)
  return found_refs[0]

# get date from file name
def get_date(file):
  """Return the first `YYYY-MM-DD` date found between underscores in a file name.

  Raises IndexError when the file name does not contain such a date.
  """
  return re.findall(r'_(\d{4}-\d{2}-\d{2})_',file)[0]

# get year from file name
def get_year(file):
  """Return the four-digit year (as a string) of the date embedded in a file name.

  Raises IndexError when the file name does not contain a `_YYYY-MM-DD_` date.
  """
  years_found = re.compile(r'_(\d{4})-\d{2}-\d{2}_').findall(file)
  first_year = years_found[0]
  return first_year

# get month from file name
def get_month(file):
  """Return the two-digit month (as a string) of the date embedded in a file name.

  Raises IndexError when the file name does not contain a `_YYYY-MM-DD_` date.
  """
  months_found = re.findall(r'_\d{4}-(\d{2})-\d{2}_',file)
  return months_found[0]

# get day from file name
def get_day(file):
  """Return the two-digit day (as a string) of the date embedded in a file name.

  Raises IndexError when the file name does not contain a `_YYYY-MM-DD_` date.
  """
  days_found = re.findall(r'_\d{4}-\d{2}-(\d{2})_',file)
  return days_found[0]

In [None]:
# derive the date parts and the publication reference from each file name
# (the extractor functions take a single argument, so they are passed to apply directly)
sources['date'] = sources['file_name'].apply(get_date)
sources['year'] = sources['file_name'].apply(get_year)
sources['month'] = sources['file_name'].apply(get_month)
sources['day'] = sources['file_name'].apply(get_day)
sources['publication'] = sources['file_name'].apply(get_ref)

In [None]:
# add publication names
def get_pub_name(pub_number):
    """Map a publication reference to its title.

    Returns None for any reference that is not one of the two known publications.
    """
    titles_by_ref = {
        'sn85066408': 'L\'Italia',
        '2012271201': 'Cronaca Sovversiva',
    }
    return titles_by_ref.get(pub_number)

In [None]:
# translate each publication reference into its title
sources['publication_name'] = sources['publication'].apply(get_pub_name)

### 1.2.2. Filtering

In [None]:
import datetime
from datetime import timedelta, date

In [None]:
# Start from 06.06.1903 and finish 01.05.1919
date_ref_1 = date(1903,6,6)
date_ref_2 = date(1919,5,1)

In [None]:
def filter_by_date(date_item,date_ref_1,date_ref_2):
    """Tell whether a `YYYY-MM-DD` date string lies inside a closed date range.

    Args:
        date_item: string containing a date written as YYYY-MM-DD.
        date_ref_1: first date of the range (inclusive).
        date_ref_2: last date of the range (inclusive).

    Returns:
        'included' when date_ref_1 <= date <= date_ref_2, otherwise 'not included'.

    Raises:
        IndexError: if date_item contains no YYYY-MM-DD date.
        ValueError: if the digits do not form a valid calendar date.
    """
    # one pattern per date part, in (year, month, day) order
    part_patterns = (
        r'(\d{4})-\d{2}-\d{2}',
        r'\d{4}-(\d{2})-\d{2}',
        r'\d{4}-\d{2}-(\d{2})',
    )
    year, month, day = [re.findall(pattern, date_item)[0] for pattern in part_patterns]
    file_date = date(int(year), int(month), int(day))
    inside_range = date_ref_1 <= file_date <= date_ref_2
    return 'included' if inside_range else 'not included'

In [None]:
# label every document 'included' or 'not included' by comparing its date with date_ref_1 and date_ref_2
sources['status'] = sources['date'].apply(lambda x: filter_by_date(x,date_ref_1,date_ref_2))

In [None]:
# variable containing the filtered documents (rows whose status is 'included')
# reset_index() renumbers the rows from 0 and keeps the previous row labels in a new 'index' column
corpus_df = sources[sources['status'] == 'included'].copy().reset_index()

### 1.2.3 Adding text content

In [None]:
# read the content of the text files
def readTxtContent(fileName, folder=None):
  """Return the text of one corpus file as a single space-padded line.

  Args:
    fileName: name of the text file inside the corpus folder.
    folder: directory that holds the file; when omitted, the notebook-level
      `folder_path` is used.

  Returns:
    The file content with every newline replaced by a space and one extra
    space added at each end.

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the file is not valid UTF-8.
  """
  if folder is None:
    folder = folder_path
  # os.path.join gives the same path whether or not the folder ends with a separator.
  # The encoding is stated explicitly because the corpus contains accented
  # characters and the platform default is not guaranteed to be UTF-8.
  with open(os.path.join(folder, fileName), 'r', encoding='utf-8') as file:
    return ' ' + file.read().replace('\n', ' ') + ' '

In [None]:
# add a column to the dataframe containing file content
# read every listed file and store its text next to the file name
corpus_df['file_content'] = corpus_df['file_name'].apply(readTxtContent)

In [None]:
corpus_df.to_csv(output_folder + '/documents_list.csv')

## 1.3 Removing stop words, punctuation, short words

In [10]:
! pip install nltk



In [11]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# add tokenized documents in dataframe
# split each document into a list of word tokens
corpus_df['tokens'] = corpus_df['file_content'].apply(nltk.word_tokenize)

In [13]:
# possible user options (str methods that can replace w.isalpha() in the condition below):
# .isalnum() keeps tokens made only of letters and/or digits (drops tokens containing punctuation)
# .isalpha() keeps tokens made only of letters (drops tokens containing punctuation or digits)
# .isdecimal() is true for tokens made only of decimal characters; use "not w.isdecimal()" to remove them
# .isdigit() is true for tokens made only of digits; use "not w.isdigit()" to remove them

# add new column in df with processed tokens (here: keeping only alphabetic tokens of at least 3 characters + lowercasing)
corpus_df['doc_prep'] = corpus_df['tokens'].apply(lambda x: [w.lower() for w in x if (w.isalpha() and len(w) > 2 )])

In [None]:
# link to custom stop words: https://drive.google.com/file/d/1VVfW6AKPbb7_fICOG73lEgkXmmZ6BkpC/view?usp=sharing
# Upload stop words list into Colab files before proceeding with the next cells

In [14]:
from nltk.corpus import stopwords
ital_stopwords = stopwords.words('italian')
en_stopwords = stopwords.words('english')

In [15]:
stop_words = pd.read_csv('stop_words.csv')

In [16]:
# list of custom stop words read from the 'stopword' column of stop_words.csv
# NOTE: this rebinds the name `stopwords`, which until here referred to the object imported from nltk.corpus
stopwords = stop_words['stopword'].values.tolist()

In [17]:
# add english stop words list to custom stopwords 
stopwords.extend(en_stopwords)

In [None]:
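# to append a list of words added by the user: stopwords.extend(user_input)
# to remove a word: stopwords.remove(user_input)
# (`stopwords` is the list applied in the filtering cell below; `ital_stopwords` is loaded but not applied)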

In [18]:
# add column with tokenized documents without stop words
# membership is tested once per token, so the stop word list is turned into a set
# (constant-time lookup) instead of being scanned from the start for every token
stopword_set = set(stopwords)
corpus_df['doc_prep_nostop'] = corpus_df['doc_prep'].apply(lambda x: [w for w in x if w not in stopword_set])

In [19]:
corpus_df['doc_prep_nostop']

0      [nura, dtfip, verrmo, assioma, auanto, barba, ...
1      [saturday, january, barre, vermont, sabato, ge...
2      [annj, xii, iodio, mite, umberto, santo, costi...
3      [vili, entered, brusco, svolto, storia, partit...
4      [scoi, ietti, lez, actf, àttof, coij, peri, it...
                             ...                        
766    [ionn, imperversa, giungono, spiaggie, patria,...
767    [ass, ittc, ass, tìii, eie, alt, sss, lynn, au...
768    [num, xii, magnifico, pertinacia, levato, pean...
769    [ann, rhe, fcr, rivo, ragioni, economiche, sfa...
770    [newspapet, objection, anvo, nura, mmmmmmm, an...
Name: doc_prep_nostop, Length: 771, dtype: object

In [20]:
# set the variable to use for topic modelling (if no further options are used)
corpus_model = corpus_df['doc_prep_nostop']

In [21]:
# save the data after pre-processing in the output folder for verification of pre-processing steps
# corpus_df.to_csv(output_folder + '/corpus_df.csv')
corpus_df.to_csv(output_folder + '/corpus_df.csv')

## 1.4 Lemmatize

In [None]:
# Lemmatization is available in multiple languages in Spacy and not in NLTK (only English)
# With Spacy, lemmatization is available for 10 languages. There's also a multi-language option that
# should be tested if additional languages are needed

In [None]:
!pip install spacy



In [None]:
!python3 -m spacy download it_core_news_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('it_core_news_sm')


In [None]:
import it_core_news_sm
it_nlp = it_core_news_sm.load(disable=['tagger', 'parser', 'ner'])

In [None]:
# lemmatization function
def lemmatize(doc):
  """Return the lemma of every word in `doc`, in the same order.

  Each word is run through the `it_nlp` pipeline on its own and the lemma of
  the first token it yields is kept.
  """
  return [[token.lemma_ for token in it_nlp(word)][0] for word in doc]

In [None]:
# add column with lemmatized tokens - directly from the tokens as preprocessing has already been done
# lemmatize the already pre-processed token lists, one document at a time
corpus_df['doc_lemmatized'] = corpus_df['doc_prep_nostop'].apply(lemmatize)

In [None]:
# variable with lemmatized tokens
lemmatized_corpus = corpus_df['doc_lemmatized']
# the lemmatized version is not used in this example

# 2. Topics with LDA Mallet

In [22]:
#Gensim installation
! pip install gensim



In [23]:
%%capture
# installation setup that works for Mallet: https://github.com/polsci/colab-gensim-mallet/blob/master/topic-modeling-with-colab-gensim-mallet.ipynb
def install_java():
  """Install the openjdk-8-jdk-headless package with apt-get and point JAVA_HOME at it.

  The lines starting with `!` are IPython shell escapes, so this function only
  works when the cell is run inside a notebook.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

In [24]:
%%capture
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
!unzip mallet-2.0.8.zip

In [25]:
os.environ['MALLET_HOME'] = '/content/mallet-2.0.8'
mallet_path = '/content/mallet-2.0.8/bin/mallet'

In [26]:
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim import corpora, models
from gensim.models.wrappers import LdaMallet

## 2.1 Preliminary steps to run LDA



### 2.1.1 Creating the dictionary, optional filtering of extreme values

In [27]:
# if you use lemmatized version replace "corpus_model" by lemmatized_corpus
id2word = corpora.Dictionary(corpus_model)

In [28]:
# drop the words that appear in fewer than 5 documents
# NOTE(review): filter_extremes also applies its own defaults for the other limits (no_above, keep_n) — confirm they suit this corpus
id2word.filter_extremes(no_below=5)

### 2.1.2 Bag of words

In [29]:
# convert every tokenized document into its bag-of-words representation
corpus = list(map(id2word.doc2bow, corpus_model))

## 2.2 LDA

In [77]:
# set the nr of topics
num_topics = 2

In [None]:
# compare results between Mallet and Mallet translated back to Gensim

In [78]:
# train the Mallet LDA model; the seed is fixed so that repeated runs can be compared
# (without it the topics change from one run to the next)
model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word, random_seed=42)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [79]:
# translate back mallet model into gensim format
model2 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)

In [83]:
import re

In [None]:
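# in the cells below, the first index after print_topics(...) selects the topic whose words are extracted,
# and the second index ([1]) selects that topic's formatted word string
# NOTE: the cells below use index 1 for the topic, while the variable and column names say "topic_0"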

In [93]:
match_w0 = re.findall(r'"(\w+)"',model.print_topics(num_words=150)[1][1])

In [94]:
match_w1 = re.findall(r'"(\w+)"',model2.print_topics(num_words=150)[1][1])

In [95]:
df_topic0 = pd.DataFrame(match_w0, columns=['words_topic_0_mallet'])

In [96]:
df_topic0 = df_topic0.sort_values(['words_topic_0_mallet']).reset_index().copy()

In [97]:
df_topic1 = pd.DataFrame(match_w1, columns=['words_topic_0_mallet_to_gensim'])

In [98]:
df_topic1 = df_topic1.sort_values(['words_topic_0_mallet_to_gensim']).reset_index().copy()

In [99]:
df_topics = pd.concat([df_topic0,df_topic1],axis=1)

In [100]:
df_topics.to_csv(output_folder +'/topic_words_1.csv')