In [0]:
import json
import urllib
import urllib.request

import requests

# 0. Preliminary step to get sample data



This preliminary step reproduces Lorella's Python workflow file:
https://i-lab.public.data.uu.nl/vault-ocex/ChroniclItaly%20-%20Italian%20American%20newspapers%20corpus%20from%201898%20to%201920%5B1529330521%5D/original/
The only change is an added folder, "data1", so that all downloaded files are kept in one place.

In [0]:
mkdir 'data1'

In [0]:
# Root of the Chronicling America site; page ids returned by the search are appended to it
chronam = 'https://chroniclingamerica.loc.gov/'

# Advanced-search query (language=ita, 1880-1920, 100 rows per page, page 11, sorted by date)
results = chronam + 'search/pages/results/?date1=1880&date2=1920&searchType=advanced&language=ita&sequence=1&lccn=2012271201&lccn=sn85066408&lccn=sn85055164&lccn=sn85054967&lccn=sn88064299&lccn=sn84037024&lccn=sn84037025&lccn=sn86092310&proxdistance=5&state=California&state=District+of+Columbia&state=Massachusetts&state=Pennsylvania&state=Piedmont&state=Vermont&state=West+Virginia&rows=100&ortext=&proxtext=&phrasetext=&andtext=&dateFilterType=yearRange&page=11&sort=date'

# The same query, asking for the results in JSON format
results_json = f'{results}&format=json'

# Running total of downloaded files
count = 0



In [0]:
def get_json(url, timeout=60):
    """Download ``url`` and return its body parsed as JSON.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook.

    Returns
    -------
    The decoded JSON document.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page as JSON
    response.raise_for_status()
    return json.loads(response.content)

# Search results for the configured query
data = get_json(results_json)

In [0]:
files_list = []
# Base URL without its trailing slash, so joining it with a page id does not produce '//'
base_url = chronam.rstrip('/')
# Length of the '_lccn_' prefix dropped from every file name
# (assumes page ids look like '/lccn/<lccn>/<date>/<edition>/<sequence>/' - TODO confirm against the API)
name_prefix_len = len('_lccn_')

# Cycle through JSON results
for page in data['items']:
    # Path of the OCR text for this page, relative to the site root
    hit = str(page['id'])
    seed = hit + 'ocr.txt'
    download_url = base_url + '/' + seed.lstrip('/')

    # File name: the page path with '/' turned into '_' and the leading '_lccn_' removed
    file_name = seed.replace('/', '_')[name_prefix_len:]
    files_list.append(file_name)

    # Download .txt of the page into the data folder
    urllib.request.urlretrieve(download_url, 'data1/' + file_name)
    count += 1

# 1. Data preparation

## 1.1. Creating data frame
A DataFrame is created first to hold each document in its original state, together with the name of its file.

In [0]:
import os
import pandas as pd

In [0]:
# Start the working DataFrame with one row per downloaded file, holding its name in 'file_name'
sources = pd.DataFrame(files_list, columns=['file_name'])

In [0]:
def readTxtContent(fileName, data_dir='data1'):
    """Return the text of a downloaded file as one line padded with a space on each side.

    Parameters
    ----------
    fileName : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Folder that holds the downloaded files (defaults to 'data1').

    Returns
    -------
    str
        The file content with every newline replaced by a space, preceded
        and followed by a single space.
    """
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the OCR text files are UTF-8 - TODO confirm)
    with open(os.path.join(data_dir, fileName), 'r', encoding='utf-8') as file:
        return ' ' + file.read().replace('\n', ' ') + ' '

In [0]:
# Read every file listed in 'file_name' and store its text in a new 'file_content' column
sources['file_content'] = sources['file_name'].apply(readTxtContent)

In [0]:
# Raw document texts, one entry per file.
# Note: the name `corpus` is rebound to the bag-of-words representation in the LDA preparation step.
corpus = sources['file_content']

## 1.2 Removing stop words, punctuation, short words

In [0]:
%%capture
# Download the NLTK 'punkt' and 'stopwords' resources and import the tokenizer and stop-word helpers.
# %%capture hides the download log.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [0]:
# Add the tokenized documents to the DataFrame.
# The language is passed explicitly: the tokenizer otherwise falls back to its English models,
# while the corpus is Italian.
sources['tokens'] = sources['file_content'].apply(lambda x: nltk.word_tokenize(x, language='italian'))

In [0]:
def _clean_tokens(tokens):
    """Lower-case the alphanumeric tokens that are longer than three characters; drop the rest."""
    return [tok.lower() for tok in tokens if tok.isalnum() and len(tok) > 3]

# New column with the cleaned tokens (no punctuation, no short words)
sources['tokens_prep'] = sources['tokens'].apply(_clean_tokens)

In [0]:
# these lines are useful if we want to provide alternate stop words lists (NLTK)
# show list of default NLTK Italian stopwords
# stopwords.words('italian')
# ital_stopwords = stopwords.words('italian')
# to append list of words added by user: ital_stopwords.extend(user_input)
# to remove words: ital_stopwords.remove(user_input)

In [0]:
# spacy list of Stop words (seems to be more complete than NLTK)
import spacy
from spacy.lang.it.stop_words import STOP_WORDS

In [0]:
# Italian stop words shipped with spaCy, used below to filter the tokens
spacy_it_sw = STOP_WORDS

In [0]:
# New column: the cleaned tokens with every spaCy Italian stop word removed
sources['tokens_prep_nostop'] = sources['tokens_prep'].apply(
    lambda tokens: [tok for tok in tokens if tok not in spacy_it_sw]
)

## 1.3 Stem

In [0]:
from nltk.stem.snowball import SnowballStemmer

In [0]:
# Snowball stemmer configured for Italian; change the language name to stem another language
stemmer = SnowballStemmer("italian")

In [0]:
# New column: the stem of every stop-word-filtered token
sources['tokens_stemmed'] = sources['tokens_prep_nostop'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

## 1.4 Lemmatize

In [0]:
# Lemmatization is available in multiple languages in Spacy and not in NLTK (only English)
# With Spacy, lemmatization is available for 10 languages. There's also a multi-language option that
# should be tested if additional languages are needed

In [0]:
%%capture
# Download spaCy's it_core_news_sm model, which is imported in the next cell; %%capture hides the log
!python -m spacy download it_core_news_sm

In [0]:
import it_core_news_sm
# Load the Italian pipeline with the tagger, parser and NER components switched off;
# only the token lemmas are read from it below.
# NOTE(review): whether lemmas are still populated with the tagger disabled depends on the spaCy version - confirm.
it_nlp = it_core_news_sm.load(disable=['tagger', 'parser', 'ner'])

In [0]:
def lemmatize(doc, nlp=None):
    """Return the lemma of every word in ``doc``.

    Parameters
    ----------
    doc : iterable of str
        The words of one document.
    nlp : optional
        Pipeline used to compute the lemmas; it must offer ``pipe``.
        Defaults to the notebook-level ``it_nlp`` pipeline.

    Returns
    -------
    list of str
        One entry per input word: the lemma of the first token the pipeline
        produces for that word, or the word itself when the pipeline
        produces no token (for example for an empty string).
    """
    if nlp is None:
        nlp = it_nlp
    words = list(doc)
    lemmatized_doc = []
    # pipe() processes the words as a stream instead of one pipeline call per word
    for word, parsed in zip(words, nlp.pipe(words)):
        if len(parsed) > 0:
            lemmatized_doc.append(parsed[0].lemma_)
        else:
            # Nothing to lemmatize: keep the word so the output stays aligned with the input
            lemmatized_doc.append(word)
    return lemmatized_doc

In [0]:
# New column: the lemma of every stop-word-filtered token
sources['tokens_lemmatized'] = sources['tokens_prep_nostop'].apply(lemmatize)

In [0]:
# Lemmatized token lists, one per document; this is the input of the LDA preparation below
lemmatized_corpus = sources['tokens_lemmatized']

# 2. Topics with LDA

In [0]:
#Gensim installation
import gensim
from gensim.test.utils import common_corpus, common_dictionary
from gensim import corpora, models
from gensim.models.wrappers import LdaMallet

## 2.1 Preliminary steps to run LDA



### 2.1.1 Bag of words

In [0]:
# Dictionary mapping every token of the lemmatized documents to an integer id.
# To work with another version of the text, replace `lemmatized_corpus` in both lines
# with the stemmed or the stop-word-filtered token column.
id2word = corpora.Dictionary(lemmatized_corpus)
# Bag-of-words form of each document (this rebinds `corpus`)
corpus = list(map(id2word.doc2bow, lemmatized_corpus))

## 2.2 LDA

In [0]:
# Number of topics the LDA model is asked to find; the cells below read this value
numtopics = 6

### 2.2.1 LDA with Gensim

# 3. Topic distribution

## 3.1. Gensim

### 3.1.1. Distribution per topic and per document

In [0]:
%%capture
numtopics = 6
ldamodel = models.LdaModel(corpus, num_topics=numtopics, id2word = id2word, minimum_probability=0)

In [0]:
distribution_gensim = pd.DataFrame(index=[i for i in range(len(corpus))],columns=[f'topic{i}' for i in range(numtopics)])

In [0]:
for i in range(len(corpus)):
  row = [list(j)[1] for j in ldamodel[corpus[i]]] 
  distribution_gensim.loc[i] = row

In [63]:
# Display the table transposed: topics as rows, documents as columns
distribution_gensim.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
topic0,0.200715,0.000104874,0.0214167,0.000149461,0.633773,0.0514079,0.000101166,0.202157,0.00015825,0.000141362,0.00011418,0.931529,0.563761,0.999229,0.107813,0.578198,0.000179985,0.00016666,0.759857,0.000680846,0.566005,0.00277465,0.442871,0.000104726,0.000109333,0.663079,0.000111504,0.000107933,0.013697,0.0214633,0.000104422,0.710733,0.0820566,0.00385791,0.00010588,0.000255444,0.00550124,0.501782,9.46817e-05,0.0145612,...,0.00493372,0.00030783,0.383505,0.000214499,9.67473e-05,0.498758,0.00015111,0.000179445,0.000111696,0.000105766,0.722212,0.00894928,0.0906433,0.0348118,0.165303,0.0759857,0.000106905,0.000221516,0.0793873,0.849668,0.732288,0.464617,0.000176442,0.32623,0.000236339,0.998633,0.0918905,0.142173,0.000142429,0.0030699,0.0406691,0.574278,0.0611652,0.16947,0.999293,0.0622337,0.192088,0.0134869,0.0661127,0.995881
topic1,0.622205,0.000104765,0.000142621,0.000149474,0.000234896,0.000105171,0.000274389,0.442115,0.853836,0.000141442,0.786843,0.000120074,0.246771,0.000154197,0.000134926,0.000154409,0.000255735,0.000166652,0.00143098,0.000115893,0.0426166,0.000124307,0.000144503,0.000104805,0.000546028,0.00449452,0.000111435,0.200465,0.00314348,0.00013867,0.000104326,0.0943983,0.098822,0.0023702,0.0111888,0.000255269,0.00198414,0.000113951,0.00457145,0.0505699,...,0.0484838,0.0717108,0.000106482,0.000214507,9.64855e-05,0.000173563,0.000151005,0.00017963,0.197682,0.00357012,0.0555576,0.304128,0.000112535,0.550106,0.37387,0.000109204,0.0648306,0.000221845,0.000110042,0.000157143,0.000182468,0.000188984,0.000176488,0.000101425,0.998817,0.000138012,0.000148104,0.000141018,0.000142331,0.000118447,0.000110231,0.000223587,0.000109965,0.00010552,0.000141379,0.000447436,0.00703093,0.00012008,0.000239704,0.000191248
topic2,0.000118055,0.000104804,0.977968,0.00014952,0.000234696,0.012113,0.000101096,0.000111592,0.000158174,0.000141375,0.108928,0.000119962,0.000188073,0.000154422,0.0228234,0.00601999,0.00118674,0.000166823,0.00424939,0.0046214,0.00311866,0.000124304,0.0067673,0.000104742,0.9245,0.000144214,0.000111509,0.109919,0.0118918,0.000139002,0.274614,0.12445,0.793785,0.462487,0.000105893,0.833334,0.982828,0.445544,0.0124794,0.37516,...,9.86427e-05,0.000656476,0.0119898,0.962922,0.855569,0.0387493,0.114217,0.000179169,0.000111571,0.0243521,0.0555574,0.406002,0.0054043,0.000107294,0.0659867,0.000109205,0.021948,0.000221874,0.000110039,0.000157329,0.00018252,0.000187425,0.000176418,0.672263,0.00023714,0.00013803,0.000148121,0.000141242,0.000142415,0.000118479,0.000111716,0.000223551,0.817231,0.279186,0.000141458,0.510136,0.800483,0.975505,0.2538,0.00175483
topic3,0.00866785,0.000104762,0.000167956,0.000149516,0.365288,0.000105197,0.956747,0.350184,0.145532,0.998152,0.00032878,0.000120091,0.188904,0.000154223,0.868958,0.397009,0.428009,0.999166,0.00106345,0.99435,0.344766,0.993205,0.549932,0.000104817,0.0217751,0.331994,0.999442,0.688836,0.894328,0.977982,0.000104408,0.0114559,0.000281002,0.221266,0.987387,0.000254896,0.00011257,0.0523321,0.0310694,0.324833,...,0.946286,0.464636,0.0348628,0.00021432,9.65242e-05,0.0641193,0.421544,0.999103,0.801871,0.874486,0.0555573,0.00211771,0.884237,0.414759,0.317364,0.00482867,0.0033523,0.00255992,0.902134,0.149703,0.265344,0.0388645,0.999118,0.000244316,0.000236552,0.000815163,0.907517,0.0956058,0.186303,0.000118443,0.958889,0.000223785,0.0194623,0.000872557,0.00014165,0.0231382,0.000132452,0.008054,0.000239463,0.000128321
topic4,0.168176,0.999476,0.000162622,0.999253,0.000234844,0.000105206,0.00848555,0.00532106,0.000158244,0.00128221,0.0545938,0.067991,0.000187832,0.000154111,0.000135034,0.0184646,0.567609,0.000166756,0.233293,0.000115938,0.0419061,0.00364765,0.000142293,0.999476,0.0529606,0.000144181,0.000111632,0.000107624,0.0768336,0.000138654,0.724968,0.0171129,0.0249427,0.0103529,0.00110651,0.165645,0.00946164,0.000113958,0.95169,0.234766,...,9.87111e-05,0.000110202,0.000224852,0.000214218,0.0194741,0.00323201,0.344419,0.00017922,0.000111621,0.00251297,0.0555575,0.0855567,0.0194902,0.000109182,0.0367098,0.918858,0.897215,0.000221441,0.00408843,0.000157148,0.00182016,0.000187378,0.000176677,0.00105922,0.000236407,0.000138015,0.000148171,0.761798,0.000142387,0.000118332,0.000110253,0.424828,0.10088,0.280024,0.000141533,0.403598,0.000132446,0.00271425,0.000239202,0.000128372
topic5,0.000117928,0.000104817,0.000142532,0.000149447,0.00023474,0.936164,0.0342905,0.0001115,0.0001582,0.000141343,0.049193,0.000119932,0.000188066,0.000154054,0.000134971,0.000154226,0.00275956,0.00016663,0.000106196,0.00011588,0.00158694,0.000124262,0.000142235,0.000104823,0.000109259,0.000144178,0.000111606,0.000563637,0.000106037,0.000138554,0.00010439,0.0418502,0.000112504,0.299666,0.000105898,0.000255027,0.000112505,0.000113965,9.46597e-05,0.000109344,...,9.87442e-05,0.462579,0.569312,0.03622,0.124667,0.394967,0.119518,0.000179212,0.000111618,0.0949728,0.0555577,0.193247,0.000112641,0.000107309,0.0407661,0.000109123,0.0125472,0.996553,0.0141697,0.000157314,0.000182733,0.495955,0.000176402,0.000101462,0.000236548,0.000138033,0.000147954,0.000141177,0.813127,0.996456,0.000110129,0.000223571,0.00115157,0.270342,0.00014132,0.000447003,0.000132332,0.000119908,0.679369,0.00191647


## Distribution with pyLDAvis

In [0]:
def _df_with_names(data, index_name, columns_name):
    if type(data) == pd.DataFrame:
        # we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df

In [0]:
# pyLDAvis-style computation: run the model's inference over the corpus and keep the first
# returned value (gamma); the second returned value is discarded.
gamma, _ = ldamodel.inference(corpus)

In [0]:
doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]

In [0]:
doc_topic_dists = _df_with_names(doc_topic_dists, 'doc', 'topic')

In [70]:
# Display the table transposed: topics as rows, documents as columns
doc_topic_dists.T

doc,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
0,0.184217,0.000105,0.024778,0.000149,0.599923,0.063903,0.000101,0.223917,0.000158,0.000141,0.000114,0.928136,0.569921,0.999229,0.094486,0.599196,0.00018,0.000167,0.77521,0.000795,0.619593,0.002406,0.377671,0.000105,0.000109,0.633698,0.000112,0.000108,0.014978,0.028162,0.000104,0.737165,0.073308,0.007469,0.000106,0.000255,0.004179,0.511988,9.5e-05,0.015509,...,0.007591,0.000293,0.4066,0.000215,9.7e-05,0.491348,0.000151,0.000179,0.000112,0.000104,0.722213,0.007307,0.126047,0.038073,0.179473,0.073202,0.000107,0.000222,0.108024,0.83474,0.732358,0.481698,0.000176,0.302254,0.000236,0.99872,0.086501,0.200538,0.000142,0.003686,0.049021,0.548942,0.082179,0.137122,0.999279,0.045576,0.147125,0.008892,0.080554,0.996742
1,0.622516,0.000105,0.000143,0.000149,0.000235,0.000105,0.000302,0.419463,0.868637,0.000141,0.778765,0.00012,0.269604,0.000154,0.000135,0.000154,0.000284,0.000167,0.00131,0.000116,0.040717,0.000124,0.000971,0.000105,0.000285,0.004716,0.000111,0.23251,0.004279,0.000139,0.000104,0.096283,0.090113,0.002007,0.012797,0.000255,0.002906,0.000114,0.003515,0.051934,...,0.050117,0.070648,0.000106,0.000215,9.6e-05,0.000174,0.000151,0.00018,0.172137,0.003543,0.055558,0.306116,0.000113,0.526159,0.353849,0.000109,0.04469,0.000222,0.00011,0.000157,0.000182,0.000188,0.000176,0.000101,0.998817,0.000138,0.000148,0.000141,0.000142,0.000118,0.00011,0.000224,0.00011,0.000106,0.000141,0.000447,0.007328,0.00012,0.00024,0.00013
2,0.000118,0.000105,0.974651,0.00015,0.000235,0.011175,0.000101,0.000112,0.000158,0.000141,0.093149,0.00012,0.000188,0.000154,0.020988,0.010743,0.002552,0.000167,0.00381,0.004093,0.002081,0.000124,0.007571,0.000105,0.925925,0.000144,0.000112,0.126472,0.009394,0.000153,0.249879,0.114988,0.816382,0.470766,0.000106,0.823706,0.986363,0.432108,0.010046,0.415771,...,9.9e-05,0.000717,0.009394,0.967063,0.854796,0.042445,0.128518,0.000179,0.000112,0.018185,0.055557,0.454784,0.007993,0.000107,0.096262,0.000109,0.021554,0.000292,0.00011,0.000157,0.000183,0.000187,0.000176,0.695628,0.000237,0.000138,0.000148,0.000141,0.000142,0.000118,0.000112,0.000224,0.79988,0.302761,0.000141,0.502662,0.84515,0.981044,0.247406,0.001595
3,0.01206,0.000105,0.000143,0.00015,0.399138,0.000105,0.957549,0.353776,0.13073,0.998605,0.000895,0.00012,0.15991,0.000154,0.884121,0.374734,0.49098,0.999166,0.00074,0.994765,0.298437,0.988642,0.613502,0.000105,0.020843,0.361153,0.999442,0.639608,0.900166,0.971269,0.000104,0.013959,0.000126,0.2288,0.985314,0.000255,0.000113,0.055562,0.038366,0.301689,...,0.941996,0.480803,0.023848,0.000214,9.7e-05,0.06481,0.448806,0.999103,0.827416,0.896835,0.055557,0.001053,0.836969,0.435446,0.292168,0.003145,0.002484,0.003289,0.873151,0.164631,0.264532,0.04234,0.999117,0.000295,0.000237,0.000729,0.912906,0.104911,0.200923,0.000118,0.950537,0.000224,0.018174,0.000679,0.000155,0.046521,0.000132,0.007571,0.000239,0.000128
4,0.18097,0.999476,0.000143,0.999252,0.000235,0.000105,0.009126,0.00262,0.000158,0.00083,0.064577,0.071384,0.000188,0.000154,0.000135,0.015018,0.501807,0.000167,0.218823,0.000116,0.038181,0.008579,0.000142,0.999476,0.052728,0.000144,0.000112,0.000108,0.071077,0.000139,0.749703,0.009414,0.019959,0.007263,0.001571,0.175274,0.006327,0.000114,0.947884,0.214909,...,9.9e-05,0.00011,0.000271,0.000214,0.016022,0.005311,0.296239,0.000179,0.000112,0.001641,0.055557,0.0582,0.028705,0.000108,0.039217,0.923326,0.913474,0.000221,0.003801,0.000157,0.002562,0.000187,0.000177,0.00162,0.000236,0.000138,0.000148,0.694127,0.000142,0.000118,0.00011,0.450163,0.097655,0.336858,0.000142,0.404346,0.000132,0.002252,0.000239,0.000128
5,0.000118,0.000105,0.000143,0.000149,0.000235,0.924606,0.032821,0.000111,0.000158,0.000141,0.062501,0.00012,0.000188,0.000154,0.000135,0.000154,0.004197,0.000167,0.000106,0.000116,0.00099,0.000124,0.000142,0.000105,0.000109,0.000144,0.000112,0.001195,0.000106,0.000139,0.000104,0.028191,0.000113,0.283695,0.000106,0.000255,0.000113,0.000114,9.5e-05,0.000188,...,9.9e-05,0.447429,0.55978,0.032079,0.128892,0.395913,0.126134,0.000179,0.000112,0.079691,0.055558,0.172539,0.000174,0.000107,0.039032,0.000109,0.017691,0.995755,0.014804,0.000157,0.000183,0.475399,0.000176,0.000101,0.000237,0.000138,0.000148,0.000141,0.798508,0.99584,0.00011,0.000224,0.002002,0.222474,0.000141,0.000447,0.000132,0.00012,0.671321,0.001276


## Topic weights

### pyLDAvis

In [0]:
# Convert the bag-of-words corpus into a sparse matrix with one row per dictionary term
corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(id2word))

In [0]:
# Column sums of the sparse matrix, flattened to a 1-D array
# (presumably one value per document, i.e. its token count - TODO confirm the matrix orientation)
doc_lengths = corpus_csc.sum(axis=0).A.ravel()

In [0]:
# Scale every document's topic distribution by that document's length, then add up per topic
topic_freq = doc_topic_dists.mul(doc_lengths, axis=0).sum()
# Express the totals as shares of the whole, largest first
topic_proportion = topic_freq.div(topic_freq.sum()).sort_values(ascending=False)

In [77]:
# Display the share of each topic, sorted from largest to smallest
topic_proportion

topic
3    0.324526
0    0.188651
2    0.175376
4    0.142826
5    0.102052
1    0.066568
dtype: float32

### Gensim

In [79]:
topics = []
topic_weights = []
n_docs = len(distribution_gensim)
for i in range(len(distribution_gensim.columns)):
    # Weight of topic i as a percentage: its mean probability over all documents times 100.
    # Summing the column alone is a percentage only when there are exactly 100 documents.
    column_total = sum(distribution_gensim.iloc[:, i])
    topic_weight = round(100 * column_total / n_docs, 2) if n_docs else 0.0
    topic_weights.append(topic_weight)
    print( f'topic {i+1}: {topic_weight}%' )
    # Ask for this topic directly rather than rebuilding the full topic list on every iteration
    topic = ldamodel.print_topic(i)
    # NOTE(review): this stores the topic index, not its terms - confirm that is what is wanted
    topics.append(i)
    print( f'terms: {topic} ')

topic 1: 20.32%
terms: 0.009*"italiano" + 0.004*"italia" + 0.004*"telegrafico" + 0.004*"dispaccio" + 0.003*"giornale" + 0.003*"venire" + 0.003*"potere" + 0.003*"daily" + 0.002*"roma" + 0.002*"mentire" 
topic 2: 7.05%
terms: 0.005*"italiano" + 0.004*"roma" + 0.004*"dispaccio" + 0.003*"italia" + 0.003*"venire" + 0.003*"daily" + 0.002*"potere" + 0.002*"telegrafico" + 0.002*"giornale" + 0.002*"dire" 
topic 3: 17.13%
terms: 0.008*"italiano" + 0.003*"potere" + 0.003*"italia" + 0.003*"telegrafico" + 0.003*"roma" + 0.003*"dispaccio" + 0.003*"giornale" + 0.003*"daily" + 0.003*"venire" + 0.002*"cardinale" 
topic 4: 30.52%
terms: 0.008*"italiano" + 0.005*"italia" + 0.004*"dispaccio" + 0.004*"roma" + 0.003*"venire" + 0.003*"telegrafico" + 0.002*"giornale" + 0.002*"daily" + 0.002*"potere" + 0.002*"cardinale" 
topic 5: 13.77%
terms: 0.007*"italiano" + 0.004*"italia" + 0.004*"dispaccio" + 0.004*"roma" + 0.003*"telegrafico" + 0.003*"potere" + 0.003*"venire" + 0.002*"daily" + 0.002*"giornale" + 0.002*"

In [0]:
# pyLDAvis installation
%%capture
! pip install pyldavis
import pyLDAvis
import pyLDAvis.gensim

In [0]:
%%capture
numtopics = 6
selected_model = models.LdaModel(corpus, id2word=id2word, num_topics=6, minimum_probability=0)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(selected_model, corpus, id2word)

In [86]:
# Render the prepared pyLDAvis visualisation
vis