In [1]:
import gensim
from gensim import corpora
import pandas as pd
import json
import os
import ast
import re 
import pyLDAvis.gensim 
import pyLDAvis

### Opening Corpus 


In [2]:
# The token-list columns are stored as stringified Python lists in the CSV;
# ast.literal_eval turns each cell back into a real list while loading.
df = pd.read_csv(
    'final_corpus.csv',
    converters={
        'tokenized_lemmas': ast.literal_eval,
        'tokenized_norm': ast.literal_eval,
    },
)

In [3]:
# Keep only items with more than two lemma tokens, i.e. drop items with two
# or fewer tokens (for example a single word plus its punctuation token).
mask = [len(tokens) > 2 for tokens in df['tokenized_lemmas']]
df = df[mask]

# Remove the leftover index columns. errors='ignore' keeps this cell
# re-runnable: without it a second run raises KeyError because the columns
# are already gone.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')
df.head(1)

Unnamed: 0,period,author,title,text,tokenized_text,lemma,tokenized_lemmas
0,Late_Antiquity,Agennius Ubricus,De Controuersiis Agrorum,aduersantur ne quid in rerum natura finitum ...,"['aduersantur', 'ne', 'quid', 'in', 'rerum', '...",aduersor ne quis in res natura finio sum uideo .,"[aduersor, ne, quis, in, res, natura, finio, s..."


In [4]:
# Read the Latin lemma stopwords from their JSON file.
with open('stopwords_latin_lemmas.json', mode='r', encoding='utf-8') as f:
    stopwords = json.loads(f.read())

In [5]:
# Show the first document before and after filtering as a sanity check.
# .iloc is positional, so this keeps working even when the row labelled 0
# was removed by an earlier filter.
print(df['tokenized_lemmas'].iloc[0])

# Build the lookup set once so every membership test is a hash lookup.
stopword_set = set(stopwords)
for sents in df['tokenized_lemmas']:
    # Slice assignment rewrites the list object held by the DataFrame, so the
    # column is updated in place. This replaces the reverse index loop whose
    # list.remove(value) deleted the *first* matching value rather than the
    # element at the current index and rescanned the list on every removal.
    sents[:] = [lemma for lemma in sents if lemma not in stopword_set]
print(df['tokenized_lemmas'].iloc[0])

['aduersor', 'ne', 'quis', 'in', 'res', 'natura', 'finio', 'sum', 'uideo', '.']
['aduersor', 'natura', 'finio']


# Training and Visualizing Models 

In [None]:
def LDA_vis(era, path, num_topics=15, passes=50, random_state=None):
    """Train an LDA model on the documents of the given period(s) and build its pyLDAvis view.

    Reads the notebook-level ``df`` (columns ``period`` and ``tokenized_lemmas``).

    Parameters
    ----------
    era : list of str
        Values of the ``period`` column whose documents are used for training.
    path : str
        File name passed to ``lda_model.save`` to persist the trained model.
    num_topics : int, default 15
        Number of topics to fit.
    passes : int, default 50
        Number of training passes over the corpus.
    random_state : int or None, default None
        Forwarded to ``LdaMulticore``. Pass an integer to make topic ids
        reproducible between runs; ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(vis, lda_model)``: the object returned by ``pyLDAvis.gensim.prepare``
        and the trained gensim model.
    """
    # getting tokenized_lemmas in the specified era
    texts = df.loc[df['period'].isin(era), 'tokenized_lemmas']

    # getting ids for vocab
    id2word = corpora.Dictionary(texts)

    # getting term-document frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    # building LDA model
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           passes=passes,
                                           random_state=random_state)

    # Persist under the caller-supplied name. The original referenced an
    # undefined variable `name` here, which raised NameError after training.
    lda_model.save(path)

    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

    return vis, lda_model

# Late Republican Era

In [19]:
# Switch pyLDAvis to notebook rendering, then fit and display the
# Late Republican model. LR_model is the (vis, lda_model) pair.
pyLDAvis.enable_notebook()
LR_model = LDA_vis(era=['Late_Republican_Era'], path="Late_Republic_LDA")
pyLDAvis.display(LR_model[0])


  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


# The Principate

In [None]:
# Switch pyLDAvis to notebook rendering, then fit and display the
# Principate model. TP_model is the (vis, lda_model) pair.
pyLDAvis.enable_notebook()
TP_model = LDA_vis(era=['The_Principate'], path="The_Principate_LDA")
pyLDAvis.display(TP_model[0])

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


# Late Antiquity

In [None]:
# Switch pyLDAvis to notebook rendering, then fit and display the
# Late Antiquity model. LA_model is the (vis, lda_model) pair.
pyLDAvis.enable_notebook()
LA_model = LDA_vis(era=['Late_Antiquity'], path="Late_Antiquity_LDA")
pyLDAvis.display(LA_model[0])

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


# All Periods

In [None]:
# Fit one model over all three periods. The first label previously read
# 'Late_Repubican_Era' (typo), which matches no row in `period`, so the
# Late Republican documents were silently left out of this model.
all_periods_model = LDA_vis(['Late_Republican_Era', 'The_Principate', 'Late_Antiquity'], 'All_period_LDA')
pyLDAvis.enable_notebook()
pyLDAvis.display(all_periods_model[0])

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n


# Topic Vocab

In [11]:
def topic_vocab(model):
    """Return the top 10 words of each topic as a list of word lists.

    Parameters
    ----------
    model : tuple
        A ``(vis, lda_model)`` pair as returned by ``LDA_vis``; only
        ``model[1]`` is used and it must provide ``show_topics``.

    Returns
    -------
    list of list of str
        One inner list per topic, in the order ``show_topics`` returns them.
    """
    topics = model[1].show_topics(num_topics=15, num_words=10, formatted=False)

    # Each topic is (topic_id, [(word, weight), ...]). Take the words directly
    # instead of regex-scanning str() of the pairs: that approach also matched
    # letters inside the weight's repr (e.g. the 'e' of 1e-05, or the
    # 'np'/'float' of NumPy 2 scalar reprs) and split words containing
    # anything other than a-z.
    return [[pair[0] for pair in topic[1]] for topic in topics]

In [12]:
# Top-word lists per topic for each fitted model (evaluated in this order).
all_period, LR_topic_vocab, TP_topic_vocab, LA_topic_vocab = (
    topic_vocab(fitted)
    for fitted in (all_periods_model, LR_model, TP_model, LA_model)
)

In [13]:
num_topics = 15
# Top 30 (word, weight) pairs for every topic of each period model.
LR_topics, TP_topics, LA_topics = (
    fitted[1].show_topics(num_topics=num_topics, num_words=30, formatted=False)
    for fitted in (LR_model, TP_model, LA_model)
)

def formatting(topics):
    """Reduce ``(topic_id, [(word, weight), ...])`` entries to ``(topic_id, [word, ...])``.

    Parameters
    ----------
    topics : iterable
        Entries whose element 0 is the topic id and element 1 is a sequence of
        items whose element 0 is the word.

    Returns
    -------
    list of tuple
        ``(topic_id, word_list)`` pairs in input order.
    """
    return [
        (entry[0], [pair[0] for pair in entry[1]])
        for entry in topics
    ]

# (topic_id, word_list) pairs for each period, weights stripped.
LR_list, TP_list, LA_list = map(formatting, (LR_topics, TP_topics, LA_topics))



In [14]:
def j_dissim(word1, word2):
    """Jaccard dissimilarity between two word collections.

    Computed as ``1 - |A ∩ B| / |A ∪ B|`` over the sets of distinct items.

    Parameters
    ----------
    word1, word2 : iterable of hashable
        The two word lists to compare; duplicates are ignored.

    Returns
    -------
    float
        0.0 for identical sets, 1.0 for disjoint sets. Two empty inputs are
        treated as identical (0.0) rather than raising ZeroDivisionError.
    """
    set1, set2 = set(word1), set(word2)
    union = set1 | set2
    if not union:
        # Both inputs are empty: there is nothing that differs.
        return 0.0
    return 1 - (len(set1 & set2) / len(union))

In [15]:
def list_comp(topics_a, topics_b):
    """Compare every topic of one model with every topic of another.

    Parameters
    ----------
    topics_a, topics_b : iterable
        Entries whose element 0 is a topic id and element 1 is its word list
        (the shape produced by ``formatting``).

    Returns
    -------
    list of tuple
        ``(id_a, id_b, distance)`` for every pair, where ``distance`` is
        ``j_dissim`` of the two word lists, sorted by ascending distance.
    """
    relations = [
        (topic_a[0], topic_b[0], j_dissim(topic_a[1], topic_b[1]))
        for topic_a in topics_a
        for topic_b in topics_b
    ]
    # Sort once after collecting all pairs. The sort is stable, so ties keep
    # insertion order, the same result the per-append re-sort produced.
    relations.sort(key=lambda rel: rel[2])
    return relations

In [16]:
relations_1 = list_comp(LR_list, TP_list)
relations_2 = list_comp(TP_list, LA_list)

# Chain each (LR, TP) relation with each (TP, LA) relation that shares the
# same Principate topic id, summing the two distances.
final = [
    (rel1[0], rel1[1], rel2[1], rel1[2] + rel2[2])
    for rel1 in relations_1
    for rel2 in relations_2
    if rel1[1] == rel2[0]
]

# Smallest combined distance first.
combos = sorted(final, key=lambda x: x[3])

print('(t1_topic_id, t2_topic_id, t3_topic_id, total distance)')
combos[:30]


(t1_topic_id, t2_topic_id, t3_topic_id, total distance)


[(13, 3, 2, 1.6047430830039526),
 (13, 3, 13, 1.6047430830039526),
 (12, 6, 2, 1.6216640502354789),
 (13, 3, 0, 1.624223602484472),
 (13, 3, 6, 1.624223602484472),
 (13, 3, 12, 1.624223602484472),
 (13, 3, 14, 1.624223602484472),
 (13, 3, 5, 1.643020594965675),
 (13, 3, 7, 1.643020594965675),
 (13, 3, 8, 1.643020594965675),
 (13, 3, 11, 1.643020594965675),
 (8, 2, 4, 1.6470588235294117),
 (1, 10, 0, 1.6643990929705215),
 (4, 10, 0, 1.6643990929705215),
 (6, 10, 0, 1.6643990929705215),
 (8, 10, 0, 1.6643990929705215),
 (4, 0, 6, 1.6679245283018869),
 (0, 2, 4, 1.6696832579185519),
 (13, 3, 3, 1.6787030213706706),
 (2, 10, 0, 1.6846011131725418),
 (13, 0, 6, 1.6888888888888889),
 (1, 2, 4, 1.6914539400665927),
 (2, 4, 4, 1.6923076923076923),
 (9, 6, 2, 1.6923076923076923),
 (13, 3, 1, 1.6956521739130435),
 (13, 3, 4, 1.6956521739130435),
 (13, 3, 9, 1.6956521739130435),
 (13, 3, 10, 1.6956521739130435),
 (12, 6, 11, 1.7040816326530612),
 (3, 10, 0, 1.7040816326530612)]

In [17]:
# Hand-picked list positions per theme: (Late Republic, Principate, Late Antiquity).
# NOTE(review): these positions are hard-coded; topic numbering presumably
# changes between training runs unless the model is seeded, so confirm they
# still point at the intended topics.
for theme, (lr_pos, tp_pos, la_pos) in (('War', (8, 9, 4)), ('Religion', (12, 2, 12))):
    print(theme)
    print('Late Republic: ', LR_list[lr_pos], '\n')
    print('The Principate: ', TP_list[tp_pos], '\n')
    print('Late Antiquity: ', LA_list[la_pos], '\n')

War
Late Republic:  (8, ['homo', 'istic', 'lex', 'modus', 'nego', 'rogo', 'nescio', 'soleo', 'opus', 'puto', 'animus', 'uerus', 'scribo', 'consuetudo', 'iudex', 'refero', 'unus', 'aufero', 'facile', 'magnus', 'uerres', 'fortuna', 'domus', 'socius', 'testis', 'molestus', 'commoueo', 'pecco', 'laudo', 'probus']) 

The Principate:  (9, ['cohors', 'legio', 'acies', 'uinco', 'signum', 'aurum', 'ualeo', 'arma', 'uirtus', 'pugna', 'eques', 'hostis', 'proelium', 'equus', 'castra', 'argentum', 'dux', 'uis', 'copia', 'miles', 'proximus', 'agmen', 'corpus', 'auxilium', 'caecinus', 'sequor', 'germanus', 'robur', 'ala', 'uir']) 

Late Antiquity:  (4, ['scribo', 'potior', 'liber', 'respondeo', 'mitto', 'peto', 'deus', 'littera', 'parum', 'puto', 'gratia', 'quantum', 'communis', 'frater', 'sententia', 'epistula', 'accipio', 'rogo', 'uir', 'dominus', 'tempus', 'facile', 'decet', 'animus', 'sumo', 'officium', 'opus', 'status', 'indico', 'beneficium']) 

Religion
Late Republic:  (12, ['locus', 'caesar',