In [4]:
import numpy as np
from sklearn.manifold import TSNE
import seaborn as sns 
import matplotlib.pyplot as plt
import pandas as pd
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.corpora import Dictionary
import ast
import json

# Opening the Dataframe 

In [5]:
# Parse the stringified token lists back into Python lists while loading.
# NOTE(review): the frame displayed below has a 'tokenized_text' column but no
# 'tokenized_norm' column, so the second converter may never fire -- confirm.
list_converters = {'tokenized_lemmas': ast.literal_eval,
                   'tokenized_norm': ast.literal_eval}
df = pd.read_csv('final_corpus.csv', converters=list_converters)

# Keep only the rows whose lemma list holds more than two tokens.
mask = df['tokenized_lemmas'].map(len).gt(2).tolist()
df = df[mask]

# Drop the two unnamed columns (presumably index columns left over from
# earlier CSV exports -- verify against the file).
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
df.head(1)

Unnamed: 0,period,author,title,text,tokenized_text,lemma,tokenized_lemmas
0,Late_Antiquity,Agennius Ubricus,De Controuersiis Agrorum,aduersantur ne quid in rerum natura finitum ...,"['aduersantur', 'ne', 'quid', 'in', 'rerum', '...",aduersor ne quis in res natura finio sum uideo .,"[aduersor, ne, quis, in, res, natura, finio, s..."


In [6]:
# Load the Latin stop-lemma collection and turn it into a set so that every
# membership test in the cleaning loop is a hash lookup.
with open('stopwords_latin_lemmas.json', 'r', encoding='utf-8') as f:
    stopwords = json.load(f)
stopword_set = set(stopwords)

# The f-strings use double quotes outside and single quotes inside: reusing the
# same quote character inside the braces is only valid from Python 3.12 on.
# .iloc[0] picks the first remaining row by position; a label lookup with [0]
# would fail if the row labelled 0 had been filtered out earlier.
print(f"Pre-cleaned: {df['tokenized_lemmas'].iloc[0]}")

# Slice assignment rewrites each list in place, so the list objects stored in
# the DataFrame are updated without reassigning the column.
for sents in df['tokenized_lemmas']:
    sents[:] = [lemma for lemma in sents if lemma not in stopword_set]

print(f"Post-cleaned: {df['tokenized_lemmas'].iloc[0]}")

Pre-cleaned: ['aduersor', 'ne', 'quis', 'in', 'res', 'natura', 'finio', 'sum', 'uideo', '.']
Post-cleaned: ['aduersor', 'natura', 'finio']


# Organizing the Data

In [7]:
# How many rows carry each period label?
period_counts = df['period'].value_counts()
print("Distribution of items in dataset based on period labels:\n", period_counts)

Distribution of items in dataset based on period labels:
 period
The_Principate         136053
Late_Republican_Era    108000
Late_Antiquity          63699
Name: count, dtype: int64


In [8]:
# Split the corpus into one sub-frame per period label.
LR = df[df['period'].eq('Late_Republican_Era')]
TP = df[df['period'].eq('The_Principate')]
LA = df[df['period'].eq('Late_Antiquity')]

# Training the Word2Vec model and Getting Vocabularies

In [9]:
# Hyper-parameters shared by the three period models.
vector_size = 300
window = 5
min_count = 50
epochs = 50
w2v_params = dict(vector_size=vector_size, window=window,
                  min_count=min_count, epochs=epochs)

# Train one Word2Vec model per period on that period's lemma lists.
# Note: the following cell rebinds these three names to models loaded from disk.
LR_model = Word2Vec(sentences=LR['tokenized_lemmas'].tolist(), **w2v_params)
print('LR complete')

TP_model = Word2Vec(sentences=TP['tokenized_lemmas'].tolist(), **w2v_params)
print('TP complete')

LA_model = Word2Vec(sentences=LA['tokenized_lemmas'].tolist(), **w2v_params)
print('LA complete')

LR complete
TP complete
LA complete


In [10]:
# Locations of the persisted per-period models.
LR_MODEL_PATH = 'Word2Vec models/LR_W2V_model'
TP_MODEL_PATH = 'Word2Vec models/TP_W2V_model'
LA_MODEL_PATH = 'Word2Vec models/LA_W2V_model'

# To refresh the files on disk after retraining, run once:
#   LR_model.save(LR_MODEL_PATH)
#   TP_model.save(TP_MODEL_PATH)
#   LA_model.save(LA_MODEL_PATH)

# Load the persisted models. This rebinds LR_model / TP_model / LA_model, so
# whatever those names held before this cell is discarded.
LR_model = Word2Vec.load(LR_MODEL_PATH)
TP_model = Word2Vec.load(TP_MODEL_PATH)
LA_model = Word2Vec.load(LA_MODEL_PATH)

In [11]:
# Vocabulary of each model: the keys of its key_to_index mapping, as a set.
LR_vocab = set(LR_model.wv.key_to_index)
TP_vocab = set(TP_model.wv.key_to_index)
LA_vocab = set(LA_model.wv.key_to_index)

In [12]:
# Worked example: nearest neighbours of one target word in each period.
word = 'religio'

# The 10 most similar (word, score) pairs according to each period's model.
LR_word = LR_model.wv.most_similar(word, topn=10)
TP_word = TP_model.wv.most_similar(word, topn=10)
LA_word = LA_model.wv.most_similar(word, topn=10)

# Print only the neighbour words, alphabetically, one line per period.
for neighbours in (LR_word, TP_word, LA_word):
    print(sorted(w for w, _ in neighbours))

['amplitudo', 'ciuis', 'ciuitas', 'deus', 'homo', 'iudicandus', 'iudicium', 'nefarius', 'pristinus', 'scelus']
['benignitas', 'caerimonia', 'deus', 'diuinitas', 'iustitia', 'numen', 'religiosus', 'sacerdos', 'sanctissimus', 'ueritas']
['christianus', 'ecclesia', 'ethnicus', 'fides', 'initio', 'professio', 'religiosus', 'sacrilegus', 'sacrum', 'superstitio']


# Jaccard Similarity

In [13]:
# Jaccard similarity between two neighbour lists: 1.0 means the two word sets
# are identical, 0.0 means they share nothing.

def j_sim(word1, word2):
    """Return the Jaccard similarity of the words in two neighbour lists.

    Parameters
    ----------
    word1, word2 : iterable of (word, score) pairs
        Only the first element of each pair is used; scores are ignored.

    Returns
    -------
    float
        ``len(A & B) / len(A | B)`` where ``A`` and ``B`` are the word sets.
        Returns 0.0 when both inputs are empty, since there is no shared
        neighbour to speak of (and the ratio would otherwise divide by zero).
    """
    set1 = {pair[0] for pair in word1}
    set2 = {pair[0] for pair in word2}
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Comparisons

In [14]:
# Jaccard comparison of one word's nearest neighbours in two models.
def compare(x, model1, model2, topn=100):
    """Compare the nearest neighbours of ``x`` in two Word2Vec models.

    Parameters
    ----------
    x : str
        Target word; it must be in the vocabulary of both models.
    model1, model2
        Models exposing ``.wv.most_similar(word, topn=...)``.
    topn : int, default 100
        Number of nearest neighbours requested from each model.

    Returns
    -------
    tuple
        ``(x, similarity, shared, nn_1, nn_2)`` where ``similarity`` is the
        value returned by ``j_sim`` for the two neighbour lists, ``shared`` is
        the set of neighbour words common to both models, and ``nn_1`` /
        ``nn_2`` are tuples of neighbour words in the order each model
        returned them.
    """
    w1 = model1.wv.most_similar(x, topn=topn)
    w2 = model2.wv.most_similar(x, topn=topn)
    similarity = j_sim(w1, w2)
    # Keep only the words; the similarity scores are not reported.
    nn_1 = tuple(neighbour for neighbour, _ in w1)
    nn_2 = tuple(neighbour for neighbour, _ in w2)
    nn_intersection = set(nn_1).intersection(nn_2)
    return (x, similarity, nn_intersection, nn_1, nn_2)

In [15]:
# Builds one comparison record per word that both vocabularies share.
# Despite the name, the result is a plain list of tuples; the caller wraps it
# in a DataFrame.

def create_comparison_df(vocab1, vocab2, model1, model2):
    """Run ``compare`` on every word found in both vocabularies.

    Parameters
    ----------
    vocab1, vocab2 : set
        Vocabularies of the two models.
    model1, model2
        The models handed through to ``compare``.

    Returns
    -------
    list
        One ``compare`` result per shared word, in set-iteration order
        (i.e. not sorted).
    """
    shared_vocab = vocab1.intersection(vocab2)
    return [compare(term, model1, model2) for term in shared_vocab]


#### Late Republican and The Principate

In [16]:
# One record per word shared by the LR and TP vocabularies; sorting the tuples
# orders them by their first field, the word.
data = sorted(create_comparison_df(LR_vocab, TP_vocab, LR_model, TP_model))
LR_TP_columns = ['word', 'jaccard_sim', 'shared_set', 'LR_set', 'TP_set']
LR_TP_comp = pd.DataFrame(data, columns=LR_TP_columns)

# Show the ten words with the highest Jaccard similarity.
LR_TP_comp.sort_values(by='jaccard_sim', ascending=False).head(10)

Unnamed: 0,word,jaccard_sim,shared_set,LR_set,TP_set
85,aequor,0.282051,"{gurges, carina, aether, aura, amnis, fretum, ...","(unda, litus, uelum, aether, puppis, gurges, f...","(gurges, fluctus, aether, unda, ratis, boreas,..."
2522,unda,0.25,"{gurges, carina, aether, aura, amnis, fretum, ...","(aequor, uastus, altus, amnis, aruum, tellus, ...","(gurges, boreas, aequor, uadum, fluctus, ratis..."
239,auster,0.242236,"{gurges, aether, aestas, aura, fretum, nubes, ...","(aquilo, nubilum, nubes, imber, nimbus, uentus...","(aquilo, eurus, boreas, uentus, nubilum, fretu..."
1791,pontus,0.242236,"{gurges, aether, amnis, fretum, stagnum, tango...","(tellus, fluctus, unda, aegyptus, auster, aqui...","(mare, fretum, aequor, thracius, unda, boreas,..."
1459,mons,0.242236,"{lacus, lucus, aether, amnis, stagnum, fretum,...","(uallis, silua, lucus, flumen, ripa, saxum, am...","(collis, amnis, uertex, alpis, scopulus, campu..."
1980,ratis,0.234568,"{gurges, carina, aura, fretum, stagnum, portus...","(aequor, puppis, unda, gurges, rapidus, uentus...","(carina, fluctus, puppis, aequor, remus, unda,..."
2338,terra,0.234568,"{aether, herba, ignis, amnis, nubes, aequor, s...","(tellus, regio, aestus, aruum, mundus, sol, po...","(tellus, sidus, mundus, auster, astrum, caelum..."
1562,nubilum,0.234568,"{aether, aura, nubes, aequor, umbra, umor, ful...","(nimbus, nubes, auster, aquilo, cauus, imber, ...","(nubes, polus, nimbus, radius, stella, caligo,..."
2317,telum,0.226994,"{arcus, sagitta, hostis, umerus, cornu, moenia...","(hasta, sagitta, turnus, ferrum, clipeus, ensi...","(spiculum, hasta, sagitta, ferrum, ensis, cusp..."
893,fluctus,0.219512,"{gurges, carina, aether, aura, fretum, saxum, ...","(aequor, unda, gurges, uentus, pontus, uelum, ...","(ratis, aequor, unda, pelagus, fretum, procell..."


#### The Principate and Late Antiquity

In [17]:
# One record per word shared by the TP and LA vocabularies; sorting the tuples
# orders them by their first field, the word.
data = sorted(create_comparison_df(TP_vocab, LA_vocab, TP_model, LA_model))
TP_LA_columns = ['word', 'jaccard_sim', 'shared_set', 'TP_set', 'LA_set']
TP_LA_comp = pd.DataFrame(data, columns=TP_LA_columns)

# Show the ten words with the highest Jaccard similarity.
TP_LA_comp.sort_values(by='jaccard_sim', ascending=False).head(10)

Unnamed: 0,word,jaccard_sim,shared_set,TP_set,LA_set
887,fretum,0.257862,"{gurges, carina, amnis, stagnum, portus, aequo...","(mare, pontus, aequor, pelagus, litus, gurges,...","(carina, puppis, pelagus, aequor, alpes, stagn..."
1512,nubes,0.242236,"{gurges, meatus, aether, aura, uolito, saxum, ...","(nubilum, nimbus, aether, aequor, imber, polus...","(nubilum, ros, meatus, fumo, imber, ala, uerte..."
1318,mare,0.242236,"{gurges, lacus, carina, amnis, fretum, stagnum...","(pontus, fretum, pelagus, litus, syrtis, aequo...","(pontus, classis, uolatilis, litus, meo, pelag..."
59,aequor,0.234568,"{gurges, carina, amnis, stagnum, fretum, portu...","(gurges, fluctus, aether, unda, ratis, boreas,...","(pelagus, stagnum, carina, fretum, unda, litus..."
79,agmen,0.234568,"{rhenus, castra, hostis, moenia, telum, nubes,...","(arma, turma, uallum, castra, campus, alpes, l...","(turma, stipo, eques, caterua, cohors, castra,..."
1265,litus,0.226994,"{gurges, lacus, amnis, stagnum, fretum, portus...","(mare, fretum, aequor, syrtis, gurges, classis...","(oceanus, classis, insula, europa, libycus, ca..."
213,auster,0.226994,"{carina, meatus, aestas, aura, fretum, stagnum...","(aquilo, eurus, boreas, uentus, nubilum, fretu...","(libycus, asia, aestas, uentus, nimbus, axis, ..."
2430,uentus,0.219512,"{gurges, meatus, aether, aura, stagnum, fretum...","(flatus, auster, eurus, procella, ratis, aquil...","(procella, aura, unda, auster, nubilum, puppis..."
2303,telum,0.219512,"{monstrum, campus, arcus, sagitta, cornu, spic...","(spiculum, hasta, sagitta, ferrum, ensis, cusp...","(hasta, clipeus, iaculus, ensis, mucro, ile, t..."
963,gurges,0.219512,"{lacus, amnis, stagnum, fretum, aequor, litus,...","(unda, aequor, fretum, uadum, litus, amnis, fo...","(amnis, nilus, tigris, stagnum, lacus, fumo, n..."


#### Final Comparison Dataset

In [18]:
# Build the final comparison table in one chain:
#   1. inner-join the two pairwise tables on the words present in both
#      (pandas suffixes the overlapping column names with _x / _y),
#   2. keep a single copy of the TP neighbour list,
#   3. give every column a period-specific name,
#   4. put the columns in LR -> TP -> LA reading order.
final_comp = (
    LR_TP_comp
    .merge(TP_LA_comp, on='word', how='inner')
    .drop(columns='TP_set_x')
    .rename(columns={
        'jaccard_sim_x': 'LR_TP_sim',
        'shared_set_x': 'LR_TP_shared_set',
        'TP_set_y': 'TP_set',
        'jaccard_sim_y': 'TP_LA_sim',
        'shared_set_y': 'TP_LA_shared_set',
    })
    .loc[:, ['word', 'LR_set', 'LR_TP_sim', 'LR_TP_shared_set',
             'TP_set', 'TP_LA_sim', 'TP_LA_shared_set', 'LA_set']]
)

In [20]:
# Import the word lists for the domains of interest. The file holds a Python
# literal; its first element is the list printed under "war" and used to build
# the `war` frame, its second is the civics/religion list.
# The `with` block guarantees the file handle is closed after reading.
with open('Standard Vocab testing list', 'r') as vocab_file:
    domain_vocab_list = ast.literal_eval(vocab_file.read())
war_list = domain_vocab_list[0]
civics_list = domain_vocab_list[1]

print("war: ", war_list)
print("civics/relgion: ", civics_list)

# Rows of the final comparison table for each domain, most stable words
# (highest LR->TP similarity) first.
war = final_comp.loc[final_comp['word'].isin(war_list)].sort_values(by='LR_TP_sim', ascending=False)
civics = final_comp.loc[final_comp['word'].isin(civics_list)].sort_values(by='LR_TP_sim', ascending=False)

war:  ['gens', 'amo', 'duo', 'uirtus', 'imperator', 'timeo', 'urbs', 'oppidum', 'grauis', 'rex', 'iubeo', 'ira', 'praesidium', 'paro', 'hostis', 'bellum', 'prouincia', 'reliquus', 'socius', 'mens', 'cohors', 'mitto', 'acies', 'miles', 'caesar', 'magnus', 'arma', 'manus', 'rapio', 'princeps', 'pompeius', 'romanus', 'locus', 'consilium', 'iter', 'nauis', 'proelium', 'cognosco', 'legatus', 'uinco', 'uotum', 'signum', 'proficiscor', 'uirgo', 'auxilium', 'pax', 'fortis', 'pugno', 'legio', 'ciuilis', 'populus', 'uictor', 'uir', 'relinquo', 'agmen', 'castra', 'imperium', 'eques', 'paucus', 'uis', 'tempus', 'copia', 'armo', 'dux', 'dies', 'capio', 'exercitus']
civics/relgion:  ['deus', 'pater', 'permitto', 'uirtus', 'ciuis', 'ius', 'paulinus', 'augustinus', 'praetor', 'bonus', 'modus', 'iubeo', 'rogo', 'gero', 'christus', 'epistola', 'filius', 'liber', 'epistula', 'consul', 'bellum', 'uxor', 'hereditas', 'mitto', 'magnus', 'pietas', 'uenerabilis', 'seruus', 'princeps', 'sidonius', 'romanus', '

In [21]:
# Main domain of study: show the first five rows of the `civics` frame,
# which was sorted by LR_TP_sim (descending) when it was built.
civics.head(5)

Unnamed: 0,word,LR_set,LR_TP_sim,LR_TP_shared_set,TP_set,TP_LA_sim,TP_LA_shared_set,LA_set
1447,publicus,"(publica, ciuitas, senatus, communis, magistra...",0.183432,"{legatio, pecunia, praetor, decerno, plebs, iu...","(publica, princeps, iudicium, honestus, inimic...",0.081081,"{iudicium, honor, publica, priuatus, ius, cons...","(municipium, prouincia, iniuria, priuo, romanu..."
1625,senatus,"(legatus, consul, publicus, aerarium, decerno,...",0.183432,"{metellus, tribunus, legatio, quaestor, senten...","(magistratus, princeps, consul, postulo, edict...",0.041667,"{princeps, publicus, consul, forum, romanus, c...","(curia, tribunal, praefectus, pontifex, procer..."
1280,pecunia,"(hs, ciuitas, nummus, improbissimus, nomen, ob...",0.162791,"{emo, aerarium, praetor, lex, lucrum, obses, f...","(sestertium, uectigalis, creditor, fiscus, lar...",0.104972,"{emo, pretium, possideo, debitum, lucrum, merc...","(thesaurus, auarus, beneficium, pretium, commo..."
341,consul,"(censor, senatus, ianuarius, consularis, consu...",0.162791,"{legatio, sulla, marius, quaestor, necessitudo...","(consulatus, senatus, ualerius, consularis, de...",0.052632,"{annus, legatus, princeps, senatus, caesar, pr...","(curulis, fastus, procerus, auus, fascis, trib..."
1909,uirtus,"(laus, prudentia, amplitudo, fortus, scientia,...",0.149425,"{studium, eloquentia, uis, superbia, egregius,...","(animus, ars, gloria, bonus, fortitudo, laus, ...",0.136364,"{studium, summum, uis, scientia, potentia, inu...","(fortitudo, deus, fides, deuotio, justitia, im..."


In [22]:
# Case study 1: rows of the comparison table for two religious terms.
religion_terms = ['deus', 'dominus']
is_religion_term = final_comp['word'].isin(religion_terms)
religion = final_comp[is_religion_term]
religion

Unnamed: 0,word,LR_set,LR_TP_sim,LR_TP_shared_set,TP_set,TP_LA_sim,TP_LA_shared_set,LA_set
451,deus,"(dea, numen, divos, rite, caelestis, iunona, s...",0.136364,"{pius, genitor, da, dea, iuppiter, sanctus, sa...","(superus, numen, diuino, iouus, diuinitas, uen...",0.06383,"{christus, aeternus, diuinus, diuinitas, hosti...","(dominus, christus, homo, filius, ueritas, diu..."
500,dominus,"(fundus, emptor, lar, nummus, annuus, aratio, ...",0.069519,"{possessio, colonus, seruus, emptor, praedium,...","(traianum, contentus, hereditas, rogo, libertu...",0.036269,"{deus, seruus, libero, beatus, pauper, nolo, h...","(deus, christus, sanctus, filius, gratia, bonu..."


In [23]:
# Case study 2: the comparison-table row for 'ciuis'.
civis = final_comp[final_comp['word'].eq('ciuis')]
civis

Unnamed: 0,word,LR_set,LR_TP_sim,LR_TP_shared_set,TP_set,TP_LA_sim,TP_LA_shared_set,LA_set
249,ciuis,"(populus, homo, ciuitas, publicus, socius, uir...",0.086957,"{eques, publicus, amicus, populus, ius, condic...","(aelius, sentia, populus, ius, tullius, pater,...",0.058201,"{hospes, princeps, amicus, ius, familia, nobil...","(imperator, peregrinor, rus, urbs, frater, mat..."


In [24]:
# Gather the extremes of both similarity columns to look for patterns.
# Each column is ranked once (descending); head/tail then give its top and
# bottom ten rows.
by_lr_tp = final_comp.sort_values(by='LR_TP_sim', ascending=False)
by_tp_la = final_comp.sort_values(by='TP_LA_sim', ascending=False)

# Ten highest-scoring words per comparison, stacked into one frame.
x, y = by_lr_tp.head(10), by_tp_la.head(10)
highest_scores = pd.concat([x, y])

# Ten lowest-scoring words per comparison, stacked into one frame.
a, b = by_lr_tp.tail(10), by_tp_la.tail(10)
lowest_scores = pd.concat([a, b])

In [25]:
# Row for 'aequor', singled out here as the word with the highest similarity
# across all periods.
highest = final_comp[final_comp['word'].eq('aequor')]
highest

Unnamed: 0,word,LR_set,LR_TP_sim,LR_TP_shared_set,TP_set,TP_LA_sim,TP_LA_shared_set,LA_set
46,aequor,"(unda, litus, uelum, aether, puppis, gurges, f...",0.282051,"{gurges, carina, aether, aura, amnis, fretum, ...","(gurges, fluctus, aether, unda, ratis, boreas,...",0.234568,"{gurges, carina, amnis, stagnum, fretum, portu...","(pelagus, stagnum, carina, fretum, unda, litus..."


In [26]:
# Row for 'nusquam', singled out here as the word with the lowest score
# across all periods.
lowest = final_comp[final_comp['word'].eq('nusquam')]
lowest

Unnamed: 0,word,LR_set,LR_TP_sim,LR_TP_shared_set,TP_set,TP_LA_sim,TP_LA_shared_set,LA_set
1178,nusquam,"(usquam, lucullus, clamo, imperatum, latinus, ...",0.0,{},"(syrtis, porticus, dies, longissimus, patulus,...",0.0,{},"(eruo, siccus, conor, grauior, exspectatio, sa..."
