In [6]:
# Treebank text files used below: https://github.com/PerseusDL/treebank_data/tree/master/v2.1/Latin/texts
# Perseus Latin treebank (v2.1) repository: https://github.com/PerseusDL/treebank_data/tree/master/v2.1/Latin
# Example file: https://github.com/PerseusDL/treebank_data/blob/master/v2.1/Latin/texts/phi0631.phi001.perseus-lat1.tb.xml

In [46]:
import pandas as pd
import random
import xml.etree.ElementTree as et 

In [47]:
# Treebank files to draw from; the samples of all files are combined later.
filenames = [
    "tlg0031.tlg027.perseus-lat1.tb.xml",
    "phi1351.phi005.perseus-lat1.tb.xml",
    "phi1348.abo012.perseus-lat1.tb.xml",
    "phi1221.phi007.perseus-lat1.tb.xml",
    "phi0975.phi001.perseus-lat1.tb.xml",
    "phi0972.phi001.perseus-lat1.xml",
    "phi0959.phi006.perseus-lat1.tb.xml",
    "phi0690.phi003.perseus-lat1.tb.xml",
    "phi0631.phi001.perseus-lat1.tb.xml",
    "phi0620.phi001.perseus-lat1.tb.xml",
    "phi0474.phi013.perseus-lat1.tb.xml",
    "phi0448.phi001.perseus-lat1.tb.xml",
]

# 12 files and a target of 300 sentences --> 300 / 12 = 25 sentences per file

In [50]:
# Accumulates the (token, postag) pairs of the sampled sentences from all files.
random_sents = []
# Fixed seed so the drawn sentence indices are reproducible on a fresh run.
random.seed(4)
def extract_samples(filename, n_samples=25):
    """Sample sentences from one treebank XML file and collect their tokens.

    Every child element of the file's ``<body>`` is treated as one sentence;
    its ``<word>`` children supply the ``form`` and ``postag`` attributes.
    Up to ``n_samples`` distinct sentences are drawn at random and one
    ``(form, postag)`` tuple per word is appended to the module-level
    ``random_sents`` list, sentence by sentence in the drawn order.

    Args:
        filename: Path of the treebank XML file to parse.
        n_samples: Number of sentences to draw. If the file holds fewer
            sentences, all of them are used (in random order) instead of
            failing.

    Raises:
        ValueError: If the document has no ``<body>`` element.
    """
    root = et.parse(filename).getroot()
    body = root.find("body")
    if body is None:
        # Without this check the loop below would fail with an opaque
        # "'NoneType' object is not iterable".
        raise ValueError(
            "{}: no <body> element found under <{}>".format(filename, root.tag)
        )

    # One list of (form, postag) pairs per sentence; missing attributes are
    # stored as None.
    sentences = [
        [(word.attrib.get("form"), word.attrib.get("postag"))
         for word in node.findall("word")]
        for node in body
    ]

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of sentences actually available.
    sample_size = min(n_samples, len(sentences))
    if sample_size < n_samples:
        print("{}: only {} sentences available, sampling all of them".format(
            filename, len(sentences)))

    # Draw unique random sentence indices
    random_indices = random.sample(range(len(sentences)), sample_size)

    print(random_indices)

    # Flatten the chosen sentences into the shared token list
    for sentence_index in random_indices:
        random_sents.extend(sentences[sentence_index])

    # NOTE: this previews the start of the shared list, so after the first
    # file it shows the same ten tuples on every call.
    print(random_sents[:10])

# Run the sampler over every treebank file; each call appends its sampled
# (token, postag) pairs to the shared `random_sents` list.
for file in filenames:
    extract_samples(file)
    

[241, 310, 105, 405, 490, 158, 92, 68, 20, 411, 562, 296, 60, 227, 532, 549, 368, 283, 176, 108, 268, 219, 26, 266, 278]
[('et', 'c--------'), ('cum', 'c--------'), ('locuta', 'v-prppnn-'), ('fuissent', 'v3plsa---'), ('septem', 'm--------'), ('tonitrua', 'n-p---nn-'), ('scripturus', 'v-sfpamn-'), ('eram', 'v1siia---'), (';', 'u--------'), ('et', 'c--------')]
[49, 42, 79, 74, 160, 187, 95, 22, 155, 86, 171, 99, 129, 63, 45, 183, 121, 71, 189, 140, 76, 1, 193, 146, 194]
[('et', 'c--------'), ('cum', 'c--------'), ('locuta', 'v-prppnn-'), ('fuissent', 'v3plsa---'), ('septem', 'm--------'), ('tonitrua', 'n-p---nn-'), ('scripturus', 'v-sfpamn-'), ('eram', 'v1siia---'), (';', 'u--------'), ('et', 'c--------')]
[260, 99, 211, 216, 306, 147, 220, 231, 82, 119, 156, 132, 22, 41, 23, 236, 320, 143, 265, 273, 331, 241, 175, 74, 344]
[('et', 'c--------'), ('cum', 'c--------'), ('locuta', 'v-prppnn-'), ('fuissent', 'v3plsa---'), ('septem', 'm--------'), ('tonitrua', 'n-p---nn-'), ('scripturus', 'v

In [53]:
# One row per sampled token, paired with its treebank postag string.
df = pd.DataFrame(data=random_sents, columns=["token", "pos"])
df

Unnamed: 0,token,pos
0,et,c--------
1,cum,c--------
2,locuta,v-prppnn-
3,fuissent,v3plsa---
4,septem,m--------
...,...,...
5546,locum,n-s---ma-
5547,nostri,p-p---mn-
5548,castris,n-p---nd-
5549,delegerant,v3plia---


In [None]:
# Next, map the Perseus POS tags to UPOS tags. The first character of each postag encodes the part of speech:

- n	noun
- v	verb
- a	adjective
- d	adverb
- c	conjunction
- r	adposition
- p	pronoun
- m	numeral
- i	interjection
- e	exclamation
- u	punctuation


In [54]:
# The first character of a postag is the part of speech; translate it to a
# UPOS-style label.

# Conjunctions are not split into SCONJ and CCONJ here, they all become CONJ.
# The "None" and "-" entries mark tokens without a usable tag.
mapped = dict([
    ("n", "NOUN"),
    ("v", "VERB"),
    ("a", "ADJ"),
    ("d", "ADV"),
    ("c", "CONJ"),
    ("r", "ADP"),
    ("p", "PRON"),
    ("m", "NUM"),
    ("i", "INTJ"),
    ("e", "X"),
    ("u", "PUNCT"),
    ("None", "None"),
    ("-", "None"),
])

def extract_first_letter(pos):
    """Return the first character of a postag string, i.e. its part-of-speech code.

    Args:
        pos: The postag value of one token. May be missing.

    Returns:
        ``pos[0]`` for a non-empty string; the literal string ``"None"`` for
        ``None``, the empty string, or any non-string value (for example the
        float NaN pandas uses for missing data, which has no ``len``).
    """
    if isinstance(pos, str) and pos:
        return pos[0]
    return "None"


def map_postags(df):
    """Add ``first_pos`` and ``upos`` columns to ``df`` and return it.

    ``first_pos`` is ``extract_first_letter`` applied to each ``pos`` value;
    ``upos`` is that letter looked up in ``mapped``. The frame passed in is
    modified in place and is also the return value.
    """
    first_letters = df["pos"].apply(extract_first_letter)
    df["first_pos"] = first_letters
    df["upos"] = first_letters.map(mapped)
    return df
        
def clean_none(df):
    """Return the rows of ``df`` that carry a usable ``upos`` label.

    Rows are dropped when ``upos`` is the placeholder string ``'None'`` or is
    missing (NaN). A missing value compares unequal to ``'None'``, so it has
    to be filtered explicitly; otherwise it would pass through as a non-string
    ``upos``. The input frame is not modified.

    Args:
        df: DataFrame with a ``upos`` column.

    Returns:
        The filtered DataFrame, keeping the original index labels.
    """
    has_label = df["upos"].notna() & (df["upos"] != "None")
    return df[has_label]


# Tag every token, then drop the rows whose tag is the 'None' placeholder.
mapped_upos_df = df.pipe(map_postags).pipe(clean_none)
print(mapped_upos_df.head(10))

        token        pos first_pos   upos
0          et  c--------         c   CONJ
1         cum  c--------         c   CONJ
2      locuta  v-prppnn-         v   VERB
3    fuissent  v3plsa---         v   VERB
4      septem  m--------         m    NUM
5    tonitrua  n-p---nn-         n   NOUN
6  scripturus  v-sfpamn-         v   VERB
7        eram  v1siia---         v   VERB
8           ;  u--------         u  PUNCT
9          et  c--------         c   CONJ


In [55]:
# Display the filtered token / UPOS table.
mapped_upos_df

Unnamed: 0,token,pos,first_pos,upos
0,et,c--------,c,CONJ
1,cum,c--------,c,CONJ
2,locuta,v-prppnn-,v,VERB
3,fuissent,v3plsa---,v,VERB
4,septem,m--------,m,NUM
...,...,...,...,...
5546,locum,n-s---ma-,n,NOUN
5547,nostri,p-p---mn-,p,PRON
5548,castris,n-p---nd-,n,NOUN
5549,delegerant,v3plia---,v,VERB


In [57]:
# Write one "token<TAB>upos" line per row. The encoding is pinned to UTF-8 so
# the file content does not depend on the platform's default locale encoding.
with open("data/perseus-random-300.txt", "w", encoding="utf-8") as outfile:
    for token, upos in zip(mapped_upos_df["token"], mapped_upos_df["upos"]):
        outfile.write(str(token) + "\t" + upos + "\n")
    