In [1]:
# PROIEL documentation: http://dev.syntacticus.org/development-guide/#alignments
#with open("proiel-treebank-20180408/latin-nt.conll","r") as  lat_file:
#    content = lat_file.readlines()
#    for line in content:
#        print(line.split("	"))

In [2]:
import pandas as pd
from conll_df import conll_df
import random

In [3]:
path = "proiel-treebank-20180408/latin-nt.conll"

# Load the treebank, discard the p/g/f columns and give the remaining
# single-letter columns readable names, all in one chain.
df = (
    conll_df(path, file_index=False)
    .drop(columns=["p", "g", "f"])
    .rename(columns={"w": "token", "l": "lemma", "n": "part-of-speech"})
)

# The frame now has three columns: token, lemma and the original POS tag
# (not yet mapped to UPOS).
# Index level "s" is the sentence number, "i" the token number within the sentence.
df.head(45)

Unnamed: 0_level_0,Unnamed: 1_level_0,token,lemma,part-of-speech
s,i,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,liber,liber,Nb
1,2,generationis,generatio,Nb
1,3,Iesu,Iesus,Ne
1,4,Christi,Christus,Ne
1,5,filii,filius,Nb
1,6,David,David,Ne
1,7,filii,filius,Nb
1,8,Abraham,Abraham,Ne
2,1,Abraham,Abraham,Ne
2,2,genuit,gigno,V-


In [4]:
# Turn the index levels into ordinary columns of a new frame.
df_reset = df.reset_index(drop=False)
df_reset.head(n=10)

Unnamed: 0,s,i,token,lemma,part-of-speech
0,1,1,liber,liber,Nb
1,1,2,generationis,generatio,Nb
2,1,3,Iesu,Iesus,Ne
3,1,4,Christi,Christus,Ne
4,1,5,filii,filius,Nb
5,1,6,David,David,Ne
6,1,7,filii,filius,Nb
7,1,8,Abraham,Abraham,Ne
8,2,1,Abraham,Abraham,Ne
9,2,2,genuit,gigno,V-


In [18]:
# Lookup table from the treebank's part-of-speech tags to UPOS tags.
# Some of the tags do not have a 1:1 counterpart, so several source tags
# share one UPOS value (e.g. every "P*" tag maps to PRON).
mapped_tags = {
    "A-": "ADJ",
    "C-": "CCONJ",
    "Df": "ADV",
    "Dq": "ADV",
    "Du": "ADV",
    "F": "X",
    "F-": "X",
    "G-": "SCONJ",
    "I-": "INTJ",
    "Ma": "NUM",
    "Mo": "NUM",
    # NOTE(review): confirm against the PROIEL tag set that "N-" should be VERB.
    "N-": "VERB",
    "Nb": "NOUN",
    "Ne": "PROPN",
    "Pc": "PRON",
    "Pd": "PRON",
    "Pi": "PRON",
    "Pk": "PRON",
    "Pp": "PRON",
    "Pr": "PRON",
    "Ps": "PRON",
    "Pt": "PRON",
    "Px": "PRON",
    "Py": "PRON",
    "R-": "ADP",
    "S-": "DET",
    "V-": "VERB",
    "X-": "X",
}


def mapping_pos_tags(df):
    """Return a copy of ``df`` with a ``UPOS`` column added.

    The new column is obtained by looking up every value of the
    ``part-of-speech`` column in ``mapped_tags``. Tags that are missing
    from ``mapped_tags`` yield NaN in ``UPOS``.

    The input frame is left untouched, so re-running the calling cell (or
    displaying the input frame again) never shows a silently modified frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``part-of-speech`` column.

    Returns
    -------
    pandas.DataFrame
        New frame with all columns of ``df`` plus ``UPOS`` (an existing
        ``UPOS`` column is replaced in the returned copy).

    Raises
    ------
    KeyError
        If ``df`` has no ``part-of-speech`` column.
    """
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(UPOS=df["part-of-speech"].map(mapped_tags))


# Build the UPOS-tagged frame and preview its first ten rows.
mapped_upos_df = mapping_pos_tags(df=df_reset)
print(mapped_upos_df.head(n=10))

   s  i         token      lemma part-of-speech   UPOS
0  1  1         liber      liber             Nb   NOUN
1  1  2  generationis  generatio             Nb   NOUN
2  1  3          Iesu      Iesus             Ne  PROPN
3  1  4       Christi   Christus             Ne  PROPN
4  1  5         filii     filius             Nb   NOUN
5  1  6         David      David             Ne  PROPN
6  1  7         filii     filius             Nb   NOUN
7  1  8       Abraham    Abraham             Ne  PROPN
8  2  1       Abraham    Abraham             Ne  PROPN
9  2  2        genuit      gigno             V-   VERB


In [19]:
# Draw a reproducible random sample of sentences from mapped_upos_df.
# The population of sentence ids is read from the frame itself rather than
# from a hard-coded range(1, 11852), so the sample cannot contain ids that
# are absent from the data.
SEED = 4
N_RANDOM_SENTENCES = 300

random.seed(SEED)

# Sorted, de-duplicated sentence ids as plain Python values. When the ids are
# exactly 1..11851 this is the same sequence as the former hard-coded range,
# so the seeded draw selects the same sentences as before.
sentence_ids = sorted(mapped_upos_df["s"].unique().tolist())

# random.sample raises ValueError when asked for more items than exist.
sample_size = min(N_RANDOM_SENTENCES, len(sentence_ids))
random_sentences = sorted(random.sample(sentence_ids, sample_size))
print(random_sentences)

# Keep every token row that belongs to one of the sampled sentences.
random_sent_df = mapped_upos_df[mapped_upos_df["s"].isin(random_sentences)]

random_sent_df

[54, 119, 154, 173, 215, 298, 325, 421, 433, 519, 561, 583, 604, 605, 664, 670, 678, 702, 709, 730, 734, 759, 841, 875, 906, 965, 973, 991, 1010, 1020, 1089, 1090, 1219, 1225, 1274, 1295, 1306, 1329, 1354, 1395, 1421, 1464, 1477, 1523, 1591, 1655, 1691, 1740, 1851, 1862, 1899, 1915, 2002, 2004, 2092, 2236, 2274, 2322, 2377, 2452, 2506, 2540, 2569, 2604, 2644, 2653, 2701, 2711, 2741, 2764, 2779, 2829, 2909, 2913, 3010, 3076, 3134, 3137, 3170, 3197, 3205, 3256, 3282, 3305, 3306, 3321, 3365, 3396, 3407, 3409, 3456, 3461, 3513, 3527, 3535, 3581, 3590, 3637, 3643, 3750, 3793, 3822, 3868, 3890, 3963, 3972, 4053, 4073, 4078, 4081, 4097, 4152, 4168, 4210, 4217, 4255, 4265, 4288, 4289, 4396, 4415, 4453, 4527, 4534, 4547, 4588, 4589, 4595, 4681, 4722, 4735, 4742, 4746, 4757, 4767, 4784, 4785, 4799, 4808, 4854, 4866, 4920, 4970, 4999, 5077, 5099, 5108, 5154, 5162, 5254, 5267, 5271, 5289, 5301, 5326, 5360, 5418, 5425, 5429, 5515, 5529, 5547, 5618, 5684, 5830, 5851, 5903, 5941, 5943, 5953, 5984, 61

Unnamed: 0,s,i,token,lemma,part-of-speech,UPOS
329,54,1,ecce,ecce,I-,INTJ
330,54,2,virgo,virgo,Nb,NOUN
331,54,3,in,in,R-,ADP
332,54,4,utero,uterus,Nb,NOUN
333,54,5,habebit,habeo,V-,VERB
...,...,...,...,...,...,...
109984,11754,2,dicit,dico,V-,VERB
110515,11817,1,et,et,C-,CCONJ
110516,11817,2,ecce,ecce,I-,INTJ
110517,11817,3,venio,venio,V-,VERB


In [21]:
# Write the sampled tokens as "<token>\t<UPOS>" lines, one token per line.
# The encoding is pinned so the file does not depend on the platform's locale
# default for open().
# NOTE(review): no separator is written between sentences; confirm that the
# consumer of this file does not need sentence boundaries.
with open("data/proiel-random-300.txt", "w", encoding="utf-8") as outfile:
    # Walk the two columns directly instead of materialising a Series per row
    # with iterrows(). Both values are expected to be strings; a missing UPOS
    # (NaN) still raises TypeError here, as it did before.
    for token, upos in zip(random_sent_df["token"], random_sent_df["UPOS"]):
        outfile.write(token + "\t" + upos + "\n")
    