# Transform TreeTagger tags to UPOS tags

In [1]:
import pandas as pd

In [47]:
# Preview the raw TreeTagger output: one token per line, with the token,
# its tag and its lemma as whitespace-separated fields.
with open("TreeTagger_tokenized.txt", "r") as infile:
    content = infile.readlines()
# Slice instead of indexing so that a file with fewer than three lines does
# not raise IndexError. Every line keeps its trailing newline, which is why
# the printed preview shows a blank line after each entry.
for line in content[:3]:
    print(line)

Augustinum	N:acc	Augustinus

vero	ADV	vero|verus

suam	POSS	suus



In [48]:
# manual for the TreeTagger (Lamap) tagset that is mapped to UPOS tags: https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/Lamap-Tagset.pdf

In [49]:
# Split every line on whitespace so that token, tag and lemma become separate fields.
splitted = [line.split() for line in content]
# A row may carry an additional fourth field; it is not needed for this task,
# so such rows are printed for inspection and cut down to their first three fields.
for i, element in enumerate(splitted):
    if len(element) > 3:
        print(element)
        splitted[i] = element[:3]

['Quartus', 'ADJ:NUM', 'quattuor', '-a']


In [50]:
# Assemble the split rows into a DataFrame with one column per field.
df = pd.DataFrame(data=splitted, columns=["Token", "Tags", "Lemma"])
df.head()

Unnamed: 0,Token,Tags,Lemma
0,Augustinum,N:acc,Augustinus
1,vero,ADV,vero|verus
2,suam,POSS,suus
3,probasse,V:INF,probo
4,intelligentiam,N:acc,intelligentia


In [56]:
# Lookup table used to translate the TreeTagger tags into UPOS tags.
#
# Some of the source tags have no 1:1 UPOS counterpart and not every UPOS tag
# appears in this table; several assignments follow the author's own
# definitions/intuition.
# NOTE(review): "INT" is mapped to INTJ alongside "EXCL" -- confirm against
# the tagset manual that "INT" really denotes interjections.
map_tags = {
    # all ESSE:* and V:* tags collapse to VERB
    **dict.fromkeys(
        [
            "ESSE:IND", "ESSE:SUB", "ESSE:INF",
            "V:IND", "V:SUB", "V:INF", "V:GER", "V:GED",
            "V:PTC:nom", "V:PTC:acc", "V:PTC:abl", "V:PTC",
            "V:SUP:acc", "V:SUP:abl", "V:IMP",
        ],
        "VERB",
    ),
    # pronoun-like tags
    **dict.fromkeys(["PRON", "REL", "POSS", "DIMOS", "INDEF"], "PRON"),
    # every N:<case> tag collapses to NOUN
    **dict.fromkeys(
        ["N:nom", "N:dat", "N:gen", "N:loc", "N:acc", "N:abl", "N:voc"],
        "NOUN",
    ),
    "ADJ:NUM": "NUM",
    "CC": "CCONJ",
    "CS": "SCONJ",
    "NPR": "PROPN",
    # remaining ADJ tags collapse to ADJ
    **dict.fromkeys(["ADJ", "ADJ:COM", "ADJ:SUP", "ADJ:abl"], "ADJ"),
    "ADV": "ADV",
    "PREP": "ADP",
    "INT": "INTJ",
    "ABBR": "X",
    "EXCL": "INTJ",
    "FW": "X",
    "SENT": "PUNCT",
    "PUN": "PUNCT",
    "SYM": "SYM",
    "CLI": "CCONJ",
    "DET": "DET",
    "ENCL": "CCONJ",
}


# function to map the tags to upos and returns df
def mapping_pos_tags(df, tag_map=None):
    """Add a 'UPOS' column by translating the 'Tags' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'Tags' column. The frame is modified in place.
    tag_map : dict, optional
        Mapping from source tag to UPOS tag. When omitted, the module-level
        ``map_tags`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the additional 'UPOS' column.
        Tags without an entry in the mapping end up as NaN in that column.

    Warns
    -----
    UserWarning
        If at least one non-missing tag has no entry in the mapping, so that
        gaps in the lookup table do not go unnoticed.
    """
    import warnings  # local import keeps this function self-contained

    if tag_map is None:
        # Resolved at call time so later edits to ``map_tags`` are picked up.
        tag_map = map_tags
    df["UPOS"] = df["Tags"].map(tag_map)

    # Series.map yields NaN for keys missing from the dict; report those tags
    # instead of letting them pass silently.
    unmapped = df.loc[df["UPOS"].isna() & df["Tags"].notna(), "Tags"].unique()
    if len(unmapped) > 0:
        warnings.warn(
            "No UPOS mapping for tag(s): " + ", ".join(sorted(map(str, unmapped))),
            stacklevel=2,
        )
    return df


# Run the mapping on the token DataFrame and preview its first ten rows.
mapped_upos_df = mapping_pos_tags(df)
mapped_upos_df.head(n=10)

Unnamed: 0,Token,Tags,Lemma,UPOS
0,Augustinum,N:acc,Augustinus,NOUN
1,vero,ADV,vero|verus,ADV
2,suam,POSS,suus,PRON
3,probasse,V:INF,probo,VERB
4,intelligentiam,N:acc,intelligentia,NOUN
5,ex,PREP,ex,ADP
6,scripturis,N:abl,scriptura,NOUN
7,iuxta,PREP,iuxta|juxta,ADP
8,verbum,N:acc,verbum,NOUN
9,suum,POSS,suus,PRON
