# Assignment #4: Extraction of subject-verb-object triples
## CoNLL-X reader

In [1]:
import os


def read_sentences(file):
    """
    Creates a list of sentences from the corpus
    Each sentence is a string
    :param file:
    :return:
    """
    f = open(file).read().strip()
    sentences = f.split('\n\n')
    return sentences


def split_rows(sentences, column_names):
    """
    Creates a list of sentence where each sentence is a list of lines
    Each line is a dictionary of columns
    :param sentences:
    :param column_names:
    :return:
    """
    new_sentences = []
    root_values = ['0', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', '0', 'ROOT', '0', 'ROOT']
    start = [dict(zip(column_names, root_values))]
    for sentence in sentences:
        rows = sentence.split('\n')
        sentence = [dict(zip(column_names, row.split())) for row in rows if row[0] != '#']
        sentence = start + sentence
        new_sentences.append(sentence)
    return new_sentences


if __name__ == '__main__':
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']

    train_file = './swedish_talbanken05_train.conll'
    # train_file = 'test_x'
    test_file = './swedish_talbanken05_test.conll'

    sentences = read_sentences(train_file)
    formatted_corpus = split_rows(sentences, column_names_2006)


## Subject-verb extractor from corpus

In [2]:
def extract_subject_verb_from(formatted_corpus):
    pairs = dict()
    for sentence in formatted_corpus:
        for word in sentence:
            if word['deprel'] == "SS":
                pair = (word['form'].lower(), sentence[int(word['head'])]['form'].lower())
                if pair in pairs.keys():
                    pairs[pair] += 1
                else:
                    pairs[pair] = 1
    return pairs

def get_number_subject_verb(pairs):
    counter = 0
    for pair in pairs:
        counter += pairs[pair]
    return str(counter)

if __name__ == '__main__':
    pairs = extract_subject_verb_from(formatted_corpus)
    
    print("Total number of pairs: " + get_number_subject_verb(pairs))
    pairs_sorted_values = sorted(pairs.values(), reverse=True)
    pairs_sorted_keys = sorted(pairs, key=pairs.get, reverse=True)
    for i in range(5):
        print('(' + pairs_sorted_keys[i][0] + ',' + pairs_sorted_keys[i][1] + ')' + ' --> ' + str(pairs_sorted_values[i]))


Total number of pairs: 18885
(det,är) --> 537
(man,kan) --> 261
(som,är) --> 211
(jag,tror) --> 171
(äktenskapet,är) --> 161


## Subject-verb-object extractor from corpus

In [3]:
def extract_subject_verb_object_from(formatted_corpus):
    triples = dict()
    for sentence in formatted_corpus:
        for obj_word in sentence:
            if obj_word['deprel'] == "OO":
                for subj_word in sentence:
                    if subj_word['head'] == obj_word['head'] and subj_word['deprel'] == "SS": 
                        triple = (subj_word['form'].lower(), sentence[int(subj_word['head'])]['form'].lower(), obj_word['form'].lower())
                        if triple in triples.keys():
                            triples[triple] += 1
                        else:
                            triples[triple] = 1
    return triples

def get_number_subject_verb_object(triples):
    counter = 0
    for triple in triples:
        counter += triples[triple]
    return str(counter)

if __name__ == '__main__':
    triples = extract_subject_verb_object_from(formatted_corpus)
    
    print("Total number of triples: " + get_number_subject_verb_object(triples))
    triples_sorted_values = sorted(triples.values(), reverse=True)
    triples_sorted_keys = sorted(triples, key=triples.get, reverse=True)
    for i in range(5):
        print('(' + triples_sorted_keys[i][0] + ',' + triples_sorted_keys[i][1] + ',' + triples_sorted_keys[i][2] + ')' + ' --> ' + str(triples_sorted_values[i]))


Total number of triples: 5844
(man,gifter,sig) --> 37
(jag,tror,är) --> 36
(jag,tycker,är) --> 36
(man,vänder,sig) --> 19
(som,ingått,äktenskap) --> 17


In [4]:
def get_files(dir, suffix):
    """
    Returns all the files in a folder ending with suffix
    Recursive version
    :param dir:
    :param suffix:
    :return: the list of file names
    """
    files = []
    for file in os.listdir(dir):
        path = dir + '/' + file
        if os.path.isdir(path):
            files += get_files(path, suffix)
        elif os.path.isfile(path) and file.endswith(suffix):
            files.append(path)
    return files

'''def split_rows(sentences, column_names):
    """
    Creates a list of sentence where each sentence is a list of lines
    Each line is a dictionary of columns
    :param sentences:
    :param column_names:
    :return:
    """
    new_sentences = []
    root_values = ['0', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', '0', 'ROOT', '0', 'ROOT']
    start = dict()
    start['0'] = dict(zip(column_names, root_values)
    for sentence in sentences:
        rows = sentence.split('\n')
        sentence = [dict(zip(column_names, row.split())) for row in rows if row[0] != '#']
        sentence = start + sentence
        new_sentences.append(sentence)
    return new_sentences'''

if __name__ == '__main__':
    
    column_names_u = ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc']

    files = get_files('./', 'train.conllu')
    files_corpus = dict()
    for train_file in files:
        sentences = read_sentences(train_file)
        formatted_corpus = split_rows(sentences, column_names_u)
        dict_formatted_corpus = list()
        for formatted_sentence in formatted_corpus:
            ids = list()
            for formatted_word in formatted_sentence:
                ids.append(formatted_word['id'])
            dict_formatted_corpus.append(dict(zip(ids,formatted_sentence)))
        files_corpus[train_file] = dict_formatted_corpus
        print(train_file)
        


.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Ancient_Greek/grc-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Arabic/ar-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Basque/eu-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Bulgarian/bg-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Catalan/ca-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Chinese/zh-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Croatian/hr-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech/cs-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech-CAC/cs_cac-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech-CLTT/cs_cltt-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-con

In [5]:
def extract_subject_verb_from(files_corpus):
    pairs = dict()
    for file in files_corpus.keys():
        print(file)
        pairs[file] = dict()
        for sentence in files_corpus[file]:
            for word in sentence.keys():
                if sentence[word]['deprel'] == "nsubj":
                    pair = (sentence[word]['form'].lower(), sentence[sentence[word]['head']]['form'].lower())
                    if pair in pairs[file]:
                        pairs[file][pair] += 1
                    else:
                        pairs[file][pair] = 1
    return pairs

def get_number_subject_verb(pairs):
    counter = dict()
    for file in pairs.keys():
        file_counter = 0
        for pair in pairs[file]:
            file_counter += pairs[file][pair]
        counter[file] = file_counter
    return counter

if __name__ == '__main__':
    pairs = extract_subject_verb_from(files_corpus)
    pairs_counter = get_number_subject_verb(pairs)
    for file in pairs.keys():
        print("[File] " + file + ": \n[Total] " + str(pairs_counter[file]))
        pairs_sorted_values = sorted(pairs[file].values(), reverse=True)
        pairs_sorted_keys = sorted(pairs[file], key=pairs[file].get, reverse=True)
        for i in range(5):
            print('(' + pairs_sorted_keys[i][0] + ',' + pairs_sorted_keys[i][1] + ')' + ' --> ' + str(pairs_sorted_values[i]))

.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Ancient_Greek/grc-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Arabic/ar-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Basque/eu-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Bulgarian/bg-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Catalan/ca-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Chinese/zh-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Croatian/hr-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech/cs-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech-CAC/cs_cac-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech-CLTT/cs_cltt-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-con

(che,divenne) --> 3
(balzac,ebbe) --> 3
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Japanese/ja-ud-train.conllu: 
[Total] 6535
(可能性,ある) --> 8
(必要,ある) --> 7
(注意,必要) --> 6
(もの,多い) --> 6
(もの,ある) --> 6
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Kazakh/kk-ud-train.conllu: 
[Total] 49
(шолпан,айтушы) --> 1
(қызығы,жоқ) --> 1
(жаны,сүймеген) --> 1
(өзінің,сүйгеніне) --> 1
(шолпан,қылмады) --> 1
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Korean/ko-ud-train.conllu: 
[Total] 5404
(수,있는) --> 60
(수,있다) --> 42
(거,같아요) --> 14
(수,있도록) --> 14
(수,있을) --> 11
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Latin/la-ud-train.conllu: 
[Total] 1469
(qui,habet) --> 5
(spiritus,dicat) --> 5
(senatus,decrevit) --> 3
(qui,vicerit) --> 3
(ego,vereor) --> 3
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Latin-ITTB/la_ittb-ud-train.conllu: 
[Total] 17436
(quod,est) --> 278
(quae,sunt) --> 235
(quae,est) --> 176
(deus,cognosci

In [6]:
def extract_subject_verb_object_from(files_corpus):
    triples = dict()
    for file in files_corpus.keys():
        print(file)
        triples[file] = dict()
        for sentence in files_corpus[file]:
            for obj_word in sentence.keys():
                if sentence[obj_word]['deprel'] == "obj":
                    for subj_word in sentence.keys():
                        if sentence[subj_word]['head'] == sentence[obj_word]['head'] and sentence[subj_word]['deprel'] == "nsubj": 
                            triple = (sentence[subj_word]['form'].lower(), sentence[sentence[subj_word]['head']]['form'].lower(), sentence[obj_word]['form'].lower())
                            if triple in triples[file]:
                                triples[file][triple] += 1
                            else:
                                triples[file][triple] = 1
    return triples

def get_number_subject_verb_object(triples):
    counter = dict()
    for file in triples.keys():
        file_counter = 0
        for triple in triples[file]:
            file_counter += triples[file][triple]
        counter[file] = file_counter
    return counter

if __name__ == '__main__':
    triples = extract_subject_verb_object_from(files_corpus)
    triples_counter = get_number_subject_verb(pairs)
    for file in triples.keys():
        print("[File] " + file + ": \n[Total] " + str(triples_counter[file]))
        triples_sorted_values = sorted(triples[file].values(), reverse=True)
        triples_sorted_keys = sorted(triples[file], key=triples[file].get, reverse=True)
        for i in range(5):
            try:
                print('(' + triples_sorted_keys[i][0] + ',' + triples_sorted_keys[i][1] + ',' + triples_sorted_keys[i][2] + ')' + ' --> ' + str(triples_sorted_values[i]))
            except:
                break

.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Ancient_Greek/grc-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Ancient_Greek-PROIEL/grc_proiel-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Arabic/ar-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Basque/eu-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Bulgarian/bg-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Catalan/ca-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Chinese/zh-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Croatian/hr-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech/cs-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech-CAC/cs_cac-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Czech-CLTT/cs_cltt-ud-train.conllu
.//Universal Dependencies 2.0/ud-treebanks-con

(ノッチ,用いる,電力) --> 3
(アギス,就任,リュサンドロス) --> 2
(太平洋側,呼びかけ,警戒) --> 2
(霊芝,発揮,効果) --> 2
(員,研究,力) --> 2
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Kazakh/kk-ud-train.conllu: 
[Total] 49
(шолпан,қылмады,бала) --> 1
(шолпан,қылсын,сөзін) --> 1
(шолпан,істемеді,не) --> 1
(жасаған,бермеді,бала) --> 1
(тәңірінің,бермеуі,бала) --> 1
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Korean/ko-ud-train.conllu: 
[Total] 5404
(잡스는,청하고,도움을) --> 1
(워즈니악은,설계해내지만,게임을) --> 1
(잡스는,주었다,350달러만을) --> 1
(김새론은,입었고,쉬폰) --> 1
(이들은,보여주며,정석을) --> 1
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Latin/la-ud-train.conllu: 
[Total] 1469
(qui,habet,aurem) --> 5
(spiritus,dicat,quid) --> 5
(res,caperet,quid) --> 2
(pars,debetur,modestiae) --> 1
(mulier,tenebat,quendam) --> 1
[File] .//Universal Dependencies 2.0/ud-treebanks-conll2017/UD_Latin-ITTB/la_ittb-ud-train.conllu: 
[Total] 17436
(agens,agit,simile) --> 12
(res,habet,esse) --> 11
(forma,habet,esse) --> 9
(omnia,appetun