In [4]:
"""
CoNLL-X and CoNLL-U file readers and writers
"""
__author__ = "Pierre Nugues"

import os
import pprint


def get_files(dir, suffix):
    """
    Returns all the files in a folder ending with suffix
    Recursive version: sub-folders are searched as well
    :param dir: path of the folder to search (the name shadows the builtin
        but is kept because it is part of the function's interface)
    :param suffix: ending a file name must have to be selected
    :return: the list of file paths, each one prefixed with dir
    """
    files = []
    for file in os.listdir(dir):
        # os.path.join avoids a doubled separator when dir already ends
        # with one, unlike plain string concatenation.
        path = os.path.join(dir, file)
        if os.path.isdir(path):
            files.extend(get_files(path, suffix))
        elif os.path.isfile(path) and file.endswith(suffix):
            files.append(path)
    return files


def read_sentences(file):
    """
    Creates a list of sentences from the corpus
    Each sentence is a string: the block of lines found between two
    empty lines of the file
    :param file: path of the corpus file, decoded as UTF-8
    :return: the list of sentence strings; an empty list when the file
        contains nothing but whitespace
    """
    # The context manager closes the file even if reading fails; the explicit
    # encoding keeps non-ASCII word forms independent of the system locale.
    with open(file, encoding='utf-8') as f:
        text = f.read().strip()
    if not text:
        # ''.split('\n\n') would yield [''], i.e. one bogus empty sentence.
        return []
    return text.split('\n\n')


def split_rows(sentences, column_names):
    """
    Creates a list of sentences where each sentence is a list of lines
    Each line is a dictionary of columns
    A ROOT row is inserted at the start of every sentence. Rows starting
    with '#' and blank rows are skipped. Fields are split on whitespace and
    paired with column_names in order; surplus names or fields are dropped.
    :param sentences: list of sentence strings, one row per line
    :param column_names: list of column names, in the order of the fields
    :return: the list of sentences, each a list of dictionaries
    """
    new_sentences = []
    root_values = ['0', 'ROOT', 'ROOT', 'ROOT', 'ROOT', 'ROOT', '0', 'ROOT', '0', 'ROOT']
    for sentence in sentences:
        # Build a fresh ROOT dictionary per sentence so that modifying the
        # root of one sentence does not leak into all the others.
        rows = [dict(zip(column_names, root_values))]
        for row in sentence.split('\n'):
            # Blank rows (e.g. from a trailing newline) carry no token;
            # rows starting with '#' are comments.
            if not row.strip() or row.startswith('#'):
                continue
            rows.append(dict(zip(column_names, row.split())))
        new_sentences.append(rows)
    return new_sentences


def save(file, formatted_corpus, column_names):
    """
    Writes a corpus to a file, one tab-separated line per row and
    an empty line after each sentence
    The first row of every sentence is not written (split_rows puts the
    ROOT row there). A column missing from a row is written as '_'.
    :param file: path of the output file, written as UTF-8
    :param formatted_corpus: list of sentences, each a list of dictionaries
    :param column_names: the columns to write, in output order
    :return: None
    """
    # The context manager closes the file even if a write fails.
    with open(file, 'w', encoding='utf-8') as f_out:
        for sentence in formatted_corpus:
            for row in sentence[1:]:
                fields = [row.get(col, '_') for col in column_names]
                f_out.write('\t'.join(fields) + '\n')
            f_out.write('\n')



# Column names of the CoNLL-X (2006) format, in the order of the fields of a row.
column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']

# Corpus paths, relative to the current working directory.
train_file = '../swedish_talbanken05_train.conll.txt'
# Alternative input, kept for reference:
# train_file = 'test_x'
# test_file is defined here but not read by the statements below.
test_file = '../swedish_talbanken05_test.conll.txt'

# Load the training corpus, turn every row into a dictionary keyed by
# column name, then print the path, the sentence count and the first sentence.
sentences = read_sentences(train_file)
formatted_corpus = split_rows(sentences, column_names_2006)
print(train_file, len(formatted_corpus))
pprint.pprint(formatted_corpus[0])

# Disabled variant: same processing with CoNLL-U column names, applied to
# every file under '../train' whose name ends with 'train.conll.txt'.
#column_names_u = ['id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc']

#files = get_files('../train', 'train.conll.txt')
#for train_file in files:
        #sentences = read_sentences(train_file)
        #formatted_corpus = split_rows(sentences, column_names_u)
        #print(train_file, len(formatted_corpus))
        #print(formatted_corpus[0])

../swedish_talbanken05_train.conll.txt 11042
[{'cpostag': 'ROOT',
  'deprel': 'ROOT',
  'feats': 'ROOT',
  'form': 'ROOT',
  'head': '0',
  'id': '0',
  'lemma': 'ROOT',
  'pdeprel': 'ROOT',
  'phead': '0',
  'postag': 'ROOT'},
 {'cpostag': 'NN',
  'deprel': 'SS',
  'feats': '_',
  'form': 'Äktenskapet',
  'head': '4',
  'id': '1',
  'lemma': '_',
  'pdeprel': '_',
  'phead': '_',
  'postag': 'NN'},
 {'cpostag': '++',
  'deprel': '++',
  'feats': '_',
  'form': 'och',
  'head': '3',
  'id': '2',
  'lemma': '_',
  'pdeprel': '_',
  'phead': '_',
  'postag': '++'},
 {'cpostag': 'NN',
  'deprel': 'CC',
  'feats': '_',
  'form': 'familjen',
  'head': '1',
  'id': '3',
  'lemma': '_',
  'pdeprel': '_',
  'phead': '_',
  'postag': 'NN'},
 {'cpostag': 'AV',
  'deprel': 'ROOT',
  'feats': '_',
  'form': 'är',
  'head': '0',
  'id': '4',
  'lemma': '_',
  'pdeprel': '_',
  'phead': '_',
  'postag': 'AV'},
 {'cpostag': 'EN',
  'deprel': 'DT',
  'feats': '_',
  'form': 'en',
  'head': '7',
  'id'

# Extract Subject-Verb Pairs

Total number of pairs: 18885


NameError: name 'a' is not defined