In [1]:
import os
from conllu import parse_tree, parse, parse_incr
from tqdm import tqdm_notebook
import opencorpora
import pyconll
import matplotlib.pyplot as plt
import pickle 
from collections import defaultdict
import string

In [26]:
# Read the pickled GSD -> SynTagRus tag mapping from the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('mapping_thresh5.pickle', 'rb') as mapping_file:
    mapping = pickle.load(mapping_file)

In [27]:
# Put the mapping entries in sorted order; sorted() always returns a list.
mapping = sorted(mapping)

In [28]:
# Display the sorted mapping entries.
mapping

[('ADJ', 'A'),
 ('ADJ', 'ADV'),
 ('ADJ', 'NUM'),
 ('ADJ', 'S'),
 ('ADJ', 'V'),
 ('ADP', 'ADV'),
 ('ADP', 'PR'),
 ('ADP', 'S'),
 ('ADV', 'A'),
 ('ADV', 'ADV'),
 ('ADV', 'CONJ'),
 ('ADV', 'PART'),
 ('ADV', 'PR'),
 ('ADV', 'S'),
 ('AUX', 'V'),
 ('CCONJ', 'ADV'),
 ('CCONJ', 'CONJ'),
 ('CCONJ', 'PART'),
 ('CCONJ', 'S'),
 ('DET', 'A'),
 ('DET', 'PART'),
 ('DET', 'S'),
 ('NOUN', 'A'),
 ('NOUN', 'ADV'),
 ('NOUN', 'COM'),
 ('NOUN', 'NID'),
 ('NOUN', 'PART'),
 ('NOUN', 'PR'),
 ('NOUN', 'S'),
 ('NOUN', 'V'),
 ('NUM', 'A'),
 ('NUM', 'ADV'),
 ('NUM', 'NUM'),
 ('NUM', 'S'),
 ('PART', 'A'),
 ('PART', 'CONJ'),
 ('PART', 'NID'),
 ('PART', 'PART'),
 ('PART', 'S'),
 ('PRON', 'A'),
 ('PRON', 'PART'),
 ('PRON', 'S'),
 ('PROPN', 'A'),
 ('PROPN', 'NID'),
 ('PROPN', 'PART'),
 ('PROPN', 'S'),
 ('PROPN', 'V'),
 ('SCONJ', 'ADV'),
 ('SCONJ', 'CONJ'),
 ('SCONJ', 'PART'),
 ('SCONJ', 'S'),
 ('VERB', 'A'),
 ('VERB', 'S'),
 ('VERB', 'V'),
 ('X', 'NID'),
 ('X', 'NUM'),
 ('X', 'S')]

In [40]:
# Index the mapping by its first element: first tag -> list of second tags
# it is paired with, in the order the pairs appear in `mapping`.
gsd2syntagrus = defaultdict(list)
for gsd_tag, syntagrus_tag in mapping:
    gsd2syntagrus[gsd_tag].append(syntagrus_tag)

In [29]:
# Reverse index of the mapping: second tag -> list of first tags it is
# paired with, in the order the pairs appear in `mapping`.
syntagrus2gsd = defaultdict(list)
for gsd_tag, syntagrus_tag in mapping:
    syntagrus2gsd[syntagrus_tag].append(gsd_tag)

In [27]:
# Display the forward lookup table built from `mapping`.
gsd2syntagrus

defaultdict(list,
            {'ADJ': ['A', 'ADV', 'NUM', 'S', 'V'],
             'ADP': ['ADV', 'PR', 'S'],
             'ADV': ['A', 'ADV', 'CONJ', 'PART', 'PR', 'S'],
             'AUX': ['V'],
             'CCONJ': ['ADV', 'CONJ', 'PART', 'S'],
             'DET': ['A', 'PART', 'S'],
             'NOUN': ['A', 'ADV', 'COM', 'NID', 'PART', 'PR', 'S', 'V'],
             'NUM': ['A', 'ADV', 'NUM', 'S'],
             'PART': ['A', 'CONJ', 'NID', 'PART', 'S'],
             'PRON': ['A', 'PART', 'S'],
             'PROPN': ['A', 'NID', 'PART', 'S', 'V'],
             'SCONJ': ['ADV', 'CONJ', 'PART', 'S'],
             'VERB': ['A', 'S', 'V'],
             'X': ['NID', 'NUM', 'S']})

## Create GSD datasets

In [34]:
# Directory holding the UD Russian-GSD files and the names of its three splits.
path_to_data = '/home/ubuntu/masters/data/UD_Russian-GSD/'
filenames = ['ru_gsd-ud-train.conllu', 'ru_gsd-ud-dev.conllu', 'ru_gsd-ud-test.conllu']
# Full path of each split, in the same train / dev / test order as `filenames`.
path_to_train, path_to_val, path_to_test = (
    os.path.join(path_to_data, split_name) for split_name in filenames
)

In [35]:
# Load each CoNLL-U split with pyconll, in train / val / test order.
train, val, test = (
    pyconll.load_from_file(split_path)
    for split_path in (path_to_train, path_to_val, path_to_test)
)

In [36]:
def create_dataset(conll_data):
    """Convert parsed sentences into lists of ``(form, upos)`` tuples.

    Args:
        conll_data: iterable of sentences; each sentence is iterable over
            tokens that expose ``form`` and ``upos`` attributes.

    Returns:
        A list with one entry per sentence, each entry being the list of
        ``(token.form, token.upos)`` tuples in token order. Iteration over
        ``conll_data`` is wrapped in a notebook progress bar.
    """
    return [
        [(token.form, token.upos) for token in sentence]
        for sentence in tqdm_notebook(conll_data)
    ]

In [37]:
# Turn every loaded split into a list of sentences of (form, upos) tuples.
train_data, val_data, test_data = (
    create_dataset(split) for split in (train, val, test)
)

HBox(children=(IntProgress(value=0, max=3850), HTML(value='')))




HBox(children=(IntProgress(value=0, max=579), HTML(value='')))




HBox(children=(IntProgress(value=0, max=601), HTML(value='')))




In [38]:
def change_to_bundled(data, gsd2syntagrus):
    """Replace each token's single tag with the list of tag pairs it maps to.

    The dataset is modified in place and also returned.

    Args:
        data: list of sentences; each sentence is a list of two-element
            ``(word, pos)`` items (tuples or lists).
        gsd2syntagrus: mapping from a source tag to the list of tags it is
            paired with. Despite the name, this notebook also passes the
            reverse table (``syntagrus2gsd``); every pair is built as
            ``[source_tag, mapped_tag]``, so the order inside a pair follows
            the direction of the table that was passed in.

    Returns:
        The same ``data`` object, where every token is now
        ``[word, [[pos, mapped_pos], ...]]``. A tag with no entry in the
        mapping yields an empty list of pairs.
    """
    for sentence in data:
        for i, token in enumerate(sentence):
            word, pos = token[0], token[1]
            # .get() rather than indexing: indexing a defaultdict with an
            # unknown tag would insert an empty entry into the caller's
            # mapping as a side effect.
            mapped_tags = gsd2syntagrus.get(pos, [])
            sentence[i] = [word, [[pos, mapped_pos] for mapped_pos in mapped_tags]]
    return data

In [41]:
# Swap each GSD tag for its bundled pairs. The helper mutates its input, so
# this cell must not be run twice on the same data.
train_data, val_data, test_data = (
    change_to_bundled(split, gsd2syntagrus)
    for split in (train_data, val_data, test_data)
)

In [54]:
def deal_w_punct(data):
    """Overwrite the tag entry of punctuation tokens with a PUNCT marker.

    A token counts as punctuation when its form is non-empty and every
    character belongs to ``string.punctuation``. Checking character by
    character avoids the pitfalls of a substring test against the
    punctuation constant: the empty string is a substring of any string, and
    repeated marks such as ``...`` are not a substring of it.

    ``string.punctuation`` only covers ASCII, so non-ASCII punctuation marks
    are left untouched.

    The dataset is modified in place and also returned.

    Args:
        data: list of sentences; each sentence is a list of mutable
            ``[word, tags]`` tokens.

    Returns:
        The same ``data`` object, with ``tags`` replaced by
        ``['PUNCT', 'PUNCT']`` for every punctuation token.
    """
    punct_chars = set(string.punctuation)
    for sentence in data:
        for word in sentence:
            form = word[0]
            if form and all(char in punct_chars for char in form):
                # NOTE(review): this is a flat pair, whereas change_to_bundled
                # stores a list of pairs; kept as is because downstream
                # consumers may expect this shape.
                word[1] = ['PUNCT', 'PUNCT']
    return data

In [55]:
# Relabel punctuation tokens in every GSD split.
train_data, val_data, test_data = (
    deal_w_punct(split) for split in (train_data, val_data, test_data)
)

In [57]:
def save_data(data, path_to_save):
    """Pickle ``data`` into the file at ``path_to_save``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.

    Args:
        data: any picklable object.
        path_to_save: destination file path.
    """
    with open(path_to_save, 'wb') as pickle_file:
        pickle.dump(data, pickle_file)

In [58]:
# Pickle the three bundled GSD splits into the GSD data directory.
pickle_names = ['gsd_train_bundled.pickle', 'gsd_val_bundled.pickle', 'gsd_test_bundled.pickle']
save_paths = [os.path.join(path_to_data, filename) for filename in pickle_names]
# The loop variables are deliberately not called `data` / `path`: `data` is
# also the notebook-level name of the parsed SynTagRus corpus, and rebinding
# it here would silently change what the split cell operates on when cells
# are run out of order.
for split_data, save_path in zip([train_data, val_data, test_data], save_paths):
    save_data(split_data, save_path)

## Create SynTagRus datasets

In [2]:
path_to_corpus = '/home/ubuntu/masters/data/syntagrus/SynTagRus2018/'
# Collect every file below the corpus root. os.walk yields directory entries
# in arbitrary, filesystem-dependent order, while the train/val/test split
# later in this notebook is positional; sorting the paths makes the sentence
# order, and therefore the split, reproducible across machines and runs.
paths_to_files_syntagrus = sorted(
    os.path.join(dir_path, name)
    for dir_path, _subdirs, files in os.walk(path_to_corpus)
    for name in files
)

In [11]:
# Parse every corpus file into `data`: a list of sentences, each a list of
# [word, tag] items, with trailing punctuation added as [mark, 'PUNCT'].
data = []
for file in tqdm_notebook(paths_to_files_syntagrus):
    # NOTE(review): no explicit encoding is given, so decoding relies on the
    # platform default; confirm the encoding of the corpus files.
    with open(file, 'r') as f:
        file_lines = f.readlines()

    for line in file_lines:
        if line.startswith('<S'):
            # Sentence opening tag: start a new, empty sentence.
            data.append([])
        elif line.startswith('<W'):
            # The word form is the text between the first '>' and the next '<'.
            word = line.split('>')[1].split('<')[0]
            # The tag is taken from the third space-separated field of the
            # line, keeping what follows 'FEAT="'.
            pos = line.split(' ')[2].split('FEAT="')[-1]

            # Strip a trailing double quote that the split can leave on the tag.
            if pos.endswith("\""):
                pos = pos[:-1]

            data[-1].append([word, pos])

            # Text between the closing </W> and the end of the line. When it
            # is non-empty and a substring of string.punctuation, it becomes
            # a separate PUNCT token right after the word.
            trailing = line.split('</W>')[-1].split('\n')[0].strip()
            if len(trailing) != 0 and trailing in string.punctuation:
                punct = trailing
                data[-1].append([punct, 'PUNCT'])

HBox(children=(IntProgress(value=0, max=692), HTML(value='')))




In [24]:
# Number of sentences collected by the parsing loop (one list per '<S' line).
len(data)

73909

In [25]:
# Positional split of the parsed sentences: the first 50000 for training, the
# next 10000 for validation, and everything from index 60000 on for testing.
train_data_syntagrus, val_data_syntagrus, test_data_syntagrus = (
    data[:50000],
    data[50000:60000],
    data[60000:],
)

In [32]:
# Swap each SynTagRus tag for its bundled pairs using the reverse lookup
# table. The helper mutates its input, so this cell must not be run twice on
# the same data.
train_data_syntagrus, val_data_syntagrus, test_data_syntagrus = (
    change_to_bundled(split, syntagrus2gsd)
    for split in (train_data_syntagrus, val_data_syntagrus, test_data_syntagrus)
)

In [59]:
# Relabel punctuation tokens in every SynTagRus split.
train_data_syntagrus, val_data_syntagrus, test_data_syntagrus = (
    deal_w_punct(split)
    for split in (train_data_syntagrus, val_data_syntagrus, test_data_syntagrus)
)

In [61]:
# Directory the bundled SynTagRus pickles are written to.
path_to_data_syntagrus = '/home/ubuntu/masters/data/syntagrus/'

In [63]:
# Pickle the three bundled SynTagRus splits into the SynTagRus data directory.
pickle_names = ['syntagrus_train_bundled.pickle', 'syntagrus_val_bundled.pickle', 'syntagrus_test_bundled.pickle']
save_paths = [os.path.join(path_to_data_syntagrus, filename) for filename in pickle_names]
# The loop variables are deliberately not called `data` / `path`: `data` holds
# the full parsed corpus, and rebinding it here would leave it pointing at the
# test split, so re-running the split cell would silently slice the wrong list.
for split_data, save_path in zip([train_data_syntagrus, val_data_syntagrus, test_data_syntagrus], save_paths):
    save_data(split_data, save_path)