In [1]:
import networkx as nx
from networkx.readwrite import json_graph
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from nltk.tokenize import wordpunct_tokenize
from tqdm import tqdm
import os
import json
import itertools
import torch
import pandas as pd

In [2]:
def choose_tag(fragment_noun_tags):
    if 'subject' in fragment_noun_tags:
        return 'subject'
    elif 'object' in fragment_noun_tags:
        return 'object'
    else:
        return 'other_dep'

def quest_path2entity_graph(path, morphograph, morphology_key='node_morphodata', text_key='fragment_text'):
    G = nx.Graph()
    
    noun_lemmas_counter = Counter()
    nodes = morphograph.nodes()
    
    node2chosen_syntactic_tags = {}
    nouns2nodes = defaultdict(set)
    
    for node in path:
        current_node2all_syntactic_tags = defaultdict(set)
        for noun_data in nodes[node][morphology_key]['nouns']:
            noun_lemma = noun_data['lemma']
            noun_lemmas_counter.update([noun_lemma])
            nouns2nodes['noun_'+noun_lemma].add(node)
            dep_type = 'other_dep'
            dependency = noun_data['deprel']
            if 'subj' in dependency:
                dep_type = 'subject'
            elif 'obj' in dependency:
                dep_type = 'object'
            current_node2all_syntactic_tags[noun_lemma].add(dep_type)
        
        node2chosen_syntactic_tags[node] = {noun:choose_tag(current_node2all_syntactic_tags[noun]) for noun in current_node2all_syntactic_tags}
        
    relevant_nouns = [noun for noun in noun_lemmas_counter if noun_lemmas_counter[noun]>1]
    G.add_node('global', fragment_text='UNK')
    for relevant_noun in relevant_nouns:
        G.add_node('noun_'+relevant_noun, fragment_text=relevant_noun)
    for node in path:
        G.add_node(node, fragment_text=nodes[node][text_key])
        G.add_edge(node, 'global', label='global')
        for noun, syntactic_role in node2chosen_syntactic_tags[node].items():
            if noun in relevant_nouns:
                G.add_edge(node, 'noun_'+noun, label=syntactic_role)
    for noun in nouns2nodes:
        entity_neighbours = itertools.combinations(nouns2nodes[noun], 2)
        for pair in entity_neighbours:
            G.add_edge(*pair, label='fragments_pair')
    return G, [['noun_'+relevant_noun for relevant_noun in relevant_nouns]]

In [None]:
no_text_nodes_counter = 0

ONLINE_GRAPHS_DIRECTORY = 'D:\Диплом_текстовые_квесты\Data\Questbook_online_grammar'
JOINED_TEXT_ONLINE_PATHS_DIRECTORY = 'D:\\Диплом_текстовые_квесты\\Data\\questbook_online_joined_texts'

for file in os.listdir(ONLINE_GRAPHS_DIRECTORY):
    morphograph_path = os.path.join(ONLINE_GRAPHS_DIRECTORY, file)
    joined_graph_path = os.path.join(JOINED_TEXT_ONLINE_PATHS_DIRECTORY, file)
    morpho_graph = json_graph.node_link_graph(json.load(open(morphograph_path, encoding='utf-8')))
    joined_text_graph = json_graph.node_link_graph(json.load(open(joined_graph_path, encoding='utf-8')))
    
    joined_text_nodes = joined_text_graph.nodes()

    for node_idx in joined_text_nodes:
        if 'fragment_text' in joined_text_nodes[node_idx]:
            joined_text = joined_text_nodes[node_idx]['fragment_text'] 
            attrs[node_idx] = {'joined_text':joined_text}
        else:
            no_text_nodes_counter += 1
    nx.set_node_attributes(morpho_graph, attrs)
    new_morphograph_json = json_graph.node_link_data(morpho_graph)
    with open(morphograph_path, 'w', encoding='utf-8') as f:
        json.dump(new_morphograph_json, f)
    

In [27]:
no_text_nodes_counter 

2

In [29]:
morpho_graph = json_graph.node_link_graph(json.load(open(morphograph_path, encoding='utf-8')))

In [None]:
morpho_graph.nodes()

In [18]:
BOOK_GRAPHS_DIRECTORY = 'D:\Диплом_текстовые_квесты\Data\quest_books_graphs_morphology'
ONLINE_GRAPHS_DIRECTORY = 'D:\Диплом_текстовые_квесты\Data\Questbook_online_grammar'

dev_online_graphs, test_online_graphs = train_test_split(os.listdir(ONLINE_GRAPHS_DIRECTORY), random_state=42, test_size=0.4)

all_dev_graphs = [os.path.join(BOOK_GRAPHS_DIRECTORY, book_graph) for book_graph in os.listdir(BOOK_GRAPHS_DIRECTORY)] + [os.path.join(ONLINE_GRAPHS_DIRECTORY, online_graph) for online_graph in dev_online_graphs]
book_graphs = [os.path.join(BOOK_GRAPHS_DIRECTORY, book_graph) for book_graph in os.listdir(BOOK_GRAPHS_DIRECTORY)]
online_graphs = [os.path.join(ONLINE_GRAPHS_DIRECTORY, online_graph) for online_graph in os.listdir(ONLINE_GRAPHS_DIRECTORY)]
test_graphs_paths = [os.path.join(ONLINE_GRAPHS_DIRECTORY, test_graph) for test_graph in test_online_graphs]

train_paths, valid_paths = train_test_split(all_dev_graphs, random_state=42, test_size=0.1)

In [5]:
test = json.load(open('D:\\Диплом_текстовые_квесты\\Data\\Questbook_online_grammar\\admin.json'))

In [4]:
def correct_path(path, graph, morphodata_field, text_field='fragment_text'):
    nodes = graph.nodes()
    correct_path = [fragment_id for fragment_id in path if  morphodata_field in nodes[fragment_id] and text_field in nodes[fragment_id] and isinstance(nodes[fragment_id][text_field], str)]
    return correct_path

In [6]:
BOOK_PATHS_DIRECTORY = os.path.join('D:\\', 'Диплом_текстовые_квесты', 'Data', 'book_paths')
ONLINE_PATHS_DIRECTORY = os.path.join('D:\\', 'Диплом_текстовые_квесты', 'random_paths')

In [37]:
test = json.load(open(os.path.join('D:\\Диплом_текстовые_квесты\\Data\\questbook_online_joined_texts', 'admin.json'), encoding='utf-8'))

In [18]:
test['nodes'][1]['fragment_text']

'К вам подходит бородатый мужчина в рабочем комбинезоне. Из его невнятной речи вы понимаете, что ваш корабль готов к взлёту, но в нём осталось свободное место и вы можете загрузить дополнительный груз. Загрузите больше металла Загрузите больше минералов Загрузите больше топлива'

In [15]:
quest_graph.nodes()[1].keys()

dict_keys(['fragment_text', 'node_morphodata'])

In [35]:
for i, train_graph_path in enumerate(train_paths):
    quest_graph = json_graph.node_link_graph(json.load(open(train_graph_path, encoding='utf-8')))
    try:
        if 'quest_books' in train_graph_path:
            random_walk_paths_path = os.path.join(BOOK_PATHS_DIRECTORY, os.path.basename(train_graph_path))
            morphology_key = 'node_morphodata'
            text_key='fragment_text'
        else:
            random_walk_paths_path = os.path.join(ONLINE_PATHS_DIRECTORY, os.path.basename(train_graph_path))
            morphology_key = 'joined_morphodata'
            text_key='joined_text'
        random_paths = json.load(open(random_walk_paths_path, encoding='utf-8'))
        for path in random_paths:
            corrected_path = correct_path(path, quest_graph, morphodata_field=morphology_key, text_field=text_key)
            if len(corrected_path) > 3:
                entity_graph, noun_nodes = quest_path2entity_graph(corrected_path, quest_graph, morphology_key=morphology_key, text_key=text_key)
        jsonified_data = [train_graph_path, corrected_path, json_graph.node_link_data(entity_graph), noun_nodes]
        with open(os.path.join(TOKENIZED_TRAIN_DIR, str(i)+'.json'), 'w', encoding='utf-8') as f:
            json.dump(jsonified_data, f)
    except OSError:
        print(train_graph_path)

In [61]:
for i, test_graph_path in enumerate(test_graphs_paths):
    try:
        quest_graph = json_graph.node_link_graph(json.load(open(test_graph_path, encoding='utf-8')))
        if 'quest_books' in test_graph_path:
            random_walk_paths_path = os.path.join(BOOK_PATHS_DIRECTORY, os.path.basename(test_graph_path))
            morphology_key = 'node_morphodata'
            text_key='fragment_text'
        else:
            random_walk_paths_path = os.path.join(ONLINE_PATHS_DIRECTORY, os.path.basename(test_graph_path))
            morphology_key = 'joined_morphodata'
            text_key='joined_text'
        random_paths = json.load(open(random_walk_paths_path, encoding='utf-8'))
        for path in random_paths:
            corrected_path = correct_path(path, quest_graph, morphodata_field=morphology_key)
            if len(corrected_path) > 3:
                entity_graph, noun_nodes = quest_path2entity_graph(corrected_path, quest_graph, morphology_key=morphology_key, text_key=text_key)
                jsonified_data = [test_graph_path, corrected_path, json_graph.node_link_data(entity_graph), noun_nodes]
        with open(os.path.join(TOKENIZED_TEST_DIR, str(i)+'.json'), 'w', encoding='utf-8') as f:
            json.dump(jsonified_data, f)
    except Exception as e:
        print(e)
        print(test_graph_path)

Expecting ',' delimiter: line 1 column 842655 (char 842654)
D:\Диплом_текстовые_квесты\Data\Questbook_online_grammar\game9081.json


In [34]:
TOKENIZED_TRAIN_DIR = 'D:\\Диплом_текстовые_квесты\\Data\\tokenized_ordering_train'

In [27]:
TOKENIZED_TEST_DIR = 'D:\\Диплом_текстовые_квесты\\Data\\tokenized_ordering_test_joined'

In [15]:
paths_statistics = pd.DataFrame(columns=['num_nouns', 'num_fragments', 'num_edges_between_fragments', 'quest_type'])

In [16]:
for book_path in tqdm(book_graphs):
    try:
        quest_graph = json_graph.node_link_graph(json.load(open(book_path, encoding='utf-8')))
        random_walk_paths_path = os.path.join(BOOK_PATHS_DIRECTORY, os.path.basename(book_path))
        random_paths = json.load(open(random_walk_paths_path, encoding='utf-8'))
        for path in random_paths:
            corrected_path = correct_path(path, quest_graph, morphodata_field='node_morphodata')
            if len(corrected_path) > 3:
                entity_graph, noun_nodes = quest_path2entity_graph(corrected_path, quest_graph, morphology_key='node_morphodata', text_key='fragment_text')
                
                num_nouns = len(noun_nodes[0])
                num_fragments = len(corrected_path)
                edges = entity_graph.edges()
                num_fragment_edges = len([edge for edge in edges if edges[edge]['label']=='fragments_pair'])
                
                paths_statistics.loc[len(paths_statistics)] = [num_nouns, num_fragments, num_fragment_edges, 'book']
            
            
    except Exception as e:
        print(e)
        print(book_path)
        break
    

100%|████████████████████████████████████████████████████████████████████████████████| 168/168 [01:41<00:00,  1.66it/s]


In [19]:
for online_path in tqdm(online_graphs):
    try:
        quest_graph = json_graph.node_link_graph(json.load(open(online_path , encoding='utf-8')))
        random_walk_paths_path = os.path.join(ONLINE_PATHS_DIRECTORY, os.path.basename(online_path))
        random_paths = json.load(open(random_walk_paths_path, encoding='utf-8'))
        for path in random_paths:
            corrected_path = correct_path(path, quest_graph, morphodata_field='joined_morphodata')
            if len(corrected_path) > 3:
                entity_graph, noun_nodes = quest_path2entity_graph(corrected_path, quest_graph, morphology_key='joined_morphodata', text_key='joined_text')
                
                num_nouns = len(noun_nodes[0])
                num_fragments = len(corrected_path)
                edges = entity_graph.edges()
                num_fragment_edges = len([edge for edge in edges if edges[edge]['label']=='fragments_pair'])
                
                paths_statistics.loc[len(paths_statistics)] = [num_nouns, num_fragments, num_fragment_edges, 'online']
            
            
    except Exception as e:
        print(e)
        print(book_path)
    

 73%|██████████████████████████████████████████████████████████▏                     | 131/180 [01:26<00:29,  1.66it/s]

Expecting ',' delimiter: line 1 column 842655 (char 842654)
D:\Диплом_текстовые_квесты\Data\quest_books_graphs_morphology\Элгар Флетч.json


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [01:53<00:00,  1.58it/s]


In [41]:
def proportion_of_real_edges(num_fragments, num_edges):
    num_combinations = 0.5*num_fragments * (num_fragments-1)
    return num_edges/num_combinations

Unnamed: 0,num_nouns,num_fragments,num_edges_between_fragments,quest_type
0,27,4,6,book
1,32,6,10,book
2,14,4,3,book
3,19,5,6,book
4,14,4,2,book
...,...,...,...,...
18248,236,37,528,online
18249,82,11,40,online
18250,201,31,344,online
18251,111,13,62,online


In [49]:
paths_statistics['num_nouns'] = pd.to_numeric(paths_statistics['num_nouns'])
paths_statistics['num_fragments'] = pd.to_numeric(paths_statistics['num_fragments'])
paths_statistics['num_edges_between_fragments'] = pd.to_numeric(paths_statistics['num_edges_between_fragments'])

In [None]:
paths_statistics['proportion_of_real_edges'] = paths_statistics.apply(lambda row: proportion_of_real_edges(row['num_fragments'], row['num_edges_between_fragments']), axis=1) 

In [53]:
paths_statistics

Unnamed: 0,num_nouns,num_fragments,num_edges_between_fragments,quest_type,proportion_of_real_edges
0,27,4,6,book,1.000000
1,32,6,10,book,0.666667
2,14,4,3,book,0.500000
3,19,5,6,book,0.600000
4,14,4,2,book,0.333333
...,...,...,...,...,...
18248,236,37,528,online,0.792793
18249,82,11,40,online,0.727273
18250,201,31,344,online,0.739785
18251,111,13,62,online,0.794872


In [51]:
paths_statistics.groupby('quest_type')['num_nouns'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
quest_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
book,9727.0,64.079058,74.817769,0.0,20.0,38.0,74.0,693.0
online,8526.0,104.424701,117.145277,0.0,37.0,70.0,132.0,828.0


In [55]:
paths_statistics.groupby('quest_type')['proportion_of_real_edges'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
quest_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
book,9727.0,0.69264,0.225236,0.0,0.51676,0.7,0.892857,1.0
online,8526.0,0.733865,0.206748,0.0,0.604987,0.777778,0.896104,1.0


In [57]:
full_quests_statistics = pd.DataFrame(columns=['num_nouns', 'num_fragments', 'num_edges_between_fragments', 'quest_type'])

In [59]:
for book_path in tqdm(book_graphs):
    try:
        quest_graph = json_graph.node_link_graph(json.load(open(book_path, encoding='utf-8')))
        pseudo_path = [node for node in quest_graph.nodes()]
        
        corrected_pseudo_path = correct_path(pseudo_path, quest_graph, morphodata_field='node_morphodata')
        if len(corrected_pseudo_path) > 3:
            entity_graph, noun_nodes = quest_path2entity_graph(corrected_pseudo_path, quest_graph, morphology_key='node_morphodata', text_key='fragment_text')
                
            num_nouns = len(noun_nodes[0])
            num_fragments = len(corrected_pseudo_path)
            edges = entity_graph.edges()
            num_fragment_edges = len([edge for edge in edges if edges[edge]['label']=='fragments_pair'])
                
            full_quests_statistics.loc[len(full_quests_statistics)] = [num_nouns, num_fragments, num_fragment_edges, 'book']
            
            
    except Exception as e:
        print(e)
        print(book_path)


100%|████████████████████████████████████████████████████████████████████████████████| 168/168 [01:05<00:00,  2.57it/s]


In [61]:
for online_path in tqdm(online_graphs):
    try:
        quest_graph = json_graph.node_link_graph(json.load(open(online_path, encoding='utf-8')))
        pseudo_path = [node for node in quest_graph.nodes()]
        
        corrected_pseudo_path = correct_path(pseudo_path, quest_graph, morphodata_field='joined_morphodata')
        if len(corrected_pseudo_path) > 3:
            entity_graph, noun_nodes = quest_path2entity_graph(corrected_pseudo_path, quest_graph, morphology_key='joined_morphodata', text_key='joined_text')
                
            num_nouns = len(noun_nodes[0])
            num_fragments = len(corrected_pseudo_path)
            edges = entity_graph.edges()
            num_fragment_edges = len([edge for edge in edges if edges[edge]['label']=='fragments_pair'])
                
            full_quests_statistics.loc[len(full_quests_statistics)] = [num_nouns, num_fragments, num_fragment_edges, 'online']
            
            
    except Exception as e:
        print(e)
        print(online_path)


 74%|███████████████████████████████████████████████████████████▌                    | 134/180 [00:19<00:03, 12.21it/s]

Expecting ',' delimiter: line 1 column 842655 (char 842654)
D:\Диплом_текстовые_квесты\Data\Questbook_online_grammar\game9081.json


100%|████████████████████████████████████████████████████████████████████████████████| 180/180 [00:24<00:00,  7.27it/s]


In [63]:
full_quests_statistics['num_nouns'] = pd.to_numeric(full_quests_statistics['num_nouns'])
full_quests_statistics['num_fragments'] = pd.to_numeric(full_quests_statistics['num_fragments'])
full_quests_statistics['num_edges_between_fragments'] = pd.to_numeric(full_quests_statistics['num_edges_between_fragments'])

In [64]:
full_quests_statistics['proportion_of_real_edges'] = full_quests_statistics.apply(lambda row: proportion_of_real_edges(row['num_fragments'], row['num_edges_between_fragments']), axis=1) 

In [65]:
full_quests_statistics.groupby('quest_type')['num_nouns'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
quest_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
book,167.0,718.401198,525.994927,34.0,343.0,624.0,930.0,2390.0
online,178.0,264.005618,259.891383,0.0,90.25,182.5,387.25,1864.0


In [66]:
full_quests_statistics.groupby('quest_type')['proportion_of_real_edges'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
quest_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
book,167.0,0.551704,0.279289,0.044601,0.351866,0.51074,0.812123,1.0
online,178.0,0.66235,0.224697,0.0,0.487168,0.677409,0.84791,1.0
