In [34]:
import os
import json
import random
import networkx as nx
from networkx import json_graph
import stanza
from collections import Counter, defaultdict
import itertools
# A list of selected real quest paths. The texts themselves are not published for copyright reasons.
from data import SELECTED_REAL_PATHS 

In [8]:
# Russian Stanza pipeline; used as a module-level global by text2nouns_and_verbs_data.
nlp = stanza.Pipeline(lang='ru')

2021-05-24 19:46:50 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |
| depparse  | syntagrus |
| ner       | wikiner   |

2021-05-24 19:46:50 INFO: Use device: cpu
2021-05-24 19:46:50 INFO: Loading: tokenize
2021-05-24 19:46:50 INFO: Loading: pos
2021-05-24 19:46:51 INFO: Loading: lemma
2021-05-24 19:46:52 INFO: Loading: depparse
2021-05-24 19:46:53 INFO: Loading: ner
2021-05-24 19:47:01 INFO: Done loading processors!


In [11]:
# Directory holding one JSON graph file per quest book.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
NEW_GRAPHS_DIR = 'D:\\Диплом_текстовые_квесты\\Data\\new_questbook_graphs'

In [12]:
# Show which graph files are available in the directory.
os.listdir(NEW_GRAPHS_DIR)

['Alien_The_Last_Escape.json',
 'cyberpunk.json',
 'game10034.json',
 'game10075.json',
 'game10078.json',
 'game10092.json',
 'game10099.json',
 'game10106.json',
 'game10120.json',
 'game10163.json',
 'game10164.json',
 'game10225.json',
 'game10229.json',
 'game10234.json',
 'game10302.json',
 'game10365.json',
 'game9464.json',
 'game9723.json',
 'game9771.json',
 'game9776.json',
 'game9800.json',
 'game9816.json',
 'game9963.json',
 'game9974.json',
 'Gmanjob.json',
 'Ksnsndkj223.json',
 'Luna-park_dlya_smelchakov.json',
 'NSS.json',
 'ostrov_osminogov.json',
 'Peshchera_Vremeni.json',
 'prosto2002.json',
 'Puteshestviye_na_dno_morya_.json',
 'Tesskyrim.json',
 'V_poyezde_s_vampirami.json']

In [13]:
# Load a single graph file to inspect its structure. The path is built from
# NEW_GRAPHS_DIR so the directory is configured in exactly one place.
with open(os.path.join(NEW_GRAPHS_DIR, 'Alien_The_Last_Escape.json')) as f:
    test = json.load(f)

In [14]:
# Top-level keys of the loaded graph JSON.
test.keys()

dict_keys(['directed', 'multigraph', 'graph', 'nodes', 'links'])

In [15]:
# Peek at the first node record of the loaded graph.
test['nodes'][0]

{'fragment_text': 'Дорогие друзья!\nПеред вами сторигейм - [b]Чужой: Последний выход[/b].\nОн создан по мотивам фильма и книги "Чужой", и повествует альтернативную историю о встрече Ностромо с LV-426.',
 'id': '38e6c3ca-5be5-11eb-bd01-002590e2f74e'}

In [27]:
def generate_random_data(graphs_directory, length, rng=None):
    """Sample ``length`` random text fragments from the quest graphs in a directory.

    Every fragment is produced by drawing a graph file uniformly at random
    (with replacement) and then drawing one of its usable fragments uniformly
    at random. A fragment is usable when the node has a ``fragment_text`` key
    whose value is a string that is not empty after stripping whitespace.

    Files that contain no usable fragment are dropped from the candidate pool
    and another file is drawn instead, so the result has exactly ``length``
    items as long as at least one file provides a usable fragment.

    Args:
        graphs_directory: Directory with JSON files, each holding a ``nodes`` list.
        length: Number of fragments to return.
        rng: Optional object exposing ``choice`` (for example ``random.Random(seed)``).
            Defaults to the global ``random`` module.

    Returns:
        A list of fragment strings. It is shorter than ``length`` (namely empty)
        only when no file in the directory provides a usable fragment.
    """
    if rng is None:
        rng = random
    # os.listdir returns entries in arbitrary order; sorting makes seeded runs reproducible.
    candidate_files = sorted(os.listdir(graphs_directory))
    # Cache of usable fragments per file so a file drawn repeatedly is parsed only once.
    fragments_by_file = {}
    random_data = []
    while len(random_data) < length and candidate_files:
        chosen_quest_file = rng.choice(candidate_files)
        if chosen_quest_file not in fragments_by_file:
            with open(os.path.join(graphs_directory, chosen_quest_file)) as f:
                graph_data = json.load(f)
            fragments_by_file[chosen_quest_file] = [
                node['fragment_text']
                for node in graph_data['nodes']
                if 'fragment_text' in node
                and isinstance(node['fragment_text'], str)
                and node['fragment_text'].strip()
            ]
        fragments = fragments_by_file[chosen_quest_file]
        if not fragments:
            # Nothing to sample here: never draw this file again and retry.
            candidate_files.remove(chosen_quest_file)
            continue
        random_data.append(rng.choice(fragments))
    return random_data

In [28]:
# One random pseudo-path per real path, matching the real path's length.
# NOTE(review): no random seed is set, so every run produces a different baseline.
random_data = [
    generate_random_data(NEW_GRAPHS_DIR, len(real_path))
    for real_path in SELECTED_REAL_PATHS
]

In [31]:
# Persist the real paths together with their random counterparts in one JSON file.
with open('D:\\Диплом_текстовые_квесты\\Квесты, отобранные для оценки адекватности\\sanity_check_data.json', 'w') as f:
    json.dump({'real': SELECTED_REAL_PATHS, 'random': random_data}, f)
    

In [4]:
# Reload the saved sanity-check data (keys 'real' and 'random') from disk.
with open('D:\\Диплом_текстовые_квесты\\Квесты, отобранные для оценки адекватности\\sanity_check_data.json') as f:
    sanity_check_data = json.load(f)

In [9]:
def word2json(word:stanza.models.common.doc.Word):
    """Return a plain dict with the text, lemma, UPOS tag, features and deprel of a word."""
    exported_fields = ('text', 'lemma', 'upos', 'feats', 'deprel')
    return {name: getattr(word, name) for name in exported_fields}

def dependency2json(dependency):
    """Convert a dependency triple into a JSON-friendly list.

    The first and third items are serialized with ``word2json``; the middle
    item is passed through unchanged.
    """
    first_word = word2json(dependency[0])
    relation = dependency[1]
    second_word = word2json(dependency[2])
    return [first_word, relation, second_word]

def text2nouns_and_verbs_data(text):
    """Parse ``text`` with the global ``nlp`` pipeline and collect morphological data.

    Returns a dict with three lists:
      * ``nouns`` - serialized words tagged NOUN or PROPN,
      * ``pronouns`` - serialized words tagged PRON,
      * ``verbal_deps`` - serialized dependency triples whose third item is tagged VERB.

    If ``text`` is not a string, all three lists are empty.
    """
    collected = {'nouns': [], 'pronouns': [], 'verbal_deps': []}
    if not isinstance(text, str):
        return collected

    for sentence in nlp(text).sentences:
        # A single pass is enough: the three categories are mutually exclusive UPOS tags.
        for dep in sentence.dependencies:
            tag = dep[2].upos
            if tag in ('NOUN', 'PROPN'):
                collected['nouns'].append(word2json(dep[2]))
            elif tag == 'PRON':
                collected['pronouns'].append(word2json(dep[2]))
            elif tag == 'VERB':
                collected['verbal_deps'].append(dependency2json(dep))
    return collected

In [13]:
def create_pseudograph(sequence):
    """Build an edgeless graph with one node per text in ``sequence``.

    Node ids are the positions in ``sequence``. Each node stores the raw text
    under ``joined_text`` and its morphological data under ``joined_morphodata``.
    """
    pseudograph = nx.Graph()
    for position, fragment in enumerate(sequence):
        attributes = {
            'joined_text': fragment,
            'joined_morphodata': text2nouns_and_verbs_data(fragment),
        }
        pseudograph.add_node(position, **attributes)
    return pseudograph

In [14]:
# Smoke test: build the pseudograph for the first real path.
# NOTE(review): this rebinds `test`, which an earlier cell uses for the raw graph JSON.
test = create_pseudograph(sanity_check_data['real'][0])

In [31]:
def choose_tag(fragment_noun_tags):
    """Pick the most prominent syntactic role from a collection of role tags.

    Priority is 'subject', then 'object'; anything else maps to 'other_dep'.
    """
    for role in ('subject', 'object'):
        if role in fragment_noun_tags:
            return role
    return 'other_dep'

def quest_path2entity_graph(texts):
    """Build an entity graph for a path of text fragments.

    The fragments are parsed via ``create_pseudograph``; fragment ``i`` of
    ``texts`` becomes the integer node ``i``. The resulting graph contains:

      * a ``'global'`` node (``fragment_text='UNK'``) linked to every fragment
        with an edge labelled ``'global'``;
      * one ``'noun_<lemma>'`` node for every noun lemma that occurs more than
        once over the whole path (occurrences are counted, not fragments),
        linked to each fragment that contains it; the edge label is the
        syntactic role chosen by ``choose_tag`` ('subject', 'object' or
        'other_dep');
      * an edge labelled ``'fragments_pair'`` between every two fragments that
        share a noun lemma, regardless of how often the lemma occurs.

    Args:
        texts: Sequence of fragment texts forming the path.

    Returns:
        A tuple ``(graph, [noun_node_ids])`` where ``noun_node_ids`` is the list
        of ``'noun_<lemma>'`` node ids, wrapped in an outer one-element list.
    """
    morphology_key = 'joined_morphodata'
    text_key = 'joined_text'

    G = nx.Graph()
    morphograph = create_pseudograph(texts)
    path = list(range(len(texts)))
    nodes = morphograph.nodes()

    noun_lemmas_counter = Counter()
    nouns2nodes = defaultdict(set)
    node2chosen_syntactic_tags = {}
    for node in path:
        current_node2all_syntactic_tags = defaultdict(set)
        for noun_data in nodes[node][morphology_key]['nouns']:
            noun_lemma = noun_data['lemma']
            noun_lemmas_counter[noun_lemma] += 1
            nouns2nodes['noun_'+noun_lemma].add(node)
            # Collapse the dependency relation into three coarse roles.
            dependency = noun_data['deprel']
            if 'subj' in dependency:
                dep_type = 'subject'
            elif 'obj' in dependency:
                dep_type = 'object'
            else:
                dep_type = 'other_dep'
            current_node2all_syntactic_tags[noun_lemma].add(dep_type)

        # A lemma may occur in several roles within one fragment; keep the most prominent one.
        node2chosen_syntactic_tags[node] = {
            noun: choose_tag(tags)
            for noun, tags in current_node2all_syntactic_tags.items()
        }

    # The list keeps first-occurrence order for node creation and for the return value;
    # the set gives constant-time membership tests in the edge loop below.
    relevant_nouns = [noun for noun, count in noun_lemmas_counter.items() if count > 1]
    relevant_nouns_set = set(relevant_nouns)

    G.add_node('global', fragment_text='UNK')
    for relevant_noun in relevant_nouns:
        G.add_node('noun_'+relevant_noun, fragment_text=relevant_noun)
    for node in path:
        G.add_node(node, fragment_text=nodes[node][text_key])
        G.add_edge(node, 'global', label='global')
        for noun, syntactic_role in node2chosen_syntactic_tags[node].items():
            if noun in relevant_nouns_set:
                G.add_edge(node, 'noun_'+noun, label=syntactic_role)
    # Directly connect every pair of fragments that mention the same noun lemma.
    for fragment_nodes in nouns2nodes.values():
        for pair in itertools.combinations(fragment_nodes, 2):
            G.add_edge(*pair, label='fragments_pair')
    return G, [['noun_'+relevant_noun for relevant_noun in relevant_nouns]]

In [32]:
def texts2graph_data(texts):
    """Turn a path of texts into a serializable record.

    The record is ``['_', fragment indices, node-link graph data, noun node ids]``;
    the first item is a constant placeholder string.
    """
    fragment_indices = list(range(len(texts)))
    entity_graph, noun_groups = quest_path2entity_graph(texts)
    serialized_graph = json_graph.node_link_data(entity_graph)
    return ['_', fragment_indices, serialized_graph, noun_groups]

In [35]:
# Convert every real and random path into its entity-graph record.
prepared_sanity_check_data = {
    split: [texts2graph_data(texts) for texts in sanity_check_data[split]]
    for split in ('real', 'random')
}

In [36]:
# Save the entity-graph records for both splits to disk.
with open('D:\\Диплом_текстовые_квесты\\Квесты, отобранные для оценки адекватности\\sanity_check_data_entity_graph.json', 'w') as f:
    json.dump(prepared_sanity_check_data, f)

In [42]:
# Round-trip check: rebuild a networkx graph from the node-link data of the first random path.
test_random = json_graph.node_link_graph(prepared_sanity_check_data['random'][0][2])