In [1]:
import spacy

In [2]:
import pandas as pd
import json
import ast

In [3]:
answers = pd.read_json('checkin_answers.jsonl', lines=True)
answers.columns

Index(['text', '_input_hash', '_task_hash', 'spans', 'tokens', '_session_id',
       '_view_id', 'relations', 'answer', 'flagged', 'relation count',
       'complexity', 'original_text', 'document_index', 'md_sentence_index',
       'classification_origin', 'url', 'order', 'Used?', 'Dataset name',
       'ignore?', 'reason', 'original_md_text', 'characters'],
      dtype='object')

In [4]:
answers['spans'].iloc[0]

[{'start': 0, 'end': 4, 'token_start': 0, 'token_end': 0, 'label': 'type_of'},
 {'start': 5, 'end': 11, 'token_start': 1, 'token_end': 1, 'label': 'base'},
 {'start': 16,
  'end': 26,
  'token_start': 3,
  'token_end': 3,
  'label': 'type_of'},
 {'start': 27, 'end': 38, 'token_start': 4, 'token_end': 4, 'label': 'base'},
 {'start': 39,
  'end': 42,
  'token_start': 5,
  'token_end': 5,
  'label': 'confidence'},
 {'start': 43,
  'end': 48,
  'token_start': 6,
  'token_end': 6,
  'label': 'predicate'},
 {'start': 49, 'end': 55, 'token_start': 7, 'token_end': 7, 'label': 'base'},
 {'start': 56,
  'end': 62,
  'token_start': 8,
  'token_end': 8,
  'label': 'aspect_changing'}]

In [5]:
answers['text'].iloc[0]

'Heat stress and persistent dehydration can cause kidney damage. IMPLIED_BASE IMPLIED_BASE'

In [6]:
answers['relations'].iloc[0]

[{'head': 0,
  'child': 1,
  'head_span': {'start': 0,
   'end': 4,
   'token_start': 0,
   'token_end': 0,
   'label': 'type_of'},
  'child_span': {'start': 5,
   'end': 11,
   'token_start': 1,
   'token_end': 1,
   'label': 'base'},
  'color': '#c5bdf4',
  'label': 'Concept_Member'},
 {'head': 3,
  'child': 4,
  'head_span': {'start': 16,
   'end': 26,
   'token_start': 3,
   'token_end': 3,
   'label': 'type_of'},
  'child_span': {'start': 27,
   'end': 38,
   'token_start': 4,
   'token_end': 4,
   'label': 'base'},
  'color': '#c5bdf4',
  'label': 'Concept_Member'},
 {'head': 8,
  'child': 7,
  'head_span': {'start': 56,
   'end': 62,
   'token_start': 8,
   'token_end': 8,
   'label': 'aspect_changing'},
  'child_span': {'start': 49,
   'end': 55,
   'token_start': 7,
   'token_end': 7,
   'label': 'base'},
  'color': '#c5bdf4',
  'label': 'Concept_Member'},
 {'head': 1,
  'child': 7,
  'head_span': {'start': 5,
   'end': 11,
   'token_start': 1,
   'token_end': 1,
   'label': '

In [19]:
from spacy.matcher import Matcher
import networkx as nx
from tqdm import tqdm
tqdm.pandas()

In [46]:
def get_dependency_patterns(answers, head_type='type_of', tail_type='base'):
    patterns = {}
    for _, answer in tqdm(answers.iterrows(), total=answers.shape[0]):
        spans = answer['spans']
        text = answer['text']
        relations = answer['relations']
        nlp = spacy.load('en_core_web_md')
        doc = nlp(text)
        for relation in relations:
            if relation['label'] != 'Concept_Member':
                continue
            head_span = relation['head_span']
            child_span = relation['child_span']
            if head_span['label'] == head_type and child_span['label'] == tail_type:
                src_span = doc[head_span['token_start']: head_span['token_end'] + 1]
                tgt_span = doc[child_span['token_start']: child_span['token_end'] + 1]
                src_root = src_span.root
                tgt_root = tgt_span.root
                if tgt_span.text.lower() == 'implied_base':
                    continue
                edges = []
                for token in doc:
                    for child in token.children:
                        edges.append(('{}-{}'.format(token.lower_,token.i), '{}-{}'.format(child.lower_,child.i))) 
                graph = nx.Graph(edges) 
                path = None
                source = '{}-{}'.format(src_root.lower_, src_root.i)
                target = '{}-{}'.format(tgt_root.lower_, tgt_root.i)
                if nx.has_path(graph, source=source, target=target):
                    path = nx.shortest_path(graph, source=source, target=target)
                if path is not None:
                    for t in src_span:
                        n = '{}-{}'.format(t.lower_, t.i)  
                        if n not in path:
                            path.append(n)
                    for t in tgt_span:
                        n = '{}-{}'.format(t.lower_, t.i)
                        if n not in path:
                            path.append(n)
                    path_nodes = {}
                    for p in path:
                        t, i = p.rsplit('-', 1)
                        i = int(i)
                        if i in range(head_span['token_start'], head_span['token_end'] + 1):
                            t = '<src>'
                        elif i in range(child_span['token_start'], child_span['token_end'] + 1):
                            t = '<tgt>'
                        path_nodes[i] = t
                    path_nodes = sorted(path_nodes.items(), key=lambda x: x[0])
                    pattern = ' '.join([p[1] for p in path_nodes])
                    patterns[pattern] = patterns.get(pattern, 0) + 1
    patterns = {k:v for k,v in patterns.items()}
    patterns = sorted(patterns.items(), key=lambda x: x[1], reverse=True)
    return patterns

In [47]:
get_dependency_patterns(answers, head_type='type_of', tail_type='base')

100%|███████████████████████████████████████████| 60/60 [01:19<00:00,  1.32s/it]


[('<src> <tgt>', 46),
 ('<src> <tgt> <tgt>', 6),
 ('<src> <tgt> <tgt> <tgt> <tgt> <tgt>', 2),
 ('<src> <src> <src> <tgt>', 2),
 ('project <tgt> <tgt> on <src> <src> lead', 1),
 ('<tgt> are <src>', 1),
 ('<src> <src> <tgt>', 1),
 ('due part to loss <src> <src> <src> <src> <tgt>', 1),
 ('acute <src> <tgt>', 1),
 ('pregnancy <src> <tgt>', 1),
 ('neurological <src> <tgt> <tgt> <tgt> <tgt> <tgt>', 1),
 ('<src> water <tgt>', 1),
 ('<src> <tgt> cover', 1),
 ('is first point link between rates <tgt> <tgt> growing frequent <src>', 1),
 ('is first point link between rates <tgt> <tgt> growing <src>', 1)]

In [48]:
get_dependency_patterns(answers, head_type='aspect_changing', tail_type='base')

100%|███████████████████████████████████████████| 60/60 [01:18<00:00,  1.30s/it]


[('<tgt> <src>', 27),
 ('<src> of <tgt>', 14),
 ('<src> to <tgt> <tgt>', 2),
 ('<src> to <tgt>', 2),
 ('<src> of <tgt> <tgt>', 2),
 ('rain <tgt> <src>', 1),
 ('causing <src> in <tgt>', 1),
 ('socioeconomic <tgt> <src>', 1),
 ('<src> <src> <tgt> are', 1),
 ('leave <tgt> with <src> <src> <src> <src> <src>', 1),
 ('is source of <tgt> <tgt> <src> is', 1),
 ('<src> reverberated throughout <tgt>', 1),
 ('<src> of <tgt> stocks', 1),
 ('<tgt> to <src>', 1),
 ('<src> to security <tgt>', 1),
 ('<tgt> <tgt> holds led accelerated <src> <src>', 1),
 ('<src> from <tgt>', 1),
 ('<tgt> <tgt> <src>', 1),
 ('pushing number of <tgt> <src> <src> <src>', 1),
 ('<src> on <tgt>', 1),
 ('water <tgt> <src>', 1),
 ('<src> of waves <tgt>', 1),
 ('<src> of waves droughts <tgt>', 1),
 ('frequency <src> of <tgt> <tgt>', 1),
 ('frequency <src> of waves <tgt>', 1),
 ('frequency <src> of waves droughts <tgt>', 1),
 ('<src> of conflict <tgt>', 1),
 ('<src> <src> demand for <tgt>', 1),
 ('<src> for <tgt>', 1),
 ('<tgt> 

In [49]:
get_dependency_patterns(answers, head_type='change_direction', tail_type='base')

100%|███████████████████████████████████████████| 60/60 [01:17<00:00,  1.30s/it]


[('<src> <tgt>', 16),
 ('<tgt> <src>', 9),
 ('<src> of <tgt>', 4),
 ('<src> <src> <tgt>', 3),
 ('<tgt> level <src>', 2),
 ('<src> risk of <tgt>', 2),
 ('<src> <src> <src> <tgt>', 2),
 ('<src> <src> clouds <tgt> activity', 1),
 ('<src> <src> clouds rain <tgt> activity', 1),
 ('<src> <src> <src> <src> <src> <tgt>', 1),
 ('causing <src> rainfall in <tgt>', 1),
 ('<src> levels of <tgt>', 1),
 ('<src> in <tgt>', 1),
 ('<src> impact <tgt> availability', 1),
 ('growth <src> <tgt>', 1),
 ('<src> period <tgt> are', 1),
 ('<src> <src> spreading <tgt>', 1),
 ('leave <tgt> with <src> notice', 1),
 ('been <src> comes to growth of <tgt>', 1),
 ('poses <src> to viability of <tgt>', 1),
 ('<src> in likelihood of <tgt>', 1),
 ('<tgt> <src> <src> <src>', 1),
 ('<src> sellable <tgt>', 1),
 ('<src> <tgt> quality', 1),
 ('<tgt> <src> <src>', 1),
 ('<src> <tgt> <tgt>', 1),
 ('<src> concentrations of <tgt>', 1),
 ('<src> <src> <tgt> growth', 1),
 ('<src> distributions of <tgt> stocks', 1),
 ('<src> vulnerabi

In [88]:
def get_pos_patterns(answers, head_type='type_of', tail_type='base'):
    patterns = {}
    nlp = spacy.load('en_core_web_md')
    for _, answer in tqdm(answers.iterrows(), total=answers.shape[0]):
        spans = answer['spans']
        text = answer['text']
        relations = answer['relations']
        doc = nlp(text)
        token_tags = {}
        for span in spans:
            if 'token_start' not in span or 'token_end' not in span:
                continue
            token_start = span['token_start']
            token_end = span['token_end']
            tag = span['label']
            for i in range(token_start, token_end+1):
                token_tags[i] = tag
        for relation in relations:
            if relation['label'] != 'Concept_Member':
                continue
            head_span = relation['head_span']
            child_span = relation['child_span']
            if head_span['label'] == head_type and child_span['label'] == tail_type:
                src_span = doc[head_span['token_start']: head_span['token_end'] + 1]
                tgt_span = doc[child_span['token_start']: child_span['token_end'] + 1]

                start_idx = min(head_span['token_start'], child_span['token_start'])
                end_idx = max(head_span['token_end'], child_span['token_end']) + 1

                doc_span = doc[start_idx: end_idx]
                pattern = []
                for token in doc_span:
                    tag = token_tags.get(token.i, 'UNK')
                    elem = '{}_{}'.format(token.pos_, tag)
                    pattern.append(elem)
                pattern = ' '.join(pattern)
                patterns[pattern] = patterns.get(pattern, 0) + 1
                
    patterns = {k:v for k,v in patterns.items()}
    patterns = sorted(patterns.items(), key=lambda x: x[1], reverse=True)
    return patterns

In [89]:
get_pos_patterns(answers, head_type='type_of', tail_type='base')

100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 100.85it/s]


[('ADJ_type_of NOUN_base', 28),
 ('NOUN_type_of NOUN_base', 17),
 ('ADJ_type_of NOUN_base NOUN_base', 5),
 ('ADJ_type_of NOUN_type_of NOUN_base', 2),
 ('NOUN_base NOUN_base PUNCT_UNK ADP_UNK PRON_UNK ADJ_type_of NOUN_type_of',
  1),
 ('ADJ_type_of NOUN_aspect_changing ADP_when NUM_when PUNCT_UNK CCONJ_UNK SYM_effect_size NUM_effect_size NUM_effect_size PUNCT_UNK PUNCT_base NUM_base',
  1),
 ('NOUN_base VERB_UNK ADJ_type_of', 1),
 ('ADV_type_of ADJ_type_of NOUN_base', 1),
 ('NOUN_type_of NOUN_type_of CCONJ_type_of NOUN_type_of NOUN_base', 1),
 ('ADJ_type_of CCONJ_UNK ADJ_type_of NOUN_base', 1),
 ('ADJ_type_of NOUN_base ADP_base DET_base ADJ_base NOUN_base', 1),
 ('ADJ_type_of CCONJ_UNK ADJ_type_of NOUN_base ADP_base DET_base ADJ_base NOUN_base',
  1),
 ('ADJ_type_of CCONJ_UNK ADV_UNK DET_UNK ADJ_type_of CCONJ_UNK ADJ_type_of NOUN_base ADP_base DET_base ADJ_base NOUN_base',
  1),
 ('NOUN_type_of PUNCT_type_of ADJ_type_of NOUN_base', 1),
 ('ADJ_type_of NOUN_type_of NOUN_base CCONJ_UNK NOU

In [91]:
get_pos_patterns(answers, head_type='aspect_changing', tail_type='base')

100%|███████████████████████████████████████████| 60/60 [00:00<00:00, 97.71it/s]


[('NOUN_base NOUN_aspect_changing', 20),
 ('NOUN_aspect_changing ADP_UNK NOUN_base', 11),
 ('ADJ_base NOUN_aspect_changing', 7),
 ('NOUN_aspect_changing ADP_UNK ADJ_type_of NOUN_base', 5),
 ('NOUN_aspect_changing ADP_UNK NOUN_type_of NOUN_base', 3),
 ('NOUN_aspect_changing ADP_UNK NOUN_base NOUN_base', 2),
 ('NOUN_aspect_changing ADP_UNK ADJ_UNK NOUN_base', 1),
 ('NOUN_aspect_changing PUNCT_UNK NUM_base', 1),
 ('NOUN_aspect_changing NOUN_aspect_changing PUNCT_UNK DET_UNK AUX_confidence VERB_predicate ADJ_base CCONJ_UNK ADJ_base NOUN_aspect_changing PUNCT_UNK PUNCT_base',
  1),
 ('ADJ_base CCONJ_UNK ADJ_base NOUN_aspect_changing', 1),
 ('NOUN_aspect_changing CCONJ_UNK NOUN_change_direction NOUN_base DET_UNK AUX_confidence VERB_change_direction NUM_effect_size NOUN_effect_size ADP_UNK ADJ_type_of NOUN_aspect_changing ADP_when NUM_when PUNCT_UNK CCONJ_UNK SYM_effect_size NUM_effect_size NUM_effect_size PUNCT_UNK PUNCT_base',
  1),
 ('NOUN_aspect_changing ADP_when NUM_when PUNCT_UNK CCONJ_

In [92]:
get_pos_patterns(answers, head_type='change_direction', tail_type='base')

100%|██████████████████████████████████████████| 60/60 [00:00<00:00, 101.30it/s]


[('ADJ_change_direction NOUN_base', 6),
 ('VERB_change_direction NOUN_base', 5),
 ('VERB_change_direction NOUN_aspect_changing ADP_UNK NOUN_base', 3),
 ('NOUN_base VERB_change_direction', 3),
 ('VERB_change_direction DET_UNK NOUN_aspect_changing ADP_UNK NOUN_base', 3),
 ('NOUN_base AUX_confidence VERB_change_direction', 2),
 ('VERB_change_direction DET_UNK NOUN_base', 2),
 ('VERB_change_direction NUM_effect_size NOUN_base', 2),
 ('NOUN_change_direction NOUN_base', 2),
 ('NOUN_base NOUN_aspect_changing NOUN_change_direction', 2),
 ('NOUN_base NOUN_change_direction', 2),
 ('VERB_change_direction ADJ_base', 2),
 ('NOUN_change_direction ADP_change_direction NOUN_base', 1),
 ('NOUN_change_direction ADP_change_direction NOUN_base PUNCT_UNK NOUN_base',
  1),
 ('NOUN_change_direction ADP_change_direction NOUN_base PUNCT_UNK NOUN_base CCONJ_UNK NOUN_base',
  1),
 ('ADJ_change_direction PUNCT_change_direction SCONJ_change_direction PUNCT_change_direction ADJ_change_direction NOUN_type_of NOUN_ba