# Coreferencing – proof of concept

In [1]:
import pandas
import spacy
from jsonschema import validate

from typing import List

In [2]:
# Load the articles into a DataFrame; the input is read as JSON Lines (one JSON record
# per line), with the path taken relative to the notebook's working directory.
m_df = pandas.read_json('data/afp_macron1.jsonl', lines=True)

In [3]:
def _join_paragraphs(paragraphs):
    """Join an article's paragraphs into one newline-separated text.

    A missing value (None or a float NaN) yields an empty string instead of raising,
    because missing `news` values are only replaced with empty lists later in the
    notebook.
    """
    if paragraphs is None or isinstance(paragraphs, float):
        return ""
    return "\n".join(paragraphs)


# One text per article; the newline positions are what later map character offsets
# back to paragraph numbers.
m_df['text'] = m_df['news'].apply(_join_paragraphs)

In [4]:
# Load the spaCy pipeline stored in a local model directory (path is relative to the
# notebook's working directory).
nlp = spacy.load('cue_model_afp_gu_lg_dep_pos_830/')



In [5]:
# Run the pipeline over every article text with 4 worker processes. The result is
# materialised as a list so that documents can be indexed by position later on.
docs = list(nlp.pipe(m_df.text.tolist(), n_process=4))

## Replace missing values (`None`) with empty lists

In [6]:
# List-valued columns: give every row with a missing value its own empty list.
# The fill mapping is rebuilt for each column so no list object is shared between columns.
for column in ('entity_person', 'entity_location', 'news'):
    m_df[column] = m_df[column].fillna({row: [] for row in m_df.index})

# Every article in this data set gets the same publisher.
m_df["publisher"] = "AFP"

## Create entity data

In [7]:
import re
def get_paragraph_indices(doc):
    """Return the character range of each newline-separated paragraph of ``doc.text``.

    Args:
        doc: Any object with a ``text`` string attribute (here: a spaCy ``Doc``).

    Returns:
        A list with one dict per paragraph, in document order, with the keys
        ``index`` (0-based paragraph number), ``start`` (offset of the paragraph's
        first character) and ``end`` (offset just past the paragraph's trailing
        newline, i.e. the next paragraph's ``start``; ``len(doc.text)`` for the
        last paragraph).
    """
    text = doc.text
    # Sentinels: -1 acts as a newline just before the text and len(text) - 1 closes the
    # last paragraph, so adding 1 below yields start 0 and end len(text) respectively.
    breaks = [-1, *(m.start() for m in re.finditer('\n', text)), len(text) - 1]
    # Pair each break with its successor: one pass instead of a nested loop.
    return [
        {'index': i, 'start': previous + 1, 'end': following + 1}
        for i, (previous, following) in enumerate(zip(breaks, breaks[1:]))
    ]

In [8]:
def get_paragraph_numbers(ent: "spacy.tokens.span.Span", pargraph_indices: List[dict]):
    """Locate a span within the paragraphs of its document.

    Args:
        ent: Object exposing ``start_char`` and ``end_char`` (document-level character
            offsets, ``end_char`` exclusive), e.g. a spaCy ``Span``.
        pargraph_indices: Paragraph ranges as produced by ``get_paragraph_indices``:
            dicts with ``index``, ``start`` and ``end``. (The misspelt parameter name
            is kept so existing keyword callers keep working.)

    Returns:
        Dict with ``paragraph_start`` / ``paragraph_end`` (paragraph numbers in which
        the span begins / ends) and ``span_in_paragraph_start`` /
        ``span_in_paragraph_end`` (the span's start / exclusive end offset, each
        measured from the start of its own paragraph). All values are ``None`` if no
        paragraph matches.
    """
    start_char, end_char = ent.start_char, ent.end_char
    # end_char is exclusive, so the paragraph the span ends in is the one holding its
    # last covered character. Testing end_char itself would place a span that ends
    # exactly on a paragraph boundary into the following paragraph at offset 0.
    # max() keeps empty spans anchored at their start position.
    last_char = max(start_char, end_char - 1)

    start, end = None, None
    para_span_start, para_span_end = None, None
    for p in pargraph_indices:
        para_start_char = p['start']
        para_end_char = p['end']
        # A paragraph's 'end' equals the next paragraph's 'start', so a boundary
        # offset matches two paragraphs; the later one wins because the loop goes on.
        if para_start_char <= start_char <= para_end_char:
            start = p['index']
            para_span_start = start_char - para_start_char
        if para_start_char <= last_char <= para_end_char:
            end = p['index']
            para_span_end = end_char - para_start_char

    return {
        'paragraph_start': start,
        'paragraph_end': end,
        'span_in_paragraph_start': para_span_start,
        'span_in_paragraph_end': para_span_end,
    }

In [9]:
# For each document, build one record per entity: its label, text and document-level
# character offsets, plus its paragraph-relative position. 'span_coref' is a
# placeholder that is always None here.
doc_ents = []
for doc in docs:
    paragraph_indices = get_paragraph_indices(doc)
    records = []
    for ent in doc.ents:
        record = {
            'span_label': ent.label_,
            'span_text': ent.text,
            'span_start': ent.start_char,
            'span_end': ent.end_char,
            'span_coref': None,
        }
        # Appended after the span fields, so the key order matches a trailing ** unpacking.
        record.update(get_paragraph_numbers(ent, paragraph_indices))
        records.append(record)
    doc_ents.append(records)

In [10]:
## Sanity check: for one hand-picked entity, the paragraph-relative offsets should
## recover the same text as the document-level offsets.
indx = 88
span = doc_ents[indx][6]
texts = docs[indx].text.split('\n')

first_para = span['paragraph_start']
last_para = span['paragraph_end']
span_paragraphs = texts[first_para:last_para + 1]

# 1) the paragraph(s) the entity lies in
print(span_paragraphs)
# 2) the entity text recovered through the paragraph-relative offsets
print("\n".join(span_paragraphs)[span['span_in_paragraph_start']:span['span_in_paragraph_end']])
# 3) the entity text recovered through the document-level offsets
print(docs[indx].text[span['span_start']:span['span_end']])

['This uneasy relationship was complicated further when Paris accused Turkish ships of being "extremely aggressive" towards a French navy vessel in June.']
ships
ships


In [11]:
# Attach the per-document entity lists as a new column. doc_ents follows the order of
# docs, which were produced from m_df.text, so the lists line up with the rows by position.
m_df['entities'] = doc_ents

## Reformat data for output

In [12]:
# Rename the source columns to the field names used in the output records. The result
# is re-assigned instead of using inplace=True, which offers no benefit and prevents
# method chaining.
m_df = m_df.rename(columns={'uno': 'path', 'created': 'publish_date', 'news': 'paragraphs'})

In [13]:
# Collect the article-level fields into one dict per row and store it in `metadata`.
metadata_columns = [
    'path', "publisher", "publish_date", "topic",
    "entity_person", "entity_location", "title",
]
m_df['metadata'] = m_df[metadata_columns].to_dict(orient='records')

In [14]:
# Keep only the three output fields and write them to disk as a JSON array of records.
output_columns = ["metadata", "paragraphs", "entities"]
data = m_df[output_columns]
data.to_json('data.json', orient='records')

## Validate data against schema

In [15]:
# Serialise the same records to an in-memory JSON string for validation.
# Note: `data` is rebound here from a DataFrame to a str.
data = data.to_json(orient='records')

In [16]:
import json
# Load the JSON Schema the exported data has to satisfy. The encoding is given
# explicitly so the file is decoded as UTF-8 regardless of the platform's locale default.
with open('../schema/example_schema.json', 'rt', encoding='utf-8') as fin:
    schema = json.load(fin)
# Names of the properties the schema declares for each array item.
schema_columns = list(schema['items']['properties'].keys())

### Validate each array item

In [17]:
# Validate every record on its own against the schema of a single array item.
# The item schema is schema['items']. Its 'properties' entry is only the mapping of
# property names to sub-schemas; used as a schema, its keys are unknown keywords that
# constrain nothing, so validating against it can never fail.
# NOTE(review): if the item schema contains `$ref`s that point at the schema root,
# they cannot be resolved from the sub-schema alone; rely on the whole-dataset
# validation in that case.
for item in json.loads(data):
    validate(item, schema['items'])

### Validate entire dataset

In [18]:
# Validate the complete array of records against the full schema; jsonschema.validate
# raises if the data does not conform.
validate(json.loads(data),schema)

## Visual inspection of data

In [19]:
from spacy import displacy

In [20]:
# Render one hand-picked document with its entity spans highlighted.
d = docs[159]

# Highlight colour per entity label; only these labels are passed to the renderer.
label_colors = {
    "Content": "#ffe500",
    "Source": "#ffabdb",
    "Cue": "#90dcff",
}
options = {"ents": list(label_colors), "colors": label_colors}
displacy.render(d, style="ent", options=options)