# Coreference Resolution

In [None]:
###########################################################
# IMPORTANT NOTE:
# In this notebook we use an experimental spaCy model
# named "en_coreference_web_trf", which is not part of the
# official library. At the time of writing, the official
# documentation is inaccurate regarding the version used
# in this tutorial. We can expect an update to spaCy's
# documentation: https://spacy.io/api/coref
###########################################################

## #1. Setup development environment

### Update & import Python modules

In [None]:
# install and download spaCy related modules and dependencies
!pip install --upgrade spacy-experimental
!pip install https://github.com/explosion/spacy-experimental/releases/download/v0.6.0/en_coreference_web_trf-3.4.0a0-py3-none-any.whl

# spaCy
import spacy

# Google Drive
from google.colab import drive

# general Python modules
import json
from pprint import pprint

In [None]:
# list all Python packages installed
!pip list

### Get access to Google Drive

In [None]:
# mount Google Drive, forcing a remount if it is already mounted
drive.mount(mountpoint="/content/gdrive/", force_remount=True)
print("Stablished access to Google Drive")

# root folder of the mounted Drive; file paths are built on top of it
DRIVE_PATH = "/content/gdrive/My Drive"

Mounted at /content/gdrive/
Stablished access to Google Drive


### Retrieve main data structures

In [None]:
# retrieve Text Record from JSON file
# (explicit UTF-8 so decoding does not depend on the platform's default encoding)
with open(DRIVE_PATH + "/ie_course/assets/text_record.json", encoding="utf-8") as f:
  text_rec = json.load(f)

# reached only when the file was opened and parsed without raising
print("Retrieved text record")

Retrieved text record


## #2. Resolve references


### Create pipeline

In [None]:
# name of the installed coreference pipeline package to load
COREF_MODEL_NAME = "en_coreference_web_trf"

# pipeline object used by the cells below to process text
nlp = spacy.load(COREF_MODEL_NAME)

### Coreference resolution

In [None]:
# texts stored in the text record, to be run through the pipeline
text = text_rec["texts"]

# stream the texts through the pipeline in batches of 50;
# for each resulting doc show its text followed by its span groups
for doc in nlp.pipe(text, batch_size=50):
  pprint(doc.text)
  pprint(doc.spans)  # coreference
  print()

('Chris, that’s because we have great testing, because we have the best '
 'testing in the world. If we didn’t test, you wouldn’t be able to show that '
 'chart. If we tested half as much, those numbers would be down. We tested-')
{'coref_clusters_1': [we have, we have, we did, we tested, We tested-],
 'coref_clusters_2': [Chris,, you would]}

('No, no. But I don’t say… I say flames. We’ll put out the flames and we’ll '
 'put out, in some cases, just burning embers. We also have burning embers. We '
 'have embers and we do have flames. Florida became more flame-like, but it’s '
 'going to be under control. And it’s not just this country. It’s many '
 'countries. We don’t talk about it in the news. They don’t talk about Mexico, '
 'Mexico and Brazil and still parts of Europe, which actually got hit sooner '
 'than us, so it’s a little ahead of us in that sense. But you take a look, '
 'why don’t they talk about Mexico, which is not helping us? And all I can say '
 'is thank God I built 

## #3. Utils (optional)

### Simple coreference resolution example

In [None]:
# short two-sentence input to run through the pipeline
example_text = "John Smith called from New York. He says it's raining in the city."

# process the example and print the span groups stored on the doc
doc = nlp(example_text)
print(doc.spans)

{'coref_clusters_1': [John Smith called, He says], 'coref_clusters_2': [New York., the city.]}


### Analyze the pipeline

In [None]:
# list the names of the pipeline components
print(nlp.pipe_names)

# analyze the pipeline: request the pretty (table) report, then
# pretty-print the value returned by the call as well
pipe_analysis = nlp.analyze_pipes(pretty=True)
pprint(pipe_analysis)

['sentencizer', 'transformer', 'coref', 'span_resolver', 'span_cleaner']
[1m

#   Component       Assigns               Requires    Scores          Retokenizes
-   -------------   -------------------   ---------   -------------   -----------
0   sentencizer     token.is_sent_start               sents_f         False      
                    doc.sents                         sents_p                    
                                                      sents_r                    
                                                                                 
1   transformer     doc._.trf_data                                    False      
                                                                                 
2   coref           doc.spans             doc.spans   coref_f         False      
                                                      coref_p                    
                                                      coref_r                    
                   