## Spacy with coref via coreferee

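You will probably need to do the following to run this notebook (see the coreferee README for the authoritative, version-specific steps):
 * pip install coreferee
 * python -m coreferee install en  (installs coreferee's English model)
 * python -m spacy download en_core_web_trf  (the spaCy model loaded below)
 * ...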

In [1]:
import spacy
from spacy import displacy
import coreferee

from importlib import reload # while developing

### Load spaCy's big model, as recommended by coreferee

In [2]:
# Load the transformer-based English pipeline and list the components it ships with.
MODEL_NAME = "en_core_web_trf"
nlp = spacy.load(MODEL_NAME)
print(nlp.pipe_names)

spacy_entity_linker
['transformer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


### Add coreferee to SpaCy's pipeline

In [3]:
# Register coreferee as the last pipeline component. The membership check keeps
# this cell safe to re-run: spaCy raises a ValueError when a component name that
# is already in the pipeline is added a second time.
if 'coreferee' not in nlp.pipe_names:
    nlp.add_pipe('coreferee')
print(nlp.pipe_names)

['transformer', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer', 'coreferee']


### Test coref on a short example

In [4]:
# A short passage whose pronouns and the later bare surname "Piper" give the
# coreference resolver something to link. The trailing backslashes join the
# source lines into a single line of text.
sample_text = """Although he was busy with his work, Peter Piper had had enough of it.\
          He decided that he needed a holiday. \
          He travelled to Spain because he loved the country very much. Piper was happy."""
doc = nlp(sample_text)

### Examine the coref chains

In [5]:
# Show the named entities spaCy found, then the coreference chains from coreferee.
# Each chain lists its mentions followed by their token index in parentheses.
print(f'Entities: {doc.ents}')
print('Coref chains:')
example_chains = doc._.coref_chains
example_chains.print()

Entities: (Peter Piper, Spain, Piper)
Coref chains:
0: he(1), his(5), Piper(9), He(17), he(20), He(26), he(31), Piper(38)
1: work(6), it(14)
2: Spain(29), country(34)


### Can it link a long named mention with a shorter one?

In [6]:
# Probe whether a full name ("Bill Clinton") is linked to later partial mentions
# ("Clinton", "Bill") as well as to the pronoun "He".
clinton_text = """Bill Clinton was fist elected as president in 1992.\
          He was elected again in 1996. Clinton was very popular. Bill was a good politician. """
doc2 = nlp(clinton_text)
print(f'entities: {doc2.ents}')
print('coref  chains:')
clinton_chains = doc2._.coref_chains
clinton_chains.print()

entities: (Bill Clinton, 1992, 1996, Clinton, Bill)
coref  chains:
0: Clinton(1), He(11), Clinton(18)


### Try it on an English HC4 report for topic 1002

In [7]:
# Load the report for one topic and run it through the full pipeline
# (including coreferee, once it has been added to `nlp`).
topic_number = "1002"
report_path = f'report_data/{topic_number}_report.txt'
# Read inside a context manager so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open(file=report_path, mode='r', encoding='utf-8') as report_file:
    text = report_file.read()
doc1002 = nlp(text)

### Display the text, marking its entities and their types. The default types are the 18 types from [OntoNotes](https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf)

In [8]:
# Render the report with its named entities highlighted and labelled by type.
displacy.render(
    docs=doc1002,
    style="ent",
)

### Show the coref chains for the report for topic 1002

In [9]:
# Print every coreference chain coreferee resolved in the topic 1002 report.
report_chains = doc1002._.coref_chains
report_chains.print()

0: thousands(325), their(348)
1: Markov(451), his(471)
2: KGB(486), KGB(509)
3: Castro(546), his(554)
4: [Rebet(571); Bandera(574)], their(593)
5: UK(605), UK(648)
6: Litvinenko(611), him(625), his(632), Litvinenko(635), his(659), he(661), his(674), Litvinenko(685)
7: Russia(655), Russia(668)
8: Putin(678), he(680)


fin