In [1]:
# List the GPU(s) attached to this runtime.
!nvidia-smi -L

GPU 0: Tesla V100-SXM2-16GB (UUID: GPU-168ec7cb-881a-0c05-3ed6-8f3dd017a1f0)


In [2]:
# Mount Google Drive; the later cells read and write under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Cross-document Coreference Resolution over Predicted Mentions

- Notebook to train the model in paper https://arxiv.org/abs/2106.01210
- Following the repo instructions in https://github.com/ariecattan/coref

### Installs

In [3]:
%%capture
# Pinned dependency versions; %%capture hides the pip output.
!pip install pyhocon==0.3.51
!pip install transformers==2.8

### Clone repo

In [4]:
# Clone the coref repo into the current working directory (later cells expect /content/coref).
!git clone https://github.com/ariecattan/coref

Cloning into 'coref'...
remote: Enumerating objects: 556, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 556 (delta 7), reused 12 (delta 4), pack-reused 532[K
Receiving objects: 100% (556/556), 4.59 MiB | 21.36 MiB/s, done.
Resolving deltas: 100% (331/331), done.


### Process ECB+ data

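- The processed data can be found in 'AC295-Wiki-ECB-WEC/Final Deliverables/Coref-for-GPT/Data/processed_ecb/data'. Note: the copy cells below read and write 'AC295-Wiki-ECB-WEC/Final Deliverables/Data/processed_ecb/data' (without the 'Coref-for-GPT' folder), so check which of the two locations exists in your Drive before running them.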

In [5]:
# Run the repo's get_ecb_data.py on the raw ECB+ folder given by --data_path.
# NOTE(review): the recorded run below failed with an OSError because
# ECBplus_coreference_sentences.csv was not found in that folder -- confirm the
# Drive path before re-running.
!python /content/coref/get_ecb_data.py --data_path /content/drive/MyDrive/Coref-for-GPT/Data/ECB+

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  delimiter=',', dtype=np.str, skip_header=1)
Traceback (most recent call last):
  File "/content/coref/get_ecb_data.py", line 260, in <module>
    delimiter=',', dtype=np.str, skip_header=1)
  File "/usr/local/lib/python3.7/dist-packages/numpy/lib/npyio.py", line 1793, in genfromtxt
    fid = np.lib._datasource.open(fname, 'rt', encoding=encoding)
  File "/usr/local/lib/python3.7/dist-packages/numpy/lib/_datasource.py", line 193, in open
    return ds.open(path, mode, encoding=encoding, newline=newline)
  File "/usr/local/lib/python3.7/dist-packages/numpy/lib/_datasource.py", line 533, in open
    raise IOError("%s not found." % path)
OSError: /content/drive/MyDrive/Coref-for-GPT/Data/ECB+/ECBplus_coreference_sentences.csv not found.


In [10]:
# Back up the local 'data' folder to Drive.
# shutil.copytree raises if the destination folder already exists, so this cell
# can only be run once per destination.
import shutil
shutil.copytree('data', '/content/drive/MyDrive/AC295-Wiki-ECB-WEC/Final Deliverables/Data/processed_ecb/data')

'/content/drive/MyDrive/AC295-Wiki-ECB-WEC/Final Deliverables/Data/processed_ecb/data'

# Train Model

## Load ECB+ processed data

In [None]:
# Copy the processed data from Drive into a local 'data' folder
# (the config cells below use the relative path data/ecb/mentions).
# shutil.copytree raises if 'data' already exists locally.
import shutil
shutil.copytree('/content/drive/MyDrive/AC295-Wiki-ECB-WEC/Final Deliverables/Data/processed_ecb/data', 'data')

## Span Scorer

### Modify Span Scorer config

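Skip the next cell if you want to train with the repo's default config. The cell writes a modified copy to `configs/config_span_scorer_colab.json`; pass that file to `--config` in the training cell to use it.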

In [None]:
import json

# Span-scorer settings. Key order is kept because it is the order written to disk.
config_span_scorer = {
    # device
    "gpu_num": 0,

    # encoder and scorer head
    "bert_model": "roberta-large",
    "hidden_layer": 1024,
    "dropout": 0.3,
    "with_mention_width": True,
    "with_head_attention": True,
    "embedding_dimension": 20,

    # mention candidates
    "max_mention_span": 10,
    "mention_type": "events",
    "subtopic": True,
    "segment_window": 512,
    "exact": False,

    # optimisation
    "random_seed": 0,
    "epochs": 10,
    "batch_size": 16,
    "learning_rate": 1e-4,
    "weight_decay": 0,
    "loss": "bce",
    "optimizer": "adam",
    "adam_epsilon": 1e-8,

    # input / output locations (relative paths)
    "log_path": "logs/span_scorer",
    "data_folder": "data/ecb/mentions",
    "model_path": "models/span_scorers",
    "exp_num": 0,
}

# Serialise the dict as JSON into the repo's configs folder under a Colab-specific name.
with open('/content/coref/configs/config_span_scorer_colab.json', "w") as config_file:
    config_file.write(json.dumps(config_span_scorer))

### Train Span Scorer

In [None]:
# Trains with the repo's default span-scorer config. If you wrote a modified
# config in the cell above, change --config to point at that file
# (config_span_scorer_colab.json) instead.
!python /content/coref/train_span_scorer.py --config /content/coref/configs/config_span_scorer.json

DEBUG:filelock:Attempting to acquire lock 140476708217424 on /root/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748.lock
DEBUG:filelock:Lock 140476708217424 acquired on /root/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748.lock
Downloading: 100% 482/482 [00:00<00:00, 462kB/s]
DEBUG:filelock:Attempting to release lock 140476708217424 on /root/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748.lock
DEBUG:filelock:Lock 140476708217424 released on /root/.cache/torch/transformers/c22e0b5bbb7c0cb93a87a2ae01263ae715b4c18d692b1740ce72cacaa99ad184.2d28da311092e99a05f9ee17520204614d60b0bfdb32f8a75644df7737b6a748.lock
DEBUG:filelock:Attempting to acquire lock 140476710208464 on /roo

### Save model

In [None]:
# Back up the local 'models' folder to Drive.
# copytree raises if the destination folder already exists.
from shutil import copytree

copytree('models', '/content/drive/MyDrive/Coref-for-GPT/Models/Streamline/models')

'/content/drive/MyDrive/AC295-Wiki-ECB-WEC/streamlining/models'

## Pairwise Scorer

### Modify Pairwise Scorer config

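Skip the next cell if you want to keep the repo's default config. The cell writes a modified copy to `configs/config_pairwise_colab.json`, which is the file the training cell below passes to `--config`.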

In [None]:
import json

# Pairwise-scorer settings. Key order is kept because it is the order written to disk.
config_pairwise_scorer = {
    # device and encoder / scorer head
    "gpu_num": [0],
    "bert_model": "roberta-large",
    "bert_hidden_size": 1024,
    "hidden_layer": 1024,
    "dropout": 0.3,
    "with_mention_width": True,
    "with_head_attention": True,
    "embedding_dimension": 20,

    # mention candidates and pairing
    "max_mention_span": 10,
    "use_gold_mentions": True,
    "mention_type": "events",
    "top_k": 0.25,
    "training_method": "continue",
    "subtopic": True,
    "use_predicted_topics": False,
    "segment": True,

    # optimisation
    "random_seed": 0,
    "epochs": 10,
    "batch_size": 32,
    "learning_rate": 1e-4,
    "weight_decay": 0,
    "loss": "bce",
    "optimizer": "adam",
    "adam_epsilon": 1e-8,
    "segment_window": 512,
    "neg_samp": True,
    "exact": False,

    # input / output locations (relative paths); the span_* entries point at
    # files under models/span_scorers
    "log_path": "logs/pairwise_scorer/",
    "data_folder": "data/ecb/mentions",
    "span_repr_path": "models/span_scorers/events_span_repr_0",
    "span_scorer_path": "models/span_scorers/events_span_scorer_0",
    "model_path": "models/pairwise_scorers",
}

# Serialise the dict as JSON; this is the file the pairwise training command reads.
with open('/content/coref/configs/config_pairwise_colab.json', "w") as config_file:
    config_file.write(json.dumps(config_pairwise_scorer))

In [None]:
# Trains the pairwise scorer with the Colab config written by the cell above.
# Point --config at the repo's default config instead if that cell was skipped.
!python /content/coref/train_pairwise_scorer.py --config /content/coref/configs/config_pairwise_colab.json

100% 50/50 [01:39<00:00,  1.99s/it]
100% 16/16 [00:19<00:00,  1.20s/it]
100% 50/50 [01:31<00:00,  1.83s/it]
100% 16/16 [00:19<00:00,  1.19s/it]
100% 50/50 [01:32<00:00,  1.84s/it]
100% 16/16 [00:19<00:00,  1.21s/it]
100% 50/50 [01:32<00:00,  1.85s/it]
100% 16/16 [00:19<00:00,  1.20s/it]
100% 50/50 [01:31<00:00,  1.84s/it]
100% 16/16 [00:19<00:00,  1.21s/it]
100% 50/50 [01:31<00:00,  1.84s/it]
100% 16/16 [00:19<00:00,  1.22s/it]
100% 50/50 [01:32<00:00,  1.85s/it]
100% 16/16 [00:19<00:00,  1.20s/it]
100% 50/50 [01:32<00:00,  1.85s/it]
100% 16/16 [00:19<00:00,  1.21s/it]
100% 50/50 [01:32<00:00,  1.85s/it]
100% 16/16 [00:19<00:00,  1.21s/it]
100% 50/50 [01:32<00:00,  1.84s/it]
100% 16/16 [00:19<00:00,  1.20s/it]


### Save models

In [None]:
import os
import shutil
from shutil import copytree  # kept in scope for any later cell that calls copytree()

# shutil.copytree() raises FileExistsError when the destination already exists,
# and the "Save model" cell after span-scorer training has already created this
# very folder. Merge file by file instead, so this cell works after that one
# and can be re-run safely.
models_backup_dir = '/content/drive/MyDrive/Coref-for-GPT/Models/Streamline/models'

if not os.path.isdir('models'):
    # Keep failing loudly (as copytree did) when there is nothing to back up.
    raise FileNotFoundError("local 'models' folder not found")

for src_root, _, file_names in os.walk('models'):
    # Mirror the sub-folder layout of 'models' under the Drive backup folder.
    dst_root = os.path.normpath(
        os.path.join(models_backup_dir, os.path.relpath(src_root, 'models')))
    os.makedirs(dst_root, exist_ok=True)
    for file_name in file_names:
        # Existing files with the same name are overwritten by the newer copy.
        shutil.copy2(os.path.join(src_root, file_name),
                     os.path.join(dst_root, file_name))

# Display the backup location, as copytree's return value did.
models_backup_dir

'/content/drive/MyDrive/AC295-Wiki-ECB-WEC/streamlining/models'

In [None]:
#from shutil import copytree

#copytree('models/pairwise_scorers', '/content/drive/MyDrive/Coref-for-GPT/Models/Streamline/models/pairwise_scorers')

'/content/drive/MyDrive/AC295-Wiki-ECB-WEC/streamlining/models/pairwise_scorers'

# Streamlining annotations


## Load ECB+ processed data

In [7]:
# Restore the processed data and the trained models from Drive into the local
# 'data' and 'models' folders. copytree raises if either local folder already exists.
# NOTE(review): the models path here ('.../Final Deliverables/Coref-for-GPT/Models/...')
# differs from the path the "Save models" cells write to -- confirm which one is current.
import shutil
shutil.copytree('/content/drive/MyDrive/AC295-Wiki-ECB-WEC/Final Deliverables/Data/processed_ecb/data', 'data')
shutil.copytree('/content/drive/MyDrive/AC295-Wiki-ECB-WEC/Final Deliverables/Coref-for-GPT/Models/Streamline/models', 'models')

'models'

In [None]:
import json

# Clustering settings for the dev-split run. Key order is kept because it is
# the order written to disk.
config_clustering = {
    # device and encoder / scorer head
    "gpu_num": [0],
    "bert_model": "roberta-large",
    "hidden_layer": 1024,
    "dropout": 0.3,
    "with_mention_width": True,
    "with_head_attention": True,
    "embedding_dimension": 20,

    # mention candidates and data selection
    "max_mention_span": 10,
    "use_gold_mentions": True,
    "mention_type": "entities",
    "top_k": 0.25,
    "split": "dev",
    "training_method": "continue",
    "subtopic": False,
    "use_predicted_topics": True,
    "segment_window": 512,
    "exact": True,

    # topic handling
    # NOTE(review): this looks like an absolute path on the repo author's machine -- confirm
    "topic_level": False,
    "predicted_topics_path": "/home/nlp/ariecattan/coreference/event_entity_coref_ecb_plus/data/external/document_clustering/predicted_topics",

    # input / output locations (relative paths) and checkpoint choice
    "data_folder": "data/ecb/mentions",
    "save_path": "models/pairwise_scorers",
    "model_path": "models/pairwise_scorers",
    "model_num": 6,
    "keep_singletons": False,

    # agglomerative clustering
    "threshold": 0.75,
    "linkage_type": "average",
}

# Serialise the dict as JSON; tuned_threshold.py is launched with this file in the next cell.
with open('/content/coref/configs/config_clustering_colab.json', "w") as config_file:
    config_file.write(json.dumps(config_clustering))

In [None]:
# Run the repo's tuned_threshold.py with the clustering config written above.
# The export cell below expects tt_<index>_<topic>.pkl files in the working directory.
!python /content/coref/tuned_threshold.py --config /content/coref/configs/config_clustering_colab.json

gpu_num = [
  0
]
bert_model = "roberta-large"
hidden_layer = 1024
dropout = 0.3
with_mention_width = true
with_head_attention = true
embedding_dimension = 20
max_mention_span = 10
use_gold_mentions = true
mention_type = "entities"
top_k = 0.25
split = "dev"
training_method = "continue"
subtopic = false
use_predicted_topics = true
segment_window = 512
exact = true
topic_level = false
predicted_topics_path = "/home/nlp/ariecattan/coreference/event_entity_coref_ecb_plus/data/external/document_clustering/predicted_topics"
data_folder = "data/ecb/mentions"
save_path = "models/pairwise_scorers"
model_path = "models/pairwise_scorers"
model_num = 6
keep_singletons = false
threshold = 0.75
linkage_type = "average"
<corpus.Corpus object at 0x7fde907ee190>
Model 0
Processing topic 35
Processing topic 34
Processing topic 18
Processing topic 21
Processing topic 23
Processing topic 2
Processing topic 5
Processing topic 12
Saving cluster for threshold 0.5
Saving cluster for threshold 0.55
Saving clu

In [None]:
import os
import pickle

import numpy as np
import pandas as pd

# Topics in the order they were processed; the position in this list is part
# of each pickle's file name (tt_<position>_<topic>.pkl).
topics = [35, 34, 18, 21, 23, 2, 5, 12]

pre_dir = '/content/drive/MyDrive/Coref-for-GPT/Data/ECB+/reproduced/streamlining/'
# The pickle copies and CSV exports below fail if the target folder is missing.
os.makedirs(pre_dir, exist_ok=True)

SPAN_COLUMNS = ['topic', 'doc_name', 'entity_num',
                'sentence_num', 'start_token', 'end_token',
                'cluster_id']
PAIR_COLUMNS = ['topic', 'entity_num_1', 'entity_num_2',
                'label', 'pred_proba']

# Layout of each pickled list, as written by the tuned_threshold script copy
# shown later in this notebook:
#   [data.mentions, data.topics_origin_tokens, span_meta_data, span_indices,
#    first, second, all_scores, labels]
# with span_meta_data = (doc_id, sentence_id, start, end).
# TODO confirm that the script in the cloned repo writes the same layout.


def _topic_spans(topic, mylist):
    """Build the per-span table of one topic.

    Returns ``(span_df, cluster_labels)`` where ``cluster_labels[i]`` is the
    label of the i-th selected span (``i`` is also its ``entity_num``).
    """
    span_meta_data = mylist[2]
    span_tokens = np.array(mylist[3])   # span_indices: positions of the selected spans
    labels = mylist[-1]
    cluster_labels = [int(labels[s]) for s in span_tokens]

    span_info = [
        (topic,                           # topic number
         span_meta_data[0][span_token],   # document name
         i,                               # entity index within the topic
         int(span_meta_data[1][span_token]),   # sentence id
         int(span_meta_data[2][span_token]),   # start token
         int(span_meta_data[3][span_token]),   # end token
         cluster_labels[i])               # cluster label
        for i, span_token in enumerate(span_tokens)
    ]
    return pd.DataFrame(span_info, columns=SPAN_COLUMNS), cluster_labels


def _topic_pairs(topic, mylist, cluster_labels):
    """Build the per-pair table (true label + predicted probability) of one topic."""
    # NOTE(review): mylist[-3] is `second` and mylist[-4] is `first`, so
    # entity_num_1 is taken from `second` while the score was computed for
    # (first, second). Kept as is so existing CSVs stay comparable -- confirm
    # this order is intended.
    pairs_info = [
        (topic,                 # topic number
         int(a),                # entity index of mention 1
         int(b),                # entity index of mention 2
         int(cluster_labels[int(a)] == cluster_labels[int(b)]),   # true label
         float(soft))           # predicted probability
        for a, b, soft in zip(mylist[-3], mylist[-4], mylist[-2])
    ]
    return pd.DataFrame(pairs_info, columns=PAIR_COLUMNS)


span_dfs = []
pairs_dfs = []

for topic_num, topic in enumerate(topics):
    pickle_name = 'tt_{}_{}.pkl'.format(topic_num, topic)

    # These pickles are produced by this notebook's own run; pickle.load must
    # not be pointed at files from an untrusted source.
    with open(pickle_name, 'rb') as f:
        mylist = pickle.load(f)

    # Keep a copy of the raw per-topic dump on Drive.
    with open(pre_dir + pickle_name, 'wb') as f:
        pickle.dump(mylist, f)

    span_df, cluster_labels = _topic_spans(topic, mylist)
    span_dfs.append(span_df)
    pairs_dfs.append(_topic_pairs(topic, mylist, cluster_labels))

pd.concat(pairs_dfs).reset_index(drop=True).to_csv(pre_dir + 'stream_pairs.csv', index=False)
pd.concat(span_dfs).reset_index(drop=True).to_csv(pre_dir + 'stream_spans.csv', index=False)


In [None]:
!python /content/coref/run_scorer.py /content/data/ecb/gold entities
!git clone https://github.com/ns-moosavi/coval/
shutil.rmtree('/content/coref')
#There's a directory missing in the repo, have to copy it from another repo 
shutil.copytree('/content/coval', '/content/coref/coval')
shutil.copytree('/content/data/ecb/gold_singletons', '/content/data/ecb/gold')

Traceback (most recent call last):
  File "/content/coref/run_scorer.py", line 2, in <module>
    from coval.coval.conll import reader
  File "/content/coref/coval/coval/conll/reader.py", line 2, in <module>
    from coval.conll import mention
ModuleNotFoundError: No module named 'coval.conll'


# Predict & Evaluate

## Load ECB+ processed data

In [None]:
# Restore the processed data and the trained models from Drive into the local
# 'data' and 'models' folders. copytree raises if either local folder already exists.
# NOTE(review): these Drive paths ('Coref-for-GPT/...') differ from the
# 'AC295-Wiki-ECB-WEC/Final Deliverables/...' paths used in earlier sections -- confirm.
import shutil
shutil.copytree('/content/drive/MyDrive/Coref-for-GPT/Data/processed_ecb/data', 'data')
shutil.copytree('/content/drive/MyDrive/Coref-for-GPT/Models/Streamline/models', 'models')

'models'

### Config

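TODO: `topic_level` and `use_predicted_topics` still need to be changed to match the paper's setup. It is unclear where to find the predicted-topics file used by the repo author (`predicted_topics_path` below still points to a path on the author's machine).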

In [None]:
import json

# Clustering settings for the test-split prediction run. Key order is kept
# because it is the order written to disk.
config_clustering = {
    # device and encoder / scorer head
    "gpu_num": [0],
    "bert_model": "roberta-large",
    "hidden_layer": 1024,
    "dropout": 0.3,
    "with_mention_width": True,
    "with_head_attention": True,
    "embedding_dimension": 20,

    # mention candidates and data selection
    "max_mention_span": 10,
    "use_gold_mentions": True,
    "mention_type": "events",
    "top_k": 0.25,
    "split": "test",
    "training_method": "continue",
    "subtopic": True,
    "use_predicted_topics": False,
    "segment_window": 512,
    "exact": False,

    # topic handling
    # NOTE(review): this looks like an absolute path on the repo author's machine -- confirm
    "topic_level": True,
    "predicted_topics_path": "/home/nlp/ariecattan/coreference/event_entity_coref_ecb_plus/data/external/document_clustering/predicted_topics",

    # input / output locations (relative paths) and checkpoint choice
    "data_folder": "data/ecb/mentions",
    "save_path": "models/pairwise_scorers",
    "model_path": "models/pairwise_scorers",
    "model_num": 9,
    "keep_singletons": False,

    # agglomerative clustering
    "threshold": 0.8,
    "linkage_type": "average",
}

# Serialise the dict as JSON; this overwrites the clustering config written for the dev run.
with open('/content/coref/configs/config_clustering_colab.json', "w") as config_file:
    config_file.write(json.dumps(config_clustering))

### Predict

In [None]:
# Run the repo's predict.py with the clustering config written above.
!python /content/coref/predict.py --config /content/coref/configs/config_clustering_colab.json

gpu_num = [
  0
]
bert_model = "roberta-large"
hidden_layer = 1024
dropout = 0.3
with_mention_width = true
with_head_attention = true
embedding_dimension = 20
max_mention_span = 10
use_gold_mentions = true
mention_type = "events"
top_k = 0.25
split = "test"
training_method = "continue"
subtopic = true
use_predicted_topics = false
segment_window = 512
exact = false
topic_level = true
predicted_topics_path = "/home/nlp/ariecattan/coreference/event_entity_coref_ecb_plus/data/external/document_clustering/predicted_topics"
data_folder = "data/ecb/mentions"
save_path = "models/pairwise_scorers"
model_path = "models/pairwise_scorers"
model_num = 9
keep_singletons = false
threshold = 0.8
linkage_type = "average"
Processing topic 45_1
Processing topic 45_0
Processing topic 42_0
Processing topic 42_1
Processing topic 38_0
Processing topic 38_1
Processing topic 44_1
Processing topic 44_0
Processing topic 43_1
Processing topic 43_0
Processing topic 39_0
Processing topic 39_1
Processing topic 37_1


### Evaluate

In [None]:
# Clone the CoVal scorer; its scorer.py is called in the next cell.
!git clone https://github.com/ns-moosavi/coval/

Cloning into 'coval'...
remote: Enumerating objects: 255, done.[K
remote: Total 255 (delta 0), reused 0 (delta 0), pack-reused 255[K
Receiving objects: 100% (255/255), 67.00 KiB | 5.58 MiB/s, done.
Resolving deltas: 100% (153/153), done.


In [None]:
# Score the predicted test clusters (second argument) against the gold file (first argument).
# NOTE(review): the prediction file name says model_8 while the config cell above
# sets "model_num" to 9 -- confirm which checkpoint is meant to be evaluated.
!python /content/coval/scorer.py /content/data/ecb/gold_singletons/test_events_topic_level.conll /content/models/pairwise_scorers/test_events_average_0.8_model_8_topic_level.conll

mentions   Recall: 94.90  Precision: 81.39  F1: 87.63
muc        Recall: 89.03  Precision: 78.55  F1: 83.46
bcub       Recall: 79.93  Precision: 64.82  F1: 71.59
ceafe      Recall: 69.36  Precision: 51.74  F1: 59.27
lea        Recall: 77.57  Precision: 62.17  F1: 69.02
CoNLL score: 71.44


In [None]:
from sklearn.cluster import AgglomerativeClustering
import argparse
import pyhocon
from transformers import AutoTokenizer, AutoModel
from itertools import product
import collections
from tqdm import tqdm

from conll import write_output_file
from models import SpanScorer, SimplePairWiseClassifier, SpanEmbedder
from utils import *
from model_utils import *



def init_models(config, device, model_num):
    """Create the three scoring modules and restore checkpoint ``model_num``.

    Each module is moved to ``device``, loaded from a state dict stored under
    ``config['model_path']`` and switched to eval mode.

    Returns:
        tuple: ``(span_repr, span_scorer, pairwise_scorer)``.
    """
    def _restore(module, name_template):
        # Checkpoints are named "<kind>_<model_num>" inside config['model_path'].
        checkpoint = os.path.join(config['model_path'], name_template.format(model_num))
        module.load_state_dict(torch.load(checkpoint, map_location=device))
        module.eval()
        return module

    span_repr = _restore(SpanEmbedder(config, device).to(device), "span_repr_{}")
    span_scorer = _restore(SpanScorer(config).to(device), "span_scorer_{}")
    pairwise_scorer = _restore(SimplePairWiseClassifier(config).to(device), "pairwise_scorer_{}")

    return span_repr, span_scorer, pairwise_scorer




def is_included(docs, starts, ends, i1, i2):
    """Return True when span ``i1`` lies inside span ``i2`` of the same document.

    A span counts as included in itself (the bounds are compared inclusively).
    """
    inner_doc, inner_start, inner_end = docs[i1], starts[i1], ends[i1]
    outer_doc, outer_start, outer_end = docs[i2], starts[i2], ends[i2]

    # Spans from different documents can never nest.
    if not (inner_doc == outer_doc):
        return False
    # The inner span must not start before the outer one ...
    if not (inner_start >= outer_start):
        return False
    # ... nor end after it.
    if not (inner_end <= outer_end):
        return False
    return True


def remove_nested_mentions(cluster_ids, doc_ids, starts, ends):
    """Drop every mention that is nested inside another mention of its own cluster.

    Args:
        cluster_ids: mapping ``cluster id -> list of mention indices``.
        doc_ids, starts, ends: per-mention document id and span bounds,
            indexed by those mention indices.

    Returns:
        tuple: ``(clusters, doc_ids, starts, ends)`` for the surviving mentions.
        ``clusters`` maps each cluster id to positions in the three new lists.
    """
    doc_arr = np.asarray(doc_ids)
    start_arr = np.asarray(starts)
    end_arr = np.asarray(ends)

    # One (cluster, doc, start, end) record per surviving mention.
    kept = []

    for cluster, members in cluster_ids.items():
        member_docs = doc_arr[members]
        member_starts = start_arr[members]
        member_ends = end_arr[members]
        size = len(members)

        for pos in range(size):
            # Every span is included in itself, so a count above one means the
            # span is contained in some other span of the same cluster.
            containers = sum(
                is_included(member_docs, member_starts, member_ends, pos, other)
                for other in range(size)
            )
            if containers > 1:
                continue
            kept.append((cluster, member_docs[pos], member_starts[pos], member_ends[pos]))

    # Re-index the clusters against the filtered mention lists.
    clusters = collections.defaultdict(list)
    for new_index, record in enumerate(kept):
        clusters[record[0]].append(new_index)

    new_docs_ids = [record[1] for record in kept]
    new_starts = [record[2] for record in kept]
    new_ends = [record[3] for record in kept]

    return clusters, new_docs_ids, new_starts, new_ends

if __name__ == '__main__':
    # Entry point: for each of the ten saved checkpoints, score every mention
    # pair of every topic, dump the raw scores to a pickle, cluster the
    # mentions at several distance thresholds and write one CoNLL file per
    # (checkpoint, threshold).
    import pickle  # hoisted out of the topic loop

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='configs/config_clustering.json')
    args = parser.parse_args()

    config = pyhocon.ConfigFactory.parse_file(args.config)
    print(pyhocon.HOCONConverter.convert(config, "hocon"))
    create_folder(config['save_path'])
    device = 'cuda:{}'.format(config['gpu_num'][0]) if torch.cuda.is_available() else 'cpu'

    # Load models and init clustering
    bert_model = AutoModel.from_pretrained(config['bert_model']).to(device)
    config['bert_hidden_size'] = bert_model.config.hidden_size

    bert_tokenizer = AutoTokenizer.from_pretrained(config['bert_model'])
    # The split is hard-coded to 'dev' here (config['split'] is not consulted).
    data = create_corpus(config, bert_tokenizer, 'dev')
    print(data)

    # Single source of truth for the thresholds. The clusterers, the
    # per-threshold accumulators and the file names below all derive from it.
    thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8]
    clustering = [
        AgglomerativeClustering(n_clusters=None, affinity='precomputed',
                                linkage=config['linkage_type'],
                                distance_threshold=x)
        for x in thresholds
    ]

    for num in range(10):
        print('Model {}'.format(num))
        span_repr, span_scorer, pairwise_scorer = init_models(config, device, num)

        # One accumulator per threshold: predicted cluster ids over all topics,
        # and the next free cluster id so ids stay unique across topics.
        clusters = [list() for _ in thresholds]
        max_ids = [0] * len(thresholds)
        threshold = {id: thresh for id, thresh in enumerate(thresholds)}

        doc_ids, sentence_ids, starts, ends = [], [], [], []

        for topic_num, topic in enumerate(data.topic_list):
            print('Processing topic {}'.format(topic))
            docs_embeddings, docs_length = pad_and_read_bert(data.topics_bert_tokens[topic_num], bert_model)
            span_meta_data, span_embeddings, num_of_tokens = get_all_candidate_from_topic(
                config, data, topic_num, docs_embeddings, docs_length)

            doc_id, sentence_id, start, end = span_meta_data
            start_end_embeddings, continuous_embeddings, width = span_embeddings
            width = width.to(device)

            labels = data.get_candidate_labels(doc_id, start, end)

            if config['use_gold_mentions']:
                # Keep the candidates whose label is non-zero.
                span_indices = torch.nonzero(labels).squeeze(1)
            else:
                with torch.no_grad():
                    span_emb = span_repr(start_end_embeddings, continuous_embeddings, width)
                    span_scores = span_scorer(span_emb)

                if config.exact:
                    span_indices = torch.where(span_scores > 0.5)[0]
                else:
                    k = int(config['top_k'] * num_of_tokens)
                    _, span_indices = torch.topk(span_scores.squeeze(1), k, sorted=False)
                # span_indices, _ = torch.sort(span_indices)

            number_of_mentions = len(span_indices)
            start_end_embeddings = start_end_embeddings[span_indices]
            continuous_embeddings = [continuous_embeddings[i] for i in span_indices]
            width = width[span_indices]
            torch.cuda.empty_cache()

            # Prepare all the pairs for the distance matrix
            # (full cartesian product, `first` varying slowest).
            first, second = zip(*list(product(range(len(span_indices)), repeat=2)))
            first = torch.tensor(first)
            second = torch.tensor(second)

            torch.cuda.empty_cache()
            all_scores = []
            with torch.no_grad():
                # Score the pairs in chunks of 10000.
                for i in range(0, len(first), 10000):
                    # end_max = min(i+100000, len(first))
                    end_max = i + 10000
                    first_idx, second_idx = first[i:end_max], second[i:end_max]
                    g1 = span_repr(start_end_embeddings[first_idx],
                                   [continuous_embeddings[k] for k in first_idx],
                                   width[first_idx])
                    g2 = span_repr(start_end_embeddings[second_idx],
                                   [continuous_embeddings[k] for k in second_idx],
                                   width[second_idx])
                    scores = pairwise_scorer(g1, g2)

                    torch.cuda.empty_cache()
                    if config['training_method'] in ('continue', 'e2e') and not config['use_gold_mentions']:
                        g1_score = span_scorer(g1)
                        g2_score = span_scorer(g2)
                        scores += g1_score + g2_score

                    scores = torch.sigmoid(scores)
                    all_scores.extend(scores.detach().cpu().squeeze(1))
                    torch.cuda.empty_cache()

            all_scores = torch.stack(all_scores)

            # Dump the raw per-topic inputs and scores for offline analysis.
            # NOTE(review): the file name does not include the model number,
            # so each checkpoint overwrites the previous one's dump -- confirm
            # this is intended.
            mylist = [data.mentions, data.topics_origin_tokens, span_meta_data, span_indices, first, second, all_scores, labels]
            with open('tt_{}_{}.pkl'.format(topic_num, topic), 'wb') as f:
                pickle.dump(mylist, f)

            # Affinity score to distance score
            pairwise_distances = 1 - all_scores.view(number_of_mentions, number_of_mentions).numpy()

            doc_ids.extend(doc_id[span_indices.cpu()])
            sentence_ids.extend(sentence_id[span_indices].tolist())
            starts.extend(start[span_indices].tolist())
            ends.extend(end[span_indices].tolist())

            for i, agglomerative in enumerate(clustering):
                predicted = agglomerative.fit(pairwise_distances)
                # Offset the labels so cluster ids do not collide across topics.
                predicted_clusters = predicted.labels_ + max_ids[i]
                max_ids[i] = max(predicted_clusters) + 1
                clusters[i].extend(predicted_clusters)

        for i, predicted in enumerate(clusters):
            print('Saving cluster for threshold {}'.format(threshold[i]))
            all_clusters = collections.defaultdict(list)
            for span_id, cluster_id in enumerate(predicted):
                all_clusters[cluster_id].append(span_id)

            if not config['use_gold_mentions']:
                all_clusters, new_doc_ids, new_starts, new_ends = remove_nested_mentions(all_clusters, doc_ids, starts, ends)
            else:
                new_doc_ids, new_starts, new_ends = doc_ids, starts, ends

            # removing singletons
            all_clusters = {cluster_id: mentions for cluster_id, mentions in all_clusters.items()
                            if len(mentions) > 1}

            # print('Saving conll file...')
            doc_name = 'dev_{}_model_{}_{}_{}'.format(
                config['mention_type'], num, config['linkage_type'], threshold[i])

            write_output_file(data.documents, all_clusters, new_doc_ids, new_starts, new_ends, config['save_path'], doc_name,
                              topic_level=config.topic_level, corpus_level=not config.topic_level)



In [52]:
import pandas as pd
import json

dir_path = '/content/drive/MyDrive/Coref-for-GPT/Data/ECB+/'
pre_dir = dir_path + 'reproduced/streamlining/'

# The export cell of this notebook writes these CSVs with index=False, so they
# have no 'Unnamed: 0' column. Older exports did. errors='ignore' accepts both
# instead of raising KeyError on the current format.
pairs = pd.read_csv(pre_dir + 'stream_pairs.csv').drop(columns=['Unnamed: 0'], errors='ignore')
spans = pd.read_csv(pre_dir + 'stream_spans.csv').drop(columns=['Unnamed: 0'], errors='ignore')

# entity_num restarts for every topic, so "<topic>_<entity_num>" is the join key.
spans['key'] = spans.topic.astype(str) + '_' + spans.entity_num.astype(str)
pairs['key1'] = pairs.topic.astype(str) + '_' + pairs.entity_num_1.astype(str)
pairs['key2'] = pairs.topic.astype(str) + '_' + pairs.entity_num_2.astype(str)

# Build the key-indexed lookup once and reuse it for all four mapped columns.
spans_by_key = spans.set_index('key')

pairs['doc_name_1'] = pairs.key1.map(spans_by_key['doc_name'])
pairs['doc_name_2'] = pairs.key2.map(spans_by_key['doc_name'])

pairs['sentence_num_1'] = pairs.key1.map(spans_by_key['sentence_num'])
pairs['sentence_num_2'] = pairs.key2.map(spans_by_key['sentence_num'])

# Keep each unordered pair once (entity_num_1 < entity_num_2), restricted to
# mentions of the same document whose sentence numbers differ by at most one
# in the forward direction.
df = pairs[(pairs.doc_name_1 == pairs.doc_name_2) &
           (pairs.entity_num_1 < pairs.entity_num_2) &
           (pairs.sentence_num_2 - pairs.sentence_num_1 <= 1)].reset_index(drop=True)

In [39]:
# This cell sits above the one that assigns `js`, so on a fresh
# "Restart & Run All" it used to hit a NameError. Load the gold file here
# so the cell runs on its own.
with open(dir_path + 'reproduced/gold/dev.json', 'rb') as gold_file:
    js = json.load(gold_file)

# Element 0 of each document entry is the document text, with sentences
# separated by the literal marker '[EOS]'. Split it into a list of sentences.
sentences = {key: js[key][0].split('[EOS]') for key in js}

In [53]:
# Use a context manager so the file handle is closed after loading
# (json.load(open(...)) left it open).
with open(dir_path + 'reproduced/gold/dev.json', 'rb') as gold_file:
    js = json.load(gold_file)

# Element 1 of each document entry is a list of mention dicts. Flatten them
# into (doc, sentence, first token id, last token id, mention text) rows.
all_tokens = []
for doc in js.keys():
    for tokens in js[doc][1]:
        all_tokens.append((doc,
                           tokens['sentence_id'],
                           min(tokens['tokens_ids']),
                           max(tokens['tokens_ids']),
                           tokens['tokens']))

all_mentions = pd.DataFrame(
    all_tokens,
    columns=['doc_name', 'sentence_num', 'start_token', 'end_token', 'mention'])

# Attach the mention text to each exported span. The default (inner) merge
# keeps only spans whose document/sentence/token bounds match a gold mention.
merged_spans = spans.merge(all_mentions, on=['doc_name', 'sentence_num', 'start_token', 'end_token'])

In [54]:
# Look up the mention text of both members of each pair by their span key.
mention_lookup = merged_spans.set_index('key')['mention']
for side in (1, 2):
    df['mention_{}'.format(side)] = df['key{}'.format(side)].map(mention_lookup)

# Drop the pairs for which a value is missing (e.g. no matching gold mention).
df = df.dropna().reset_index(drop=True)

In [55]:
# Preview the first pairs with their mention texts.
df.head()

Unnamed: 0,topic,entity_num_1,entity_num_2,label,pred_proba,key1,key2,doc_name_1,doc_name_2,sentence_num_1,sentence_num_2,mention_1,mention_2
0,35,0,1,0,0.003513,35_0,35_1,35_10ecb.xml,35_10ecb.xml,0,1,Jackson,earlier this week
1,35,0,2,1,0.049732,35_0,35_2,35_10ecb.xml,35_10ecb.xml,0,1,Jackson,he
2,35,1,2,0,0.001363,35_1,35_2,35_10ecb.xml,35_10ecb.xml,1,1,earlier this week,he
3,35,0,3,0,0.029616,35_0,35_3,35_10ecb.xml,35_10ecb.xml,0,1,Jackson,Chargers
4,35,1,3,0,0.002365,35_1,35_3,35_10ecb.xml,35_10ecb.xml,1,1,earlier this week,Chargers


In [56]:
def _sentence_pair(row):
    """Return [sentence of mention 1, sentence of mention 2] for one pair row."""
    first_sentence = sentences[row.doc_name_1][row.sentence_num_1]
    second_sentence = sentences[row.doc_name_2][row.sentence_num_2]
    return [first_sentence, second_sentence]


# One two-element list of sentence strings per pair.
df['sentence_text'] = df.apply(_sentence_pair, axis=1)

In [57]:
# Hard 0/1 prediction: 1 when the predicted probability is at least 0.5.
df['pred'] = 1 * (df.pred_proba >= 0.5)

In [60]:
# Spot-check the two sentences attached to the first pair.
df.sentence_text[0]

["Chargers' Jackson arrested on suspicion of DUI ",
 " But after being arrested for suspicion DUI earlier this week, he has bigger worries than his role in the Chargers' offense. "]

In [61]:
# Persist the enriched pair table to Drive (index column omitted).
df.to_csv('/content/drive/MyDrive/Coref-for-GPT/Results/Streamlining/pairs_with_mentions.csv', index=False)

In [18]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [47]:
# Accuracy of the thresholded predictions against the pair labels.
accuracy_score(df.label, df.pred)

0.9352568613652358

In [48]:
# Precision of the thresholded predictions against the pair labels.
precision_score(df.label, df.pred)

0.8714285714285714

In [49]:
# Recall of the thresholded predictions against the pair labels.
recall_score(df.label, df.pred)

0.18597560975609756

In [50]:
# F1 of the thresholded predictions against the pair labels.
f1_score(df.label, df.pred)

0.3065326633165829