In [3]:
import json

with open('data/cleaned_economics_data.json', 'r') as f:
    articles = json.load(f)

In [20]:
len(articles['data'])

1186

## Data Crawling

In [None]:
from newsplease import NewsPlease

def crawl_data(url):
    article = NewsPlease.from_url(url)
    return article

In [5]:
from newsplease import NewsPlease
url = 'https://www.theguardian.com/environment/2025/nov/29/climate-crisis-depleting-europe-groundwater-reserves-analysis'
article = NewsPlease.from_url(url)
# print(article.title)

In [10]:
article.title

'Revealed: Europe’s water reserves drying up due to climate breakdown'

In [14]:
len(article.maintext.split())

874

In [11]:
import json

with open("article.json", "w") as file:
    json.dump(article.get_serializable_dict(), file)

## Data Cleaning

In [None]:
def clean_article(article):
    cleaned_article = {
        'title': article.title,
        'authors': article.authors,
        'date_publish': article.date_publish,
        'maintext': article.maintext,
        'url': article.url
    }
    return cleaned_article

# for i, item in enumerate(articles['data']):
#     articles['data'][i] = clean_article(item)

In [23]:
import json

with open('data/cleaned_economics_data.json', 'r') as f:
    articles = json.load(f)

## Quotation Extraction

In [4]:
from src.cofenet.model.mod_bert import ModelBert_Cofe
import torch

model_cofe = ModelBert_Cofe()
model_name = "./src/cofenet/checkpoint/model_6000.bin"
state_dict = torch.load(model_name, map_location=torch.device('cpu'))
model_cofe.load_state_dict(state_dict)

  from .autonotebook import tqdm as notebook_tqdm


<All keys matched successfully>

In [24]:
import torch
from src.cofenet.utils.utils import *
from src.cofenet.utils.loader import SingleDataLoader
from src.cofenet.utils.dataset import DatasetBert
from torch.utils.data import SequentialSampler

In [25]:
def tgidss2tgstrss(tgidss, tags_file_path ,lengths=None):
    tgstrss = []
    map_tg2tgid = {tag: idx for idx, tag in enumerate(load_text_file_by_line(tags_file_path))}
    map_tgid2tg = {idx: tag for tag, idx in map_tg2tgid.items()}
    
    if lengths is None:
        for tgids in tgidss:
            tgstrss.append([map_tgid2tg[tgid] for tgid in tgids])
    else:
        for tgids, length in zip(tgidss, lengths):
            tgstrss.append([map_tgid2tg[tgid] for tgid in tgids[:length]])
    return tgstrss


In [26]:
import os
import json
def read_data(file_path):
    if not os.path.exists(file_path):
        raise Exception('data file_path is not exist')

    with open(file_path, 'r', encoding='utf-8') as file_object:  
        data = json.load(file_object)
    return data


In [27]:
import re
import uuid
def doc_text_preprocessing(doc):
    global split_pargraphs
    
    def clean_text(txt):
        txt = re.sub(r'(\“|\”)', "\"", txt)
        txt = re.sub(r'[^a-zA-Z0-9 \.\'\"\,\-\(\)\’\$\#\@]', "", txt)
        txt = re.sub(r'(\( )', "(", txt)
        txt = re.sub(r'( \))', ")", txt)
        txt = re.sub(r'( \.)', ".", txt)
        txt = re.sub(r'( \,)', ",", txt)
        txt = re.sub(r'(.)\.(.)', r'\1. \2', txt)
        txt = re.sub(r' +', " ", txt)
        txt = re.sub(r'([a-z])\.([a-z])', r'\1 \2', txt)
        return txt.strip()

    # clean text
    doc['maintext'] = clean_text(doc['maintext'])

    # check if there is no paragraphs to start split maintext
    if len(doc['paragraphs']) == 0:
       doc['paragraphs'] = split_pargraphs(doc['maintext'])

    doc['paragraphs']  = list(filter(lambda x: len(x) > 1, map(lambda txt: clean_text(txt), doc['paragraphs'])))

    # identify doc with id
    doc['ID'] = str(uuid.uuid3(uuid.NAMESPACE_URL, doc['url']))
    return doc



In [28]:
import traceback

def handel_error(fun, doc):
    try:
        return fun(doc)
    except Exception as e:
        traceback.print_exc()
        return None


data = read_data('./data/cleaned_economics_data.json')['data']
docs = map(lambda doc: handel_error(doc_text_preprocessing, doc), data)
docs = list(filter(lambda x: x != None, docs))

In [29]:
len(docs)

1186

In [30]:
import torch
from src.cofenet.utils.utils import *
from src.cofenet.utils.loader import SingleDataLoader
from src.cofenet.utils.dataset import DatasetBert
from torch.utils.data import SequentialSampler


def extract_quotes(infer_str:list, model_cofe) -> list:

    file_path = read_write_str(infer_str, "./src/cofenet/infer_file.txt")
    dataset = DatasetBert(file_path)
    tag_file_path = './src/cofenet/utils/tag.txt'

    dataloder = SingleDataLoader(dataset=dataset,
                                batch_size=32,
                                sampler=SequentialSampler(dataset),
                                collate_fn=dataset.collate)
    preds = []
    for batch_data in dataloder:
        model_cofe.eval()
        with torch.no_grad():
            batch_preds = model_cofe.predict(batch_data)
            
            batch_pred_strs = tgidss2tgstrss(
                batch_preds.data.cpu().numpy() if not isinstance(batch_preds, list) else batch_preds, tag_file_path,
                batch_data['lengths'].cpu().numpy())

            preds.extend(batch_pred_strs)

    os.remove("./src/cofenet/infer_file.txt")
    return preds




In [31]:
def quote_cue_source_extraction(doc):

    def clean_text(txt):
        txt = txt.lower()
        # txt = re.sub(r'[^a-zA-Z0-9 \.\$\#\@]', "", txt)
        # txt = re.sub(r' +', " ", txt)
        # txt = re.sub(r'(\. |\.$)', " ", txt)
        # txt = re.sub(r'([a-z])\.([a-z])', r'\1 \2', txt)
        return txt.strip()

    paragraphs = doc['paragraphs']

    # extract Cue, Source, and Quotes
    predict_entities = extract_quotes(paragraphs, model_cofe)
    return doc, predict_entities

In [33]:
ents = []
for doc in docs:
    try:
        doc, predict_entities = quote_cue_source_extraction(doc)
        ents.append(predict_entities)
    except Exception as e:
        print(f"Error processing document {doc['title']}: {e}")
        ents.append([])

100%|██████████| 13/13 [00:00<00:00, 277.02it/s]
100%|██████████| 12/12 [00:00<00:00, 192.17it/s]
100%|██████████| 7/7 [00:00<00:00, 234.77it/s]
100%|██████████| 10/10 [00:00<00:00, 193.48it/s]
100%|██████████| 10/10 [00:00<00:00, 208.66it/s]


Error processing document Ethiopian forces have recaptured key towns on the road to Tigray: The expanded size of the tensor (518) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [10, 518].  Tensor sizes: [1, 512]


100%|██████████| 8/8 [00:00<00:00, 253.74it/s]
100%|██████████| 16/16 [00:00<00:00, 328.13it/s]
100%|██████████| 8/8 [00:00<00:00, 122.32it/s]


Error processing document Congo’s president has not kept his word: The expanded size of the tensor (672) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [8, 672].  Tensor sizes: [1, 512]


100%|██████████| 7/7 [00:00<00:00, 180.86it/s]
100%|██████████| 5/5 [00:00<00:00, 221.54it/s]
100%|██████████| 6/6 [00:00<00:00, 176.07it/s]
100%|██████████| 10/10 [00:00<00:00, 227.83it/s]
100%|██████████| 14/14 [00:00<00:00, 318.25it/s]
100%|██████████| 9/9 [00:00<00:00, 164.25it/s]
100%|██████████| 15/15 [00:00<00:00, 216.48it/s]
100%|██████████| 4/4 [00:00<00:00, 183.05it/s]
100%|██████████| 9/9 [00:00<00:00, 238.83it/s]
100%|██████████| 8/8 [00:00<00:00, 330.62it/s]
100%|██████████| 8/8 [00:00<00:00, 258.66it/s]
100%|██████████| 9/9 [00:00<00:00, 273.20it/s]
100%|██████████| 8/8 [00:00<00:00, 280.59it/s]
100%|██████████| 8/8 [00:00<00:00, 257.06it/s]
100%|██████████| 12/12 [00:00<00:00, 311.65it/s]
100%|██████████| 13/13 [00:00<00:00, 191.43it/s]
100%|██████████| 9/9 [00:00<00:00, 234.81it/s]
100%|██████████| 6/6 [00:00<00:00, 301.72it/s]
100%|██████████| 6/6 [00:00<00:00, 140.01it/s]
100%|██████████| 8/8 [00:00<00:00, 198.55it/s]
100%|██████████| 13/13 [00:00<00:00, 234.74it/s]
1

Error processing document Sudan’s democratic transition is upended by a second coup in two years: The expanded size of the tensor (532) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [8, 532].  Tensor sizes: [1, 512]


100%|██████████| 12/12 [00:00<00:00, 334.98it/s]
100%|██████████| 9/9 [00:00<00:00, 245.65it/s]
100%|██████████| 6/6 [00:00<00:00, 255.39it/s]
100%|██████████| 34/34 [00:00<00:00, 260.07it/s]


Error processing document How kidnappers, zealots and rebels are making Nigeria ungovernable: The expanded size of the tensor (527) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [32, 527].  Tensor sizes: [1, 512]


100%|██████████| 6/6 [00:00<00:00, 259.49it/s]
100%|██████████| 8/8 [00:00<00:00, 305.94it/s]
100%|██████████| 11/11 [00:00<00:00, 233.90it/s]
100%|██████████| 9/9 [00:00<00:00, 224.24it/s]
100%|██████████| 10/10 [00:00<00:00, 243.76it/s]
100%|██████████| 16/16 [00:00<00:00, 236.99it/s]
100%|██████████| 10/10 [00:00<00:00, 208.31it/s]
100%|██████████| 5/5 [00:00<00:00, 200.79it/s]
100%|██████████| 7/7 [00:00<00:00, 185.55it/s]
100%|██████████| 9/9 [00:00<00:00, 289.34it/s]
100%|██████████| 9/9 [00:00<00:00, 237.82it/s]
100%|██████████| 11/11 [00:00<00:00, 239.34it/s]
100%|██████████| 7/7 [00:00<00:00, 196.14it/s]
100%|██████████| 8/8 [00:00<00:00, 217.58it/s]
100%|██████████| 9/9 [00:00<00:00, 175.82it/s]
100%|██████████| 9/9 [00:00<00:00, 264.55it/s]
100%|██████████| 11/11 [00:00<00:00, 225.73it/s]
100%|██████████| 10/10 [00:00<00:00, 286.34it/s]
100%|██████████| 13/13 [00:00<00:00, 247.49it/s]
100%|██████████| 4/4 [00:00<00:00, 234.67it/s]
100%|██████████| 12/12 [00:00<00:00, 232.45i

Error processing document Alpha Condé, the president of Guinea, is ousted in a coup: The expanded size of the tensor (540) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [10, 540].  Tensor sizes: [1, 512]


100%|██████████| 10/10 [00:00<00:00, 328.94it/s]
100%|██████████| 15/15 [00:00<00:00, 263.47it/s]
100%|██████████| 11/11 [00:00<00:00, 256.04it/s]
100%|██████████| 22/22 [00:00<00:00, 290.52it/s]
100%|██████████| 8/8 [00:00<00:00, 209.36it/s]
100%|██████████| 8/8 [00:00<00:00, 179.52it/s]
100%|██████████| 7/7 [00:00<00:00, 209.34it/s]
100%|██████████| 14/14 [00:00<00:00, 268.89it/s]
100%|██████████| 8/8 [00:00<00:00, 248.82it/s]
100%|██████████| 11/11 [00:00<00:00, 300.70it/s]
100%|██████████| 6/6 [00:00<00:00, 245.04it/s]
100%|██████████| 11/11 [00:00<00:00, 321.70it/s]
100%|██████████| 13/13 [00:00<00:00, 257.13it/s]
100%|██████████| 9/9 [00:00<00:00, 238.56it/s]
100%|██████████| 12/12 [00:00<00:00, 345.20it/s]
100%|██████████| 11/11 [00:00<00:00, 198.95it/s]
100%|██████████| 8/8 [00:00<00:00, 229.93it/s]
100%|██████████| 6/6 [00:00<00:00, 181.34it/s]
100%|██████████| 12/12 [00:00<00:00, 225.19it/s]
100%|██████████| 9/9 [00:00<00:00, 136.18it/s]
100%|██████████| 18/18 [00:00<00:00, 2

Error processing document Africa’s latest wave of covid-19 could be its worst yet: The expanded size of the tensor (519) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [12, 519].  Tensor sizes: [1, 512]


100%|██████████| 10/10 [00:00<00:00, 240.93it/s]
100%|██████████| 13/13 [00:00<00:00, 187.01it/s]
100%|██████████| 11/11 [00:00<00:00, 281.86it/s]
100%|██████████| 4/4 [00:00<00:00, 421.87it/s]
100%|██████████| 8/8 [00:00<00:00, 326.14it/s]
100%|██████████| 7/7 [00:00<00:00, 318.74it/s]
100%|██████████| 12/12 [00:00<00:00, 189.94it/s]


KeyboardInterrupt: 

In [22]:
len(predict_entities)

14

In [None]:
predict_entities

In [20]:
len(doc['paragraphs'])

14

In [27]:
len(doc['paragraphs'][0].split())

116

In [26]:
doc['paragraphs'][0]

'I n the central marketplace of Gedaref, eastern Sudan, Mohammed Siddig counts the cost of the past year’s turmoil. The price of fuel, which he needs to run his farm near the border with Ethiopia, is up by about 300. School fees, which he pays for four of his children, have increased by 400. Yet just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut their subsidies. "It’s totally unprofitable," Mr Siddig laments. His sesame and sorghum harvest recently fetched about half what it had the previous year. Now he is in debt, which he underscores by slapping onto the counter a bag of chickpeas that he is buying on credit.'

In [25]:
predict_entities[0]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-source',
 'I-source',
 'B-cue',
 'B-content',
 'I-content',
 'I-content',
 'I-content',
 'I-content',
 'I-content',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-content',
 'I-content',
 'I-content',
 'B-source',
 'I-source',
 'B-cue',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [24]:
len(predict_entities[0])

116

In [17]:
import pandas as pd
columns = ['Token', 'labels']
data = []
for token, label in zip(doc['paragraphs'][0].split(), predict_entities[0]):
    data.append([token, label])
df = pd.DataFrame(data, columns=columns)
df.head(20)


Unnamed: 0,Token,labels
0,I,O
1,n,O
2,the,O
3,central,O
4,marketplace,O
5,of,O
6,"Gedaref,",O
7,eastern,O
8,"Sudan,",O
9,Mohammed,B-source


In [38]:
from spacy.tokens import Doc, Span
import spacy
from typing import List
from fastcoref import spacy_component

nlp = spacy.load("en_core_web_trf")
nlp.add_pipe("fastcoref")

def coref_resolver(txt, bio):
    global nlp

    def get_span_noun_indices(doc: Doc, cluster: List[List[int]]) -> List[int]:
        spans = [doc[span[0]:span[1]+1] for span in cluster]
        spans_pos = [[token.pos_ for token in span] for span in spans]
        span_noun_indices = [i for i, span_pos in enumerate(spans_pos)
            if any(pos in span_pos for pos in ['NOUN', 'PROPN'])]
        return span_noun_indices

    def get_cluster_head(doc: Doc, cluster: List[List[int]], noun_indices: List[int]):
        head_idx = noun_indices[0]
        head_start, head_end = cluster[head_idx]
        head_span = doc[head_start:head_end+1]
        return head_span, [head_start, head_end]

    def is_containing_other_spans(span: List[int], all_spans: List[List[int]]):
        return any([s[0] >= span[0] and s[1] <= span[1] and s != span for s in all_spans])

    def improved_replace_corefs(document, clusters):
        all_spans = [span for cluster in clusters for span in cluster]
        coref_results = []
        for cluster in clusters:
            noun_indices = get_span_noun_indices(document, cluster)
            if noun_indices:
                mention_span, mention = get_cluster_head(document, cluster, noun_indices)
                for coref in cluster:
                    if coref != mention and not is_containing_other_spans(coref, all_spans):
                        coref_results.append({"coref_pos": coref, "refer": mention_span.text})
        return coref_results

    try:
        doc = nlp(txt)
        coref_clusters = doc._.coref_clusters
        clusters = []
        # import ipdb; ipdb.set_trace()
        for cluster in coref_clusters:
            spans = []
            
            for mention in cluster:
                start = mention[0]
                end = mention[1] - 1  # match AllenNLP indexing
                spans.append([start, end])
            clusters.append(spans)
    except Exception as e:
        print("*"*20)
        print(f"Error processing: {txt}")
        print(e)
        print("*"*20)
        return []

    # build spaCy doc with BIO entities
    words = txt.split(' ')
    spaces = [True] * len(words)
    doc_ = Doc(nlp.vocab, words=words, spaces=spaces, ents=bio)
    doc = nlp(doc_)
    coref_results = improved_replace_corefs(doc, clusters)
    return coref_results


11/27/2025 23:07:57 - INFO - 	 missing_keys: []
11/27/2025 23:07:57 - INFO - 	 unexpected_keys: []
11/27/2025 23:07:57 - INFO - 	 mismatched_keys: []
11/27/2025 23:07:57 - INFO - 	 error_msgs: []
11/27/2025 23:07:57 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


In [39]:
coref_results = coref_resolver(doc['paragraphs'][0], predict_entities[0])

11/27/2025 23:08:06 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 112.80 examples/s]
11/27/2025 23:08:11 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  2.85it/s]
11/27/2025 23:08:12 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 137.14 examples/s]
11/27/2025 23:08:17 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  5.47it/s]


In [40]:
coref_results

[{'coref_pos': [140, 141],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [156, 158],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [235, 236],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [255, 257],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [439, 447],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [458, 460],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [552, 553],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [573, 574],


In [None]:
coref_results[0]['refer']

In [None]:
# def get_polarity(texts: list):
#     global polarity_model
#     return polarity_model(texts)



def enhance_source(span, enhance_if_large_than=4):
    global nlp
    position = [span.start, span.end]
    doc = nlp(Doc(nlp.vocab, words=span.text.split(' ')))
    if len(doc) <= enhance_if_large_than: 
        return span.text, position

    # get first person entity
    ent = list(filter(lambda x: x.label_ == 'PERSON', doc.ents))[0]

    left = []
    for i in range(1,6):
        tok = doc[ent.start-i]
        if ent.start-i < 0 or tok.pos_ not in {"PROPN", "PRON"}: break
        left.append(tok.text)

    right = []
    for i in range(1,6):
        if ent.start+i > len(doc)-1 or tok.pos_ not in {"PROPN", "PRON"}: break
        tok = doc[ent.start+i]
        right.append(tok.text)


    local_pos = [ent.start - len(left), ent.end + len(right)]
    position[0] = position[0] + local_pos[0]
    position[1] = position[1] - (len(doc) - local_pos[1])
    return doc[local_pos[0]: local_pos[1]].text.strip(), position



def entity_linking(paragraphs, bios):
    global nlp

    linked_entities = []
    for i, content in enumerate(zip(paragraphs, bios)):
        local_linked_entities = []
        # decompress the tuple
        paragraph, bio = content

        # the previous pargraph preparation
        add_words, add_bio = [], []
        # in case of first paragraph will ignore it, becouse there is no pargraphs before first one.
        if i != 0:
            # split to words
            add_words = paragraphs[i-1].split(' ')
            add_bio = bios[i-1]

        # split to words and combine the previous paragraph with the current one.
        words = add_words + paragraph.split(' ')
        # prepare the spaces 
        spaces = [True]*len(words)
        # combine the previous bio with the current one.
        bio_ = add_bio + bio 

        # create Doc with its entities
        doc_ = Doc(nlp.vocab, words=words, spaces=spaces, ents=bio_)
        # feed the doc to default spacy pipeline to get the dependency tree and POS tags
        doc = nlp(doc_)

        # assigne the new doc ents with our entites.
        doc.ents = doc_.ents

        cues = list(filter(lambda ent: ent.label_ == 'cue', doc.ents)) # get list of cue-verb entities
        sources = list(filter(lambda ent: ent.label_ == 'source', doc.ents)) # get list of source entities
        contents = list(filter(lambda ent: ent.label_ == 'content', doc.ents)) # get list of content entities

        # loop on each cue
        for cue in cues:
            # get only the verb word from cue, becaues cue and has many words
            verb = None
            verbs = list(filter(lambda tok: tok.pos_ =='VERB', cue))
            if len(verbs) > 0:
                verb = verbs[0] # get the first one

            # get the source of cue based on POS & dependency tree
            try:
                source = None
                # check all verb's children, if any one of them is labeled as a source entity.
                source_part = next((child for child in verb.children if child.ent_type_ == 'source'), None)
                # in case if no verb's children exist as source entity, look at the head "Conj" 
                if source_part == None:
                    temp_verb = verb
                    # loop untill get the source
                    out_ = 0
                    while temp_verb.dep_ != 'ROOT' or temp_verb.pos_ != 'VERB':
                        temp_verb = temp_verb.head
                        source_part = next((child for child in temp_verb.children if child.ent_type_ == 'source'), None)
                        if out_ > 5: break
                        out_+=1
                
                # get the original entity of source_part 
                for ent in sources:
                    if source_part.i >= ent.start and source_part.i <= ent.end:
                        source = ent
                        break
            except:
                pass

            try:
                quote = None
                # check all verb's children, if any one of them is labeled as a content entity.
                qoute_part = next((child for child in verb.children if child.ent_type_ == 'content'), None)
                if qoute_part == None and verb.head.ent_type_ == 'content':
                    qoute_part = verb.head

                # get the original entity of qoute_part 
                for ent in contents:
                    if qoute_part.i >= ent.start and qoute_part.i <= ent.end:
                        quote = ent
                        break
            except:
                pass

            try:
                # to get only quotes from the current paragraph, and not include quotes from the previous one.
                if quote[0].i >= len(add_words):
                    enhanced_source, position = enhance_source(source)
                    obj = {"Speaker": enhanced_source,
                           "Speaker_position": position, # useing in corfrence resolution 
                           "Cue": verb.text,
                           "Quote": quote.text,
                        #    "Quote_polarity": get_polarity([quote.text])[0],
                        #    "Quote_summarization": summerizer_model.predict(quote.text)[0] if len(quote.text.split(' ')) > 20 else quote.text
                    }
                    local_linked_entities.append(obj)
            except:
                pass
        
        linked_entities.append(local_linked_entities)
    return linked_entities



In [82]:
paragraphs = [doc['paragraphs'][0]] 
linked_entities = entity_linking(paragraphs, [predict_entities[0]])
linked_entities

11/27/2025 23:40:49 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 112.90 examples/s]
11/27/2025 23:40:54 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  7.85it/s]
11/27/2025 23:40:54 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 59.10 examples/s]
11/27/2025 23:40:58 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  8.48it/s]


[[{'Speaker': 'Mohammed Siddig',
   'Speaker_position': [9, 11],
   'Cue': 'counts',
   'Quote': 'the cost of the past year’s'}]]

In [77]:
linked_entities[0][0].keys()

dict_keys(['Speaker', 'Speaker_position', 'Cue', 'Quote'])

In [47]:
doc['paragraphs'][0]

'I n the central marketplace of Gedaref, eastern Sudan, Mohammed Siddig counts the cost of the past year’s turmoil. The price of fuel, which he needs to run his farm near the border with Ethiopia, is up by about 300. School fees, which he pays for four of his children, have increased by 400. Yet just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut their subsidies. "It’s totally unprofitable," Mr Siddig laments. His sesame and sorghum harvest recently fetched about half what it had the previous year. Now he is in debt, which he underscores by slapping onto the counter a bag of chickpeas that he is buying on credit.'

In [46]:
linked_entities[0][0]

{'Speaker': 'Mohammed Siddig',
 'Speaker_position': [9, 11],
 'Cue': 'counts',
 'Quote': 'the cost of the past year’s'}

In [50]:
coref_results

[{'coref_pos': [140, 141],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [156, 158],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [235, 236],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [255, 257],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [439, 447],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [458, 460],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [552, 553],
  'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'},
 {'coref_pos': [573, 574],


In [None]:
def apply_coref_on_linked_entities(linking_out, coref_out):
    linking_qouts = []  
    for paragraph_links, coref_links in zip(linking_out, coref_out):
        local_linkes = []
        # import ipdb; ipdb.set_trace()
        for quote in paragraph_links:
            print(coref_links)
            print(quote)
            source_start, source_end = quote['Speaker_position'][0], quote['Speaker_position'][1]-1  

            new_source = list(filter(lambda x: x['coref_pos'][0] <= source_start and x['coref_pos'][1] >= source_end , coref_links))
            new_link = {k:v for k, v in quote.items() if k != 'Speaker_position'}
            if len(new_source) > 0:
                new_link['Speaker'] = new_source[0]['refer']
            
            local_linkes.append(new_link)
         
        linking_qouts.append(local_linkes)
    return linking_qouts


In [85]:
linked_entities = apply_coref_on_linked_entities([linked_entities[0]], [[coref_results[0]]])

[{'coref_pos': [140, 141], 'refer': 'just as unrest at nearby Port Sudan hurt farmers’ exports, the state-owned agricultural bank cut'}]
{'Speaker': 'Mohammed Siddig', 'Speaker_position': [9, 11], 'Cue': 'counts', 'Quote': 'the cost of the past year’s'}


In [86]:
linked_entities

[[{'Speaker': 'Mohammed Siddig',
   'Cue': 'counts',
   'Quote': 'the cost of the past year’s'}]]

## Vector Database

In [16]:
import chromadb
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
collection = client.create_collection("test-documents")

# Add docs to the collection. Can also update and delete. Row-based API coming soon!
collection.add(
    documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
    metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
    ids=["doc1", "doc2"], # unique for each doc
)

# Query/search 2 most similar results. You can also .get by id
results = collection.query(
    query_texts=["This is a query document"],
    n_results=2,
    # where={"metadata_field": "is_equal_to_this"}, # optional filter
    # where_document={"$contains":"search_string"}  # optional filter
)

[0;93m2025-11-29 12:51:04.152505002 [W:onnxruntime:Default, device_discovery.cc:164 DiscoverDevicesForPlatform] GPU device discovery failed: device_discovery.cc:89 ReadFileContents Failed to open file: "/sys/class/drm/card3/device/vendor"[m
/home/ahmed/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:26<00:00, 3.18MiB/s]


In [17]:
results

{'ids': [['doc1', 'doc2']],
 'embeddings': None,
 'documents': [['This is document1', 'This is document2']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'source': 'notion'}, {'source': 'google-docs'}]],
 'distances': [[0.9026353359222412, 1.035815954208374]]}