In [None]:
# Install the Python dependencies, fetch the spaCy transformer model and clone
# the project repository whose `src.cofenet` package is imported further down.
# NOTE(review): `git clone` creates an `npat/` directory by default, while later
# cells import `src.cofenet...` and read `./src/cofenet/...` relative to the
# working directory -- presumably the notebook is meant to run from inside the
# clone; confirm.
!pip install fastcoref gdown torch transformers
!python -m spacy download en_core_web_trf
!pip install spacy[transformers]
!git clone https://github.com/Ahmed-Khaled-Saleh/npat.git

In [None]:
import gdown
import os
# Google Drive share link of the cleaned economics articles (JSON).
url_json = "https://drive.google.com/file/d/18A78dkzkfgTTPVjvsueDmIRxKUx4YDDX/view?usp=sharing"
# Make sure the target directory exists; safe to re-run thanks to exist_ok.
os.makedirs("./data", exist_ok= True)
# Local path the dataset is written to; the following cell reads it back.
output = "data/cleaned_economics_data.json"
# NOTE(review): fuzzy=True is presumably needed because the URL is a "view"
# share link rather than a direct download link -- see the gdown docs.
gdown.download(url_json, output= output, fuzzy= True)

In [None]:
import json
# Load the downloaded dataset. The encoding is given explicitly so the result
# does not depend on the platform's default locale (read_data() in this
# notebook opens the same file as UTF-8 as well).
with open('./data/cleaned_economics_data.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)

In [3]:
# Number of entries under the 'data' key of the loaded JSON.
len(articles['data'])

1186

## Quotation Extraction

In [None]:
# Google Drive share link of the CofeNet checkpoint.
checkpoint = "https://drive.google.com/file/d/11yCgOGHXJuT3-7sENdRR2Zeu8r3pR5cK/view?usp=drive_link"
model_path = os.path.join(".","src/cofenet/checkpoint/model_6000.bin")
# Create the checkpoint directory first (mirrors what is done for ./data);
# without it the download has nowhere to write on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
gdown.download(checkpoint, output= model_path, fuzzy=True)

In [None]:
from src.cofenet.model.mod_bert import ModelBert_Cofe
import torch

# Instantiate the CofeNet BERT model with its default constructor arguments.
model_cofe = ModelBert_Cofe()
# Load the downloaded weights onto the CPU regardless of where they were saved.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
state_dict = torch.load(model_path, map_location=torch.device('cpu'))
# Copy the weights into the model instance.
model_cofe.load_state_dict(state_dict)

In [24]:
import torch
from src.cofenet.utils.utils import *
from src.cofenet.utils.loader import SingleDataLoader
from src.cofenet.utils.dataset import DatasetBert
from torch.utils.data import SequentialSampler

In [4]:
def tgidss2tgstrss(tgidss, tags_file_path ,lengths=None):
    """Decode sequences of tag ids into sequences of tag strings.

    Args:
        tgidss: iterable of tag-id sequences.
        tags_file_path: path handed to ``load_text_file_by_line``; a tag's id
            is its position in the sequence that helper returns.
        lengths: optional iterable with one length per sequence. When given,
            only the first ``length`` ids of each sequence are decoded.

    Returns:
        A list of lists of tag strings, one inner list per input sequence.

    Raises:
        KeyError: if an id has no entry in the tag vocabulary.
    """
    # Build tag -> id first (a tag listed twice keeps only its last position),
    # then invert that mapping to obtain the id -> tag lookup.
    tag_to_id = {}
    for position, tag_name in enumerate(load_text_file_by_line(tags_file_path)):
        tag_to_id[tag_name] = position
    id_to_tag = {position: tag_name for tag_name, position in tag_to_id.items()}

    if lengths is None:
        return [[id_to_tag[tag_id] for tag_id in row] for row in tgidss]

    return [
        [id_to_tag[tag_id] for tag_id in row[:keep]]
        for row, keep in zip(tgidss, lengths)
    ]


In [5]:
import os
import json
def read_data(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return its content.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The deserialised JSON document.

    Raises:
        Exception: if nothing exists at ``file_path``.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    raise Exception('data file_path is not exist')


In [6]:
import re
import uuid
def doc_text_preprocessing(doc):
    """Normalise an article's text fields in place and attach a stable ID.

    Args:
        doc: article dict providing the keys 'maintext' (str),
            'paragraphs' (list of str) and 'url' (str).

    Returns:
        The same ``doc`` object with 'maintext' and 'paragraphs' cleaned,
        paragraphs of length <= 1 removed, and an 'ID' key holding the UUID3
        (URL namespace) of ``doc['url']`` as a string.
    """
    # Ordered (pattern, replacement) rules. Order matters: each rule runs on
    # the output of the previous one.
    rewrite_rules = (
        (r'(\“|\”)', "\""),                                   # curly -> straight double quotes
        (r'[^a-zA-Z0-9 \.\'\"\,\-\(\)\’\$\#\@]', ""),         # drop characters outside the whitelist
        (r'(\( )', "("),                                      # no space after "("
        (r'( \))', ")"),                                      # no space before ")"
        (r'( \.)', "."),                                      # no space before "."
        (r'( \,)', ","),                                      # no space before ","
        (r'(.)\.(.)', r'\1. \2'),                             # add a space after a period that has text on both sides
        (r' +', " "),                                         # collapse runs of spaces
        (r'([a-z])\.([a-z])', r'\1 \2'),                      # period still glued between lowercase letters -> space
    )

    def clean_text(txt):
        for pattern, replacement in rewrite_rules:
            txt = re.sub(pattern, replacement, txt)
        return txt.strip()

    doc['maintext'] = clean_text(doc['maintext'])

    # Articles that arrive without paragraphs get them from the cleaned main text.
    # NOTE(review): `split_pargraphs` is not defined in the visible notebook
    # cells; this branch raises NameError unless it is provided elsewhere.
    if not len(doc['paragraphs']):
        doc['paragraphs'] = split_pargraphs(doc['maintext'])

    cleaned_paragraphs = [clean_text(paragraph) for paragraph in doc['paragraphs']]
    doc['paragraphs'] = [paragraph for paragraph in cleaned_paragraphs if len(paragraph) > 1]

    # Deterministic identifier derived from the article URL.
    doc['ID'] = str(uuid.uuid3(uuid.NAMESPACE_URL, doc['url']))
    return doc



In [None]:
import traceback

def handel_error(fun, doc):
    """Call ``fun(doc)``; on any ``Exception`` print the traceback and return None.

    Args:
        fun: callable taking a single argument.
        doc: the argument passed to ``fun``.

    Returns:
        Whatever ``fun(doc)`` returns, or ``None`` if it raised.
    """
    try:
        outcome = fun(doc)
    except Exception:
        # Report the failure but let the caller carry on with the next item.
        traceback.print_exc()
        outcome = None
    return outcome


# Alternative input: read_data('./data/cleaned_economics_data.json')['data']
# Pre-process every article; articles whose pre-processing raised come back as
# None from handel_error and are dropped. `is not None` is the idiomatic
# identity test, and a single comprehension avoids rebinding `docs` from a lazy
# map object to a list.
docs = [
    processed
    for processed in (
        handel_error(doc_text_preprocessing, article) for article in articles['data']
    )
    if processed is not None
]

In [29]:
# Number of articles that survived pre-processing.
len(docs)

1186

In [8]:
import torch
from src.cofenet.utils.utils import *
from src.cofenet.utils.loader import SingleDataLoader
from src.cofenet.utils.dataset import DatasetBert
from torch.utils.data import SequentialSampler


def extract_quotes(infer_str:list, model_cofe) -> list:
    """Tag a list of strings with the CofeNet model and return the tag strings.

    Args:
        infer_str: list of input strings (the notebook passes the paragraphs
            of one article).
        model_cofe: model object exposing ``eval()`` and ``predict(batch)``.

    Returns:
        One list of tag strings per dataset item, in the order produced by a
        sequential pass over the dataset.

    The scratch file used to feed the dataset is removed even when inference
    raises, so a failed document does not leave a stale input file behind.
    """
    infer_path = "./src/cofenet/infer_file.txt"
    tag_file_path = './src/cofenet/utils/tag.txt'

    # NOTE(review): read_write_str presumably writes `infer_str` to `infer_path`
    # and returns the path to read back -- confirm in src.cofenet.utils.utils.
    file_path = read_write_str(infer_str, infer_path)

    preds = []
    try:
        dataset = DatasetBert(file_path)
        dataloader = SingleDataLoader(dataset=dataset,
                                      batch_size=32,
                                      sampler=SequentialSampler(dataset),
                                      collate_fn=dataset.collate)

        # Inference mode and gradient tracking are loop-invariant: set once.
        model_cofe.eval()
        with torch.no_grad():
            for batch_data in dataloader:
                batch_preds = model_cofe.predict(batch_data)
                # predict() may return either a plain list or a tensor.
                if not isinstance(batch_preds, list):
                    batch_preds = batch_preds.data.cpu().numpy()

                # Truncate every prediction to the length recorded in the batch.
                batch_pred_strs = tgidss2tgstrss(
                    batch_preds, tag_file_path,
                    batch_data['lengths'].cpu().numpy())
                preds.extend(batch_pred_strs)
    finally:
        # Always clean up the scratch file, including on failure.
        if os.path.exists(infer_path):
            os.remove(infer_path)
    return preds




  from .autonotebook import tqdm as notebook_tqdm


In [9]:
def quote_cue_source_extraction(doc):
    """Run quote / cue / source tagging over every paragraph of an article.

    Args:
        doc: article dict with a 'paragraphs' list of strings.

    Returns:
        Tuple ``(doc, predict_entities)``: the input dict itself and the value
        returned by ``extract_quotes`` for its paragraphs.
    """
    # Uses the notebook-level `model_cofe` loaded in an earlier cell.
    # (The former nested `clean_text` helper was never called and only held
    # commented-out code, so it has been removed.)
    predict_entities = extract_quotes(doc['paragraphs'], model_cofe)
    return doc, predict_entities

In [None]:
# Tag every article. `ents` stays index-aligned with `docs`: an article whose
# extraction fails contributes an empty list instead of being skipped.
ents = []
for doc in docs:
    try:
        doc, predict_entities = quote_cue_source_extraction(doc)
    except Exception as e:
        print(f"Error processing document {doc['title']}: {e}")
        ents.append([])
    else:
        ents.append(predict_entities)

In [None]:
import json

# Persist the per-article tag sequences so the expensive extraction step does
# not have to be repeated.
output_filename = 'extracted_entities.json'

with open(output_filename, 'w') as f:
    f.write(json.dumps(ents))

print(f"List 'ents' successfully saved to {output_filename}")

In [None]:
# Reload the saved tag sequences. json.load() reads from a file object;
# json.loads() only accepts str/bytes and raised TypeError here.
with open('./extracted_entities.json', 'r') as f:
    ents = json.load(f)

## Coref Resolution

In [None]:
from spacy.tokens import Doc, Span
import spacy
from typing import List
from fastcoref import spacy_component

# Shared spaCy pipeline used by the coreference and attribution cells below.
nlp = spacy.load("en_core_web_trf")
# "fastcoref" is the component registered by importing fastcoref.spacy_component.
nlp.add_pipe("fastcoref")

def coref_resolver(txt, bio):
    """Find, for each coreferent mention in a paragraph, the text it refers to.

    Args:
        txt: paragraph text; tokens are obtained with ``txt.split(' ')``.
        bio: entity tags forwarded to ``Doc(..., ents=bio)``, one per
            whitespace token of ``txt``.

    Returns:
        A list of ``{"coref_pos": [start, end], "refer": text}`` dicts, where
        ``start``/``end`` are inclusive indices into the whitespace tokens of
        ``txt`` and ``text`` is the first mention of the same cluster that
        contains a NOUN or PROPN. Returns ``[]`` if the coreference pipeline
        raises on ``txt``.
    """

    def get_span_noun_indices(doc: Doc, cluster: List[List[int]]) -> List[int]:
        # Indices (within the cluster) of mentions containing a NOUN/PROPN token.
        spans = [doc[span[0]:span[1]+1] for span in cluster]
        spans_pos = [[token.pos_ for token in span] for span in spans]
        span_noun_indices = [i for i, span_pos in enumerate(spans_pos)
            if any(pos in span_pos for pos in ['NOUN', 'PROPN'])]
        return span_noun_indices

    def get_cluster_head(doc: Doc, cluster: List[List[int]], noun_indices: List[int]):
        # The first nominal mention is used as the cluster's representative.
        head_idx = noun_indices[0]
        head_start, head_end = cluster[head_idx]
        head_span = doc[head_start:head_end+1]
        return head_span, [head_start, head_end]

    def is_containing_other_spans(span: List[int], all_spans: List[List[int]]):
        # True when another, different mention lies entirely inside `span`.
        return any(s[0] >= span[0] and s[1] <= span[1] and s != span for s in all_spans)

    def improved_replace_corefs(document, clusters):
        all_spans = [span for cluster in clusters for span in cluster]
        coref_results = []
        for cluster in clusters:
            noun_indices = get_span_noun_indices(document, cluster)
            if noun_indices:
                mention_span, mention = get_cluster_head(document, cluster, noun_indices)
                for coref in cluster:
                    if coref != mention and not is_containing_other_spans(coref, all_spans):
                        coref_results.append({"coref_pos": coref, "refer": mention_span.text})
        return coref_results

    try:
        coref_clusters = nlp(txt)._.coref_clusters
    except Exception as e:
        print("*"*20)
        print(f"Error processing: {txt}")
        print(e)
        print("*"*20)
        return []

    # build spaCy doc with BIO entities (whitespace tokens, one space after each)
    words = txt.split(' ')
    spaces = [True] * len(words)
    doc = nlp(Doc(nlp.vocab, words=words, spaces=spaces, ents=bio))

    # fastcoref's spaCy component reports mentions as (start_char, end_char)
    # offsets into the text, not token indices. Map each mention onto the
    # whitespace tokens it touches; because the Doc above is the input split on
    # single spaces and re-joined with single spaces, the character offsets of
    # `txt` line up with it.
    # NOTE(review): confirm the offset convention against the installed
    # fastcoref version.
    clusters = []
    for cluster in coref_clusters:
        spans = []
        for start_char, end_char in cluster:
            token_span = doc.char_span(start_char, end_char, alignment_mode="expand")
            if token_span is None or len(token_span) == 0:
                continue
            # store inclusive end indices, as the helpers above expect
            spans.append([token_span.start, token_span.end - 1])
        if spans:
            clusters.append(spans)

    return improved_replace_corefs(doc, clusters)


In [None]:
# Resolve coreferences paragraph by paragraph. The result is nested as
# coref_res[doc_index][paragraph_index] so that the later cell, which reads
# coref_res[idx] per document and zips it with that document's paragraphs,
# receives the structure it expects (a flat list would misalign everything).
coref_res = []
for idx, doc in enumerate(docs):
    doc_tags = ents[idx]
    doc_coref = []
    for j, p in enumerate(doc['paragraphs']):
        if j < len(doc_tags):
            doc_coref.append(coref_resolver(p, doc_tags[j]))
        else:
            # No tags for this paragraph (e.g. extraction failed for the
            # article and `ents[idx]` is empty): keep the slot, skip the work.
            doc_coref.append([])
    coref_res.append(doc_coref)
        

## Attribution

In [None]:



def enhance_source(span, enhance_if_large_than=4):
    """Narrow a long source span down to the person name it contains.

    Args:
        span: spaCy span of the source mention.
        enhance_if_large_than: spans with at most this many whitespace tokens
            are returned unchanged.

    Returns:
        Tuple ``(text, [start, end])``. ``start``/``end`` are token positions
        in the document ``span`` comes from (``end`` exclusive). For short
        spans, or when no PERSON entity is found, this is the span's own text
        and position; otherwise it is the first PERSON entity extended by up to
        five adjacent PROPN/PRON tokens on either side.
    """
    position = [span.start, span.end]
    # Re-parse the span on its own.
    # NOTE(review): the position arithmetic below assumes this Doc has exactly
    # one token per token of `span` -- confirm for spans whose tokens contain
    # spaces.
    doc = nlp(Doc(nlp.vocab, words=span.text.split(' ')))
    if len(doc) <= enhance_if_large_than:
        return span.text, position

    # get first person entity; fall back to the full span when there is none
    # (indexing an empty list used to raise IndexError here)
    persons = [ent for ent in doc.ents if ent.label_ == 'PERSON']
    if not persons:
        return span.text, position
    ent = persons[0]

    name_pos = {"PROPN", "PRON"}
    max_extend = 5

    # grow to the left over adjacent name-like tokens
    start = ent.start
    while start > 0 and ent.start - start < max_extend and doc[start - 1].pos_ in name_pos:
        start -= 1

    # grow to the right, starting at the first token AFTER the entity and
    # testing the token actually being added (the previous loop tested a stale
    # token and started at ent.start, i.e. inside the entity)
    end = ent.end
    while end < len(doc) and end - ent.end < max_extend and doc[end].pos_ in name_pos:
        end += 1

    # translate the local [start, end) window back into document positions
    position[0] = position[0] + start
    position[1] = position[1] - (len(doc) - end)
    return doc[start:end].text.strip(), position



# The dependency climb visits at most this many heads above the cue verb.
_MAX_HEAD_HOPS = 7


def _first_child_with_label(token, label):
    """Return the first dependency child of ``token`` whose entity type is ``label``, else None."""
    return next((child for child in token.children if child.ent_type_ == label), None)


def _entity_covering(token, entities):
    """Return the first span in ``entities`` that contains ``token`` (end exclusive), else None."""
    if token is None:
        return None
    for ent in entities:
        if ent.start <= token.i < ent.end:
            return ent
    return None


def _find_source_token(verb):
    """Look for a 'source' token among the children of ``verb`` or of its ancestors."""
    found = _first_child_with_label(verb, 'source')
    current = verb
    hops = 0
    # Stop as soon as a source is found so that a nearer match is not
    # overwritten by a farther (or missing) one; also stop at the sentence root.
    while found is None and hops < _MAX_HEAD_HOPS and current.head.i != current.i:
        current = current.head
        found = _first_child_with_label(current, 'source')
        hops += 1
    return found


def entity_linking(paragraphs, bios):
    """Link each cue verb to its speaker and quote using the dependency parse.

    Every paragraph is parsed together with the paragraph preceding it so that
    a cue can reach a source mentioned just before; only quotes located in the
    current paragraph are reported.

    Args:
        paragraphs: list of paragraph strings (tokens are ``split(' ')``).
        bios: list with, per paragraph, the list of entity tags handed to
            ``Doc(..., ents=...)``; entities labelled 'cue', 'source' and
            'content' are used.

    Returns:
        One list per paragraph of dicts with the keys "Speaker",
        "Speaker_position", "Cue" and "Quote". "Speaker_position" is
        ``[start, end)`` in token positions relative to the current paragraph
        (negative when the speaker sits in the previous paragraph), which is
        the frame per-paragraph coreference results use.
    """
    linked_entities = []
    for i, (paragraph, bio) in enumerate(zip(paragraphs, bios)):
        local_linked_entities = []

        # Prepend the previous paragraph (nothing for the first one).
        add_words, add_bio = [], []
        if i != 0:
            add_words = paragraphs[i-1].split(' ')
            add_bio = bios[i-1]
        offset = len(add_words)

        words = add_words + paragraph.split(' ')
        spaces = [True]*len(words)
        bio_ = add_bio + bio

        # create Doc with its entities
        doc_ = Doc(nlp.vocab, words=words, spaces=spaces, ents=bio_)
        # Remember our entities before the pipeline runs so they can be
        # restored afterwards (assigning doc_.ents after the call would be a
        # no-op when the pipeline returns the very same Doc object).
        tagged_ents = doc_.ents
        # run the spaCy pipeline to get the dependency tree and POS tags
        doc = nlp(doc_)
        doc.ents = tagged_ents

        cues = [ent for ent in doc.ents if ent.label_ == 'cue']
        sources = [ent for ent in doc.ents if ent.label_ == 'source']
        contents = [ent for ent in doc.ents if ent.label_ == 'content']

        for cue in cues:
            # A cue can span several words; keep its first verb.
            verb = next((tok for tok in cue if tok.pos_ == 'VERB'), None)
            if verb is None:
                continue

            # speaker: a 'source' token attached to the verb or to one of its heads
            source = _entity_covering(_find_source_token(verb), sources)

            # quote: a 'content' child of the verb, or the verb's head itself
            quote_token = _first_child_with_label(verb, 'content')
            if quote_token is None and verb.head.ent_type_ == 'content':
                quote_token = verb.head
            quote = _entity_covering(quote_token, contents)

            if source is None or quote is None:
                continue
            # keep only quotes from the current paragraph, not the prepended one
            if quote[0].i < offset:
                continue

            try:
                enhanced_source, position = enhance_source(source)
            except Exception as err:
                # Best effort: report and skip this attribution instead of
                # silently swallowing the failure.
                print(f"Could not refine source {source.text!r}: {err}")
                continue

            local_linked_entities.append({
                "Speaker": enhanced_source,
                # used by coreference resolution; made paragraph-relative
                "Speaker_position": [position[0] - offset, position[1] - offset],
                "Cue": verb.text,
                "Quote": quote.text,
            })

        linked_entities.append(local_linked_entities)
    return linked_entities



In [None]:
# Attribute quotes for every article. `linked_ents` stays index-aligned with
# `docs`: an article whose linking fails contributes an empty list.
linked_ents = []
for idx, doc in enumerate(docs):
    try:
        linked_en = entity_linking(doc['paragraphs'], ents[idx])
    except Exception as e:
        print(f"Error processing document {doc['title']}: {e}")
        linked_ents.append([])
    else:
        linked_ents.append(linked_en)

In [None]:
def apply_coref_on_linked_entities(linking_out, coref_out):
    """Replace speakers by the mention they corefer with, where one is known.

    Args:
        linking_out: per paragraph, a list of attribution dicts that each have
            a "Speaker_position" ``[start, end)`` entry.
        coref_out: per paragraph, a list of ``{"coref_pos": [start, end],
            "refer": text}`` dicts with inclusive ``end``.

    Returns:
        Per paragraph, copies of the attribution dicts without the
        "Speaker_position" key; "Speaker" is replaced by the ``refer`` text of
        the first coreference entry whose span covers the speaker position.
        Paragraphs are paired positionally; the inputs are not modified.
    """
    linking_qouts = []
    for paragraph_links, coref_links in zip(linking_out, coref_out):
        local_linkes = []
        for quote in paragraph_links:
            source_start = quote['Speaker_position'][0]
            # the stored end is exclusive; coref spans use inclusive ends
            source_end = quote['Speaker_position'][1] - 1

            covering = next(
                (link for link in coref_links
                 if link['coref_pos'][0] <= source_start and link['coref_pos'][1] >= source_end),
                None,
            )

            new_link = {k: v for k, v in quote.items() if k != 'Speaker_position'}
            if covering is not None:
                new_link['Speaker'] = covering['refer']
            local_linkes.append(new_link)

        linking_qouts.append(local_linkes)
    return linking_qouts


In [None]:
# Apply the coreference results to every article's attributions. The output
# stays index-aligned with `docs`; failures contribute an empty list.
linked_enents_coref = []
for idx in range(len(docs)):
    try:
        linked_en = apply_coref_on_linked_entities(linked_ents[idx], coref_res[idx])
    except Exception as e:
        print(f"Error processing document {docs[idx]['title']}: {e}")
        linked_enents_coref.append([])
    else:
        linked_enents_coref.append(linked_en)