In [20]:
from presidio_analyzer import AnalyzerEngine
import pandas as pd
import os
from tqdm import tqdm
import re
import wikipediaapi
from presidio_anonymizer import OperatorConfig
from presidio_anonymizer.entities import RecognizerResult, OperatorConfig
from presidio_anonymizer import AnonymizerEngine
import numpy as np
import time
import string
import csv
import sys
import json


In [21]:
def check_if_exists(wiki, entry):
    """Look up ``entry`` through ``wiki`` and describe what was found.

    Args:
        wiki: Object exposing ``page(title)``; the returned page must offer
            ``exists()``, ``fullurl`` and ``summary``.
        entry: Title to look up.

    Returns:
        ``("<entry>, (<page url>)", <page summary>)`` when the page exists,
        otherwise the placeholder pair ``("PERSON", None)``.
    """
    page = wiki.page(entry)

    # No page: hand back the generic placeholder instead of a labelled link.
    if not page.exists():
        return ("PERSON", None)

    label = "{}, ({})".format(entry, page.fullurl)
    return (label, page.summary)

def find_previous_next(word: str, text: str) -> str:
    """Extend ``word`` with the capitalised words that surround it in ``text``.

    The first place in ``text`` where ``word`` is followed by another word is
    inspected, and independently the first place where it is preceded by one.
    A neighbour is attached (separated by a single space) only when its first
    character is upper case.

    Args:
        word: The token to look for. It is matched literally.
        text: The text to search.

    Returns:
        ``word`` itself, or ``word`` with the capitalised previous and/or next
        word attached. An empty ``word`` is returned unchanged.
    """
    # An empty needle would match anywhere and pick up unrelated words.
    if not word:
        return word

    # Escape the needle: characters such as "(" or "." must be matched
    # literally, otherwise the pattern either fails to compile or matches
    # text that does not actually contain the word.
    literal = re.escape(word)
    match_next = re.search(r'{0}\s*(\w+)'.format(literal), text)
    match_previous = re.search(r'(\w+)\s*{0}'.format(literal), text)

    expanded = word

    if match_next:
        following = match_next.group(1)  # the word captured after `word`
        if following[0].isupper():
            expanded = expanded + " " + following

    if match_previous:
        preceding = match_previous.group(1)  # the word captured before `word`
        if preceding[0].isupper():
            expanded = preceding + " " + expanded

    return expanded

def anonymise_text(text, analyzer =  AnalyzerEngine(), anonymizer = AnonymizerEngine()):
    """Replace multi-word person names in ``text`` and report what was found.

    Pipeline:
      1. ``analyzer`` detects ``PERSON`` entities in ``text``.
      2. Each detected span has ASCII punctuation stripped and is expanded by
         ``find_previous_next``; results without a space are treated as
         "Not a person" and left untouched in the text.
      3. Every distinct remaining name is looked up with ``check_if_exists``
         using a ``wikipediaapi.Wikipedia`` client.
      4. Each kept span is replaced by the label from that lookup.

    Args:
        text: The text to process.
        analyzer: Engine used for detection. The default instance is built
            once, when this function is defined, and shared by all calls.
        anonymizer: Engine used for replacement. Same definition-time default.

    Returns:
        ``(anonymised_text, person_dict)`` where ``person_dict`` maps each
        looked-up name to the tuple returned by ``check_if_exists``.
    """
    not_a_person = "Not a person"

    # Detection: only PERSON entities, English.
    detections = analyzer.analyze(text=text,
                                  entities=["PERSON"],
                                  language='en')

    # Neighbour heuristic: one candidate per detection, in the same order.
    # NOTE(review): punctuation is removed before the span is searched for in
    # the original text, so names containing punctuation may not be found
    # again by find_previous_next - confirm this is intended.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    candidates = []
    for detection in detections:
        surface = text[detection.start:detection.end].translate(strip_punctuation)
        expanded = find_previous_next(surface, text)
        candidates.append(expanded if " " in expanded else not_a_person)

    # Lookup: one query per distinct candidate name.
    wiki_wiki = wikipediaapi.Wikipedia('Checking if pages exist', 'en')
    person_dict = {}
    for name in set(candidates):
        if name != not_a_person:
            person_dict[name] = check_if_exists(wiki_wiki, name)

    # Pair each kept detection with its replacement label.
    kept = [
        (detection, person_dict[name][0])
        for detection, name in zip(detections, candidates)
        if name != not_a_person
    ]

    # Every kept span gets its own entity type so it can carry its own
    # replacement value.
    operators_dict = {}
    results_custom = []
    for index, (detection, replacement) in enumerate(kept):
        entity_type = f"ENTRY_{index}"
        operators_dict[entity_type] = OperatorConfig("replace", {"new_value": replacement})
        results_custom.append(
            RecognizerResult(
                entity_type=entity_type,
                start=detection.start,
                end=detection.end,
                score=detection.score,
            )
        )

    # Replacement.
    anonymised_result = anonymizer.anonymize(
        text=text,
        analyzer_results=results_custom,
        operators=operators_dict,
    )

    return anonymised_result.text, person_dict


In [22]:

# Run the anonymiser over every article and store both results on the frame.
articles = pd.read_csv('sample_articles.csv')

for index, record in tqdm(articles.iterrows(), total=len(articles)):
    text = record["text"]
    anonymised_text, famous_persons = anonymise_text(text)

    articles.loc[index, "anonymised_text"] = anonymised_text
    # The lookup dict is stored as a JSON string so it fits in one cell.
    articles.loc[index, "famous_persons"] = json.dumps(famous_persons)

    # Pause for a random whole number of seconds (1 to 4) before the next
    # article - presumably to pace the lookups made inside anonymise_text.
    pause_seconds = np.random.randint(1, 5)
    time.sleep(pause_seconds)

100%|██████████| 8762/8762 [11:18:13<00:00,  4.64s/it]  


In [39]:
# Show the JSON_CONTENT column, then write the whole frame (index included) to CSV.
print(articles["JSON_CONTENT"])
articles.to_csv('AnomisedDataSample.csv', index=True)  

0       "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
1       "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
2       "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
3       "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
4       "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
                              ...                        
8757    "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
8758    "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
8759    "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
8760    "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
8761    "{\"metadata\": {\"id\": \"urn:bbc:ares::asset...
Name: JSON_CONTENT, Length: 8762, dtype: object


In [26]:
from langchain.prompts.prompt import PromptTemplate
from langchain.llms import Ollama

def compare_contexts(person, wiki_context, article_context):
    """Ask a local ``llama2`` model whether two contexts describe the same person.

    A prompt wrapped in ``[INST]`` / ``<<SYS>>`` markers is filled with the
    three arguments and piped into an ``Ollama`` LLM.

    Args:
        person: Name of the person in question.
        wiki_context: First context (called with a Wikipedia summary in this
            notebook).
        article_context: Second context (called with the article text in this
            notebook).

    Returns:
        Whatever ``invoke`` on the prompt-plus-model chain returns -
        presumably the model's text answer; confirm against the LangChain
        version in use.
    """
    template = """
        [INST]<<SYS>> You are an assistant for question-answering tasks. 
        If you don't know the answer, just say that you don't know. 
        Use the following information to determine if the {person} is the same in both contexts:<</SYS>> 

        Context 1: {wiki_context} 
        Context 2: {article_context} 

        Answer: [/INST]
        """

    prompt_template = PromptTemplate(
        input_variables=["person", "wiki_context", "article_context"],
        template=template,
    )

    # A new client object is built on every call.
    llm = Ollama(
        model="llama2",
        temperature=0.2,
        num_ctx=2048 * 4,
        repeat_last_n=-1,
        top_p=0.5,
        top_k=40,
    )

    # Chain the prompt into the model and run it with the supplied values.
    prompt_and_model = prompt_template | llm
    output = prompt_and_model.invoke(
        {"person": person, "wiki_context": wiki_context, "article_context": article_context}
    )

    return output

In [47]:
# Compare the first entry that has a lookup result with the article text.
# `famous_persons` and `text` are whatever the processing loop left behind
# from its last iteration.
# NOTE(review): the value returned by compare_contexts is not stored or shown.
known_entries = [entry for entry in famous_persons.values() if entry[0] !="PERSON"]
compare_contexts("Hans Rosling", known_entries[0][1], text)
print(tuple(famous_persons.values()))

(('Hans Rosling, (https://en.wikipedia.org/wiki/Hans_Rosling)', 'Hans Rosling (Swedish pronunciation: [ˈhɑːns ˈrûːslɪŋ]; 27 July 1948 – 7 February 2017) was a Swedish physician, academic and public speaker. He was a professor of international health at Karolinska Institute and was the co-founder and chairman of the Gapminder Foundation, which developed the Trendalyzer software system. He held presentations around the world, including several TED Talks in which he promoted the use of data (and data visualization) to explore development issues. His posthumously published book Factfulness, coauthored with his daughter-in-law Anna Rosling Rönnlund and son Ola Rosling, became an international bestseller.'),)


In [49]:
# Preview the per-article lookup results (stored as JSON strings by the processing loop).
print(articles["famous_persons"])

0       {"Queen Elizabeth II": ["Queen Elizabeth II, (...
1       {"Max Hill": ["Max Hill, (https://en.wikipedia...
2       {"Kyle Binks": ["PERSON", null], "Newton Aycli...
3       {"Clare Balding": ["Clare Balding, (https://en...
4       {"Ginny Murphy": ["PERSON", null], "Steve Atki...
                              ...                        
8757                                                   {}
8758    {"Afzal Kohistani": ["Afzal Kohistani, (https:...
8759    {"President Volodymyr Zelensky": ["PERSON", nu...
8760    {"Minister Anil Vij": ["PERSON", null], "the L...
8761    {"Hans Rosling": ["Hans Rosling, (https://en.w...
Name: famous_persons, Length: 8762, dtype: object


In [50]:
# Write only the famous_persons column to its own CSV file.
articles["famous_persons"].to_csv("FamousAndNonFamousPeople.csv")

In [67]:
# Split the famous_persons column by whether its JSON text contains the
# "PERSON" placeholder, and write each half to its own file.
# NOTE(review): rows whose JSON is "{}" contain no placeholder and therefore
# end up in Famous.csv - confirm that is wanted.
has_placeholder = articles["famous_persons"].str.contains("PERSON")
articles.loc[has_placeholder, "famous_persons"].to_csv("NotFamous.csv")
articles.loc[~has_placeholder, "famous_persons"].to_csv("Famous.csv")






In [68]:
# Same split on the "PERSON" placeholder, but exporting every column.
rows_with_placeholder = articles["famous_persons"].str.contains("PERSON")
articles[rows_with_placeholder].to_csv("ALLNotFamous.csv")
articles[~rows_with_placeholder].to_csv("ALLFamous.csv")