In this notebook, you will find a method to anonymise a sample of text, where we concentrate our attention on the case of anonymising names in articles. The languages permited are as follows: Arabic, German, English, Spanish, French, Italian, Latvian, Dutch, Portuguese and Chinese.

In [17]:
# Imports

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.pipelines.token_classification import TokenClassificationPipeline
import pandas as pd
from tqdm import tqdm


In [2]:
# Let's try some Hugging face models for tokenising and classifying text

model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple", stride=10)
ents = pipe("Emily works at BBC Studios in London.")
print(ents)

[{'entity_group': 'PER', 'score': 0.9998803, 'word': 'Emily', 'start': 0, 'end': 5}, {'entity_group': 'ORG', 'score': 0.9998514, 'word': 'BBC Studios', 'start': 15, 'end': 26}, {'entity_group': 'LOC', 'score': 0.9998155, 'word': 'London', 'start': 30, 'end': 36}]


In [3]:
# Functions to extend the model to handle longer than the initial caracters allowed, aka articles

def preprocess(self, sentence, offset_mapping=None):
        model_inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # Return multiple chunks
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping

        model_inputs["sentence"] = sentence

        return model_inputs

def _forward(self, model_inputs):
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        all_logits = torch.Tensor()
        num_chunks = len(model_inputs["input_ids"])

        # Pass one chunk at a time to the model and concatenate the results
        for i in range(num_chunks):
            model_input = {k: torch.unsqueeze(v[i], dim=0) for k, v in model_inputs.items()}
            logits = model(**model_input)[0]
            all_logits = torch.cat((all_logits, logits), dim=1)

        model_outputs = {
            "logits": all_logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            **model_inputs,
        }

        # We reshape outputs to fit with the postprocess inputs
        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))

        return model_outputs

class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def preprocess(self, sentence, offset_mapping=None):
        model_inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # Return multiple chunks
            max_length=self.tokenizer.model_max_length,
            padding=True
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping

        model_inputs["sentence"] = sentence

        return model_inputs

    def _forward(self, model_inputs):
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        all_logits = torch.Tensor()
        num_chunks = len(model_inputs["input_ids"])

        # Pass one chunk at a time to the model and concatenate the results
        for i in range(num_chunks):
            model_input = {k: torch.unsqueeze(v[i], dim=0) for k, v in model_inputs.items()}
            logits = model(**model_input)[0]
            all_logits = torch.cat((all_logits, logits), dim=1)

        model_outputs = {
            "logits": all_logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            **model_inputs,
        }

        # We reshape outputs to fit with the postprocess inputs
        model_outputs["input_ids"] = torch.reshape(model_outputs["input_ids"], (1, -1))
        model_outputs["token_type_ids"] = torch.reshape(model_outputs["token_type_ids"], (1, -1))
        model_outputs["attention_mask"] = torch.reshape(model_outputs["attention_mask"], (1, -1))
        model_outputs["special_tokens_mask"] = torch.reshape(model_outputs["special_tokens_mask"], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))

        return model_outputs


In [5]:
  # Checking extension

pipe = TokenClassificationPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple", stride=10)
ents = pipe("A woman has claimed sightings of a lion- which sparked an extensive search by Essex Police - were of her large pet cat, Teddy Bear. A search for the very large animal seen in St Osyth, near Clacton-on-Sea, on Sunday was called off on Monday. Ginny Murphy said her ginger Maine Coon cat, the largest domestic breed, regularly wanders into the field where the animal was spotted. She said she believes Teddy Bear was mistaken for a lion by holidaymakers. 'Like to hunt' A search by police was triggered on Sunday evening when people staying at a caravan park reported sightings of a very large animal near Earl Hall Drive. Experts from Colchester Zoo and police firearms officers helped in the search. Police decided to call off the search after no trace of a big cat was found and they said the sightings were either of a large domestic cat or wildcat. Ms Murphy, of St Osyth, said: From the picture, he's identical - he's big, he's always out in the fields. Maine Coons like to hunt, and where he was is a particular area he likes to go. He's always coming back with birds. Her three-year-old pet is about 28in (70cm) in length. Gill and Steve Atkin, of Louth, Lincolnshire, photographed an animal in the field on Sunday afternoon. Mr Atkin had told police it was definitely a very large animal, and possibly a lion, definitely a large cat. He added: We witnessed it, I would say, for about 20 to 30 minutes cleaning itself and rolling about in the field. Speaking to the BBC on Tuesday, Mrs Atkin said: The Mirror [newspaper] has made a bit of a farce of it this morning, saying it was a cat called Tom, but no, I don't think it was a domestic cat. Whatever it was, it's definitely still out there. The first reported sighting was made by holidaymaker Bob Martin, who said he and his wife Denise saw a large cat and a lion was the first thing that came to mind. Cost of search We believe we saw a large cat looking at a tree... it just sat there looking at us, he said. Essex Police have not released any details of the cost of the search, but have said about 25 officers were called to where the animal was seen, including specialist firearms officers and experts from Colchester Zoo. Two police helicopters, one with thermal imaging equipment, were also used to try to detect any trace of an animal.")
print(ents)

[{'entity_group': 'ORG', 'score': 0.99182165, 'word': 'Essex Police', 'start': 78, 'end': 90}, {'entity_group': 'PER', 'score': 0.9101589, 'word': 'Teddy Bear', 'start': 120, 'end': 130}, {'entity_group': 'LOC', 'score': 0.99896306, 'word': 'St Osyth', 'start': 175, 'end': 183}, {'entity_group': 'LOC', 'score': 0.99515325, 'word': 'Clacton - on - Sea', 'start': 190, 'end': 204}, {'entity_group': 'PER', 'score': 0.99063396, 'word': 'Ginny Murphy', 'start': 242, 'end': 254}, {'entity_group': 'PER', 'score': 0.9322479, 'word': 'Teddy Bear', 'start': 400, 'end': 410}, {'entity_group': 'LOC', 'score': 0.99765104, 'word': 'Earl Hall Drive', 'start': 604, 'end': 619}, {'entity_group': 'ORG', 'score': 0.5572479, 'word': 'Col', 'start': 634, 'end': 637}, {'entity_group': 'LOC', 'score': 0.64672744, 'word': '##chester Zoo', 'start': 637, 'end': 648}, {'entity_group': 'PER', 'score': 0.9957657, 'word': 'Murphy', 'start': 855, 'end': 861}, {'entity_group': 'LOC', 'score': 0.997252, 'word': 'St Osy

In [6]:
  # Let's focus on names and anonymize

def anonymize(text):
    ents = pipe(text, ignore_labels=["O", "LOC", "ORG"])
    split_text = list(text)
    for ent in ents:
        split_text[ent['start']] = f"[{ent['entity_group']}]"
        for i in range(ent['start'] + 1, ent['end']):
            split_text[i] = ""

    return "".join(split_text)

In [39]:
text = "Un exempleado de la aeronáutica Boeing conocido por expresar su preocupación por los estándares de la producción de la empresa fue encontrado muerto en Estados Unidos. John Barnett había trabajado para Boeing durante 32 años, hasta su jubilación en 2017. En los días previos a su muerte, había estado testificando en un juicio contra la empresa tras denunciar irregularidades. Boeing manifestó su tristeza por el fallecimiento de Barnett. El oficial forense del condado de Charleston le confirmó su muerte a la BBC este lunes. Dijo que el hombre de 62 años había muerto de una herida de bala autoinfligida el 9 de marzo y que la policía estaba investigando."
anonymized_text = anonymize(text)
print(anonymized_text)

Un exempleado de la aeronáutica Boeing conocido por expresar su preocupación por los estándares de la producción de la empresa fue encontrado muerto en Estados Unidos. [PER] había trabajado para Boeing durante 32 años, hasta su jubilación en 2017. En los días previos a su muerte, había estado testificando en un juicio contra la empresa tras denunciar irregularidades. Boeing manifestó su tristeza por el fallecimiento de [PER]. El oficial forense del condado de Charleston le confirmó su muerte a la BBC este lunes. Dijo que el hombre de 62 años había muerto de una herida de bala autoinfligida el 9 de marzo y que la policía estaba investigando.


In [40]:
# Now that we've checked all works, let's apply it to BBC data. 

articles = pd.read_csv('AnomisedDataSample.csv')
print(articles["text"])



0        英王室は24日、英女王エリザベス2世の墓所を示す墓標の写真を公表した。新しく刻まれた墓標に...
1        The director of public prosecutions is bringi...
2        Four men have admitted causing disruption dur...
3        Liam Treadwell's teeth helped to make him one...
4        A woman has claimed sightings of a lion- whic...
                              ...                        
8757     A US police chief has apologised for an insen...
8758     Pakistani activists are calling for a high-le...
8759     Ukrainian President Volodymyr Zelensky has be...
8760      Fear, anger and lots of uncertainty.  That's...
8761     Hans Rosling, the Swedish professor who made ...
Name: text, Length: 8762, dtype: object


In [41]:
articlesAnon = articles[:100]
articlesAnon.head()

Unnamed: 0.1,Unnamed: 0,URL,JSON_CONTENT,text,anonymised_text,famous_persons
0,0,https://www.bbc.com/japanese/63024308.amp,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",英王室は24日、英女王エリザベス2世の墓所を示す墓標の写真を公表した。新しく刻まれた墓標に...,英王室は24日、英女王エリザベス2世の墓所を示す墓標の写真を公表した。新しく刻まれた墓標に...,"{""Queen Elizabeth II"": [""Queen Elizabeth II, (..."
1,1,https://www.bbc.com/news/uk-63030330,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",The director of public prosecutions is bringi...,The director of public prosecutions is bringi...,"{""Max Hill"": [""Max Hill, (https://en.wikipedia..."
2,2,https://www.bbc.com/news/uk-england-tyne-34585196,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",Four men have admitted causing disruption dur...,Four men have admitted causing disruption dur...,"{""Kyle Binks"": [""PERSON"", null], ""Newton Aycli..."
3,3,https://www.bbc.com/sport/horse-racing/38057367,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",Liam Treadwell's teeth helped to make him one...,"Liam Treadwell, (https://en.wikipedia.org/wik...","{""Clare Balding"": [""Clare Balding, (https://en..."
4,4,https://www.bbc.com/news/uk-england-essex-1939...,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",A woman has claimed sightings of a lion- whic...,A woman has claimed sightings of a lion- whic...,"{""Ginny Murphy"": [""PERSON"", null], ""Steve Atki..."


In [42]:

for row_index, cell in tqdm(articlesAnon.iterrows()):
    anonymized_text = anonymize(cell["text"])
    articlesAnon.loc[cell[0], "bert_anonymized_text"] = anonymized_text



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articlesAnon.loc[cell[0], "bert_anonymized_text"] = anonymized_text
100it [00:31,  3.17it/s]


In [43]:
articlesAnon.head()

Unnamed: 0.1,Unnamed: 0,URL,JSON_CONTENT,text,anonymised_text,famous_persons,bert_anonymized_text
0,0,https://www.bbc.com/japanese/63024308.amp,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",英王室は24日、英女王エリザベス2世の墓所を示す墓標の写真を公表した。新しく刻まれた墓標に...,英王室は24日、英女王エリザベス2世の墓所を示す墓標の写真を公表した。新しく刻まれた墓標に...,"{""Queen Elizabeth II"": [""Queen Elizabeth II, (...",英王室は24日、英女王[PER]の墓所を示す墓標の写真を公表した。新しく刻まれた墓標には、...
1,1,https://www.bbc.com/news/uk-63030330,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",The director of public prosecutions is bringi...,The director of public prosecutions is bringi...,"{""Max Hill"": [""Max Hill, (https://en.wikipedia...",The director of public prosecutions is bringi...
2,2,https://www.bbc.com/news/uk-england-tyne-34585196,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",Four men have admitted causing disruption dur...,Four men have admitted causing disruption dur...,"{""Kyle Binks"": [""PERSON"", null], ""Newton Aycli...",Four men have admitted causing disruption dur...
3,3,https://www.bbc.com/sport/horse-racing/38057367,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",Liam Treadwell's teeth helped to make him one...,"Liam Treadwell, (https://en.wikipedia.org/wik...","{""Clare Balding"": [""Clare Balding, (https://en...",[PER]'s teeth helped to make him one of the b...
4,4,https://www.bbc.com/news/uk-england-essex-1939...,"""{\""metadata\"": {\""id\"": \""urn:bbc:ares::asset...",A woman has claimed sightings of a lion- whic...,A woman has claimed sightings of a lion- whic...,"{""Ginny Murphy"": [""PERSON"", null], ""Steve Atki...",A woman has claimed sightings of a lion- whic...


In [45]:
articlesAnon.to_csv("100AnonymisedArticlesTwoModels.csv")