# INF442: Projet informatique 8

## GDPR in practice: data anonymization

This quick notebook will show you how to anonymize text data automatically. You'll have to do the same, by testing other approaches and / or embeddings and / or classification algorithms.

### Some dependencies and some data

#### Imports

In [34]:
import os
import gc
import logging
from random import sample
from urllib.request import urlopen
import numpy as np
import itertools
from tqdm import tqdm, trange
import sklearn as sk
import sklearn.linear_model
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertModel, pipeline
logger = logging.getLogger(__name__)

#### Download some data

We'll download 3 text files into the "data" subfolder.

In [2]:
files = ["eng.testa", "eng.testb", "eng.train"]
url = "https://raw.githubusercontent.com/glample/tagger/master/dataset"
if not os.path.isdir("data"):
    os.mkdir("data")
for file in files:
    with open(f"data/{file}", 'a') as the_file:
        html = urlopen(f"{url}/{file}")
        for line in html:
            the_file.write(line.decode('UTF-8'))

### Some useful functions

In [3]:
class InputExample(object):
    """A single training/test example for token classification."""

    def __init__(self, guid, words: list, labels: list = []):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            words: list. The words of the sequence.
            labels: (Optional) list. The labels for each word of the sequence. This should be
                specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.words = words
        self.labels = labels


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_ids = label_ids


def read_examples_from_file(data_dir, mode="eng.testa"):
    """Creating InputExamples out of a file"""
    file_path = os.path.join(data_dir, "{}".format(mode))
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[-1].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(InputExample(guid="{}-{}".format(mode, guid_index), words=words, labels=labels))
    return examples


def convert_examples_to_features(
    examples,
    label_list,
    max_seq_length,
    tokenizer,
    cls_token_at_end=False,
    cls_token="[CLS]",
    cls_token_segment_id=1,
    sep_token="[SEP]",
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    pad_token_label_id=-100,
    sequence_a_segment_id=0,
    mask_padding_with_zero=True,
):
    """ Loads a data file into a list of `InputBatch`s
        `cls_token_at_end` define the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d of %d", ex_index, len(examples))

        tokens = []
        label_ids = []
        for word, label in zip(example.words, example.labels):
            word_tokens = tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))

        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
        special_tokens_count = 3 if sep_token_extra else 2
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]
            label_ids = label_ids[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens += [sep_token]
        label_ids += [pad_token_label_id]
        if sep_token_extra:
            # roberta uses an extra separator b/w pairs of sentences
            tokens += [sep_token]
            label_ids += [pad_token_label_id]
        segment_ids = [sequence_a_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens += [cls_token]
            label_ids += [pad_token_label_id]
            segment_ids += [cls_token_segment_id]
        else:
            tokens = [cls_token] + tokens
            label_ids = [pad_token_label_id] + label_ids
            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
            label_ids = ([pad_token_label_id] * padding_length) + label_ids
        else:
            input_ids += [pad_token] * padding_length
            input_mask += [0 if mask_padding_with_zero else 1] * padding_length
            segment_ids += [pad_token_segment_id] * padding_length
            label_ids += [pad_token_label_id] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s", example.guid)
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))

        features.append(
            InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_ids=label_ids)
        )
    return features

### A direct, pre-trained NER approach

This piece of code will (almost) do directly what you'll have to accomplish in this *Projet*. Many things are hidden (that you'll have to uncover).

*Note* that this will take long the first time because model(s) are downloaded.

In [4]:
nlp = pipeline("ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [5]:
nlp("Adrien Ehrhardt donne un projet d'informatique aux étudiants de INF442.")

[{'entity': 'I-PER',
  'score': 0.99980503,
  'index': 1,
  'word': 'Ad',
  'start': 0,
  'end': 2},
 {'entity': 'I-PER',
  'score': 0.99964607,
  'index': 2,
  'word': '##rien',
  'start': 2,
  'end': 6},
 {'entity': 'I-PER',
  'score': 0.9998079,
  'index': 3,
  'word': 'E',
  'start': 7,
  'end': 8},
 {'entity': 'I-PER',
  'score': 0.99225086,
  'index': 4,
  'word': '##hr',
  'start': 8,
  'end': 10},
 {'entity': 'I-PER',
  'score': 0.9994708,
  'index': 5,
  'word': '##hardt',
  'start': 10,
  'end': 15},
 {'entity': 'I-ORG',
  'score': 0.8828938,
  'index': 22,
  'word': 'IN',
  'start': 64,
  'end': 66},
 {'entity': 'I-ORG',
  'score': 0.5023682,
  'index': 23,
  'word': '##F',
  'start': 66,
  'end': 67}]

We'd like to detect "Adrien Ehrhardt" and anonymize this part. Here it works quite well since input examples 'Ad', '##rien', 'E', '##hr', '##hardt' are being classified as PER. (Even INF is detected as an Organization).

### Obtaining the representation of the dataset from Bert

As suggested above, the whole process of choosing a dictionary, tokenizing the text, learning / obtaining a clever representation and learning / obtaining a classification model on top of that representation is hidden.

We'll have a quick look under the hood.

#### An example

We instantiate a tokenizer, and a model (which needs tokenized inputs). Again, it may take time as probably slightly different tokenizer + model are downloaded.

In [6]:
model = BertModel.from_pretrained("bert-base-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


The tokenizer has a vocabulary of size ~ 29k:

In [7]:
tokenizer.vocab

OrderedDict([('[PAD]', 0),
             ('[unused1]', 1),
             ('[unused2]', 2),
             ('[unused3]', 3),
             ('[unused4]', 4),
             ('[unused5]', 5),
             ('[unused6]', 6),
             ('[unused7]', 7),
             ('[unused8]', 8),
             ('[unused9]', 9),
             ('[unused10]', 10),
             ('[unused11]', 11),
             ('[unused12]', 12),
             ('[unused13]', 13),
             ('[unused14]', 14),
             ('[unused15]', 15),
             ('[unused16]', 16),
             ('[unused17]', 17),
             ('[unused18]', 18),
             ('[unused19]', 19),
             ('[unused20]', 20),
             ('[unused21]', 21),
             ('[unused22]', 22),
             ('[unused23]', 23),
             ('[unused24]', 24),
             ('[unused25]', 25),
             ('[unused26]', 26),
             ('[unused27]', 27),
             ('[unused28]', 28),
             ('[unused29]', 29),
             ('[unused30]', 30),
 

In [8]:
len(tokenizer.vocab)

28996

We tokenize the input.

In [9]:
sequence = "Adrien Ehrhardt donne un projet d'informatique aux étudiants de INF442."
tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
inputs = tokenizer.encode(sequence, return_tensors="pt")

`inputs` is the index of the tokens (as if you'd keep track of the page number in a Larousse).

In [10]:
inputs

tensor([[  101, 24930, 22500,   142,  8167, 16464,  1274,  1673,  8362,  5250,
         18836,   173,   112, 12862, 11745,  3530, 24544,   255,  7926, 10359,
          2145,  1260, 15969,  2271, 25041,  1477,   119,   102]])

Let's verify:

In [11]:
dictionary = list(tokenizer.vocab.items())
inputs_np = inputs.numpy()[0]

In [12]:
for index in inputs_np:
    print(dictionary[index])

('[CLS]', 101)
('Ad', 24930)
('##rien', 22500)
('E', 142)
('##hr', 8167)
('##hardt', 16464)
('don', 1274)
('##ne', 1673)
('un', 8362)
('pro', 5250)
('##jet', 18836)
('d', 173)
("'", 112)
('inform', 12862)
('##ati', 11745)
('##que', 3530)
('aux', 24544)
('é', 255)
('##tu', 7926)
('##dian', 10359)
('##ts', 2145)
('de', 1260)
('IN', 15969)
('##F', 2271)
('##44', 25041)
('##2', 1477)
('.', 119)
('[SEP]', 102)


We feed the resulting input into the model and retrieve the output.

In [13]:
outputs = model(inputs)[0]

In [14]:
outputs.detach().numpy()[0].shape

(28, 768)

In [15]:
outputs

tensor([[[ 0.4247,  0.1692,  0.0766,  ...,  0.1469,  0.6186, -0.0278],
         [ 0.6228, -0.3287,  0.4009,  ..., -0.1692,  0.2634, -0.0449],
         [-0.0646,  0.1704,  0.3089,  ...,  0.3286,  0.4970, -0.1050],
         ...,
         [ 0.0887,  0.0777,  0.3091,  ...,  0.6490,  0.6358,  0.0915],
         [-0.0842,  0.3524,  0.2369,  ...,  0.4639,  0.5785,  0.3579],
         [ 0.0724,  0.0754, -0.7835,  ..., -0.1286,  1.3864,  0.1951]]],
       grad_fn=<NativeLayerNormBackward0>)

Each line is the 768-dimensional representation of each of the 28 tokens in the sentence (out of a vocabulary size of ~29k)!

#### The real thing: doing the same for the CoNLL03 dataset

Let's calculate the output of the model, *i.e.* BERT's learned representation, of all the datasets we've downloaded. 

In [16]:
dataset = "eng.testb"  # change this to eng.testa / eng.testb to generate the corresponding files

In [17]:
pad_token_label_id = CrossEntropyLoss().ignore_index
# The labels in CoNLL03
labels = ["O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
# The model we'll use
model = BertModel.from_pretrained("bert-base-cased")  # Note: you might want to play with the model and associated Tokenizer!
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
# Reading the file
examples = read_examples_from_file(".", mode=f"data/{dataset}")

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# Converting the words in tokens
features = convert_examples_to_features(
    examples,
    label_list=labels,
    max_seq_length=128,
    tokenizer=tokenizer,
    cls_token_at_end=False,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,
    sep_token=tokenizer.sep_token,
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=0,
    pad_token_label_id=pad_token_label_id)

In [19]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

tensordataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

In [20]:
# As this is too computationally intensive, we'll do this by batch of 1 token to easily "stack"
# the tokens' representation (but it's not computationally efficient - although the difference in 
# performance will be more significant using a GPU)
eval_sampler = SequentialSampler(tensordataset)
eval_dataloader = DataLoader(tensordataset, sampler=eval_sampler, batch_size=1)

In [21]:
preds = []
out_label_ids = []
# This tells the model to only "evaluate" (forward-pass)
model.eval()
for batch in tqdm(eval_dataloader, desc="Evaluating"):
    # batch = tuple(t.to("gpu") for t in batch)  # if you have a compatible GPU, uncomment this
    with torch.no_grad():  # do not calculate gradients, as we won't be doing backward propagation
        inputs = {"input_ids": batch[0], 
                  "attention_mask": batch[1],
                  "token_type_ids": batch[2]}
        outputs = model(**inputs)[0].detach().cpu().numpy()  # last hidden layer from tensor to numpy array
        preds.append(outputs[0, (batch[3] != pad_token_label_id).cpu().numpy()[0], :])  # ditch padding
        out_label_ids.append(batch[3][0][(batch[3] != pad_token_label_id).cpu().numpy()[0]].cpu().numpy())

Evaluating: 100%|███████████████████████| 24171/24171 [1:12:22<00:00,  5.57it/s]


In [23]:
# Lists of 1D arrays to 2D arrays
# Each row represents a token
# We have lost the notion of sentence (but we don't care here)
# For other downstream tasks, such as Passage Retrieval, Question Answering,
# ... knowing the surrounding sentences is an important information.
representation = np.concatenate(preds, axis=0)
out_label_ids = np.concatenate(out_label_ids)
label_map = {i: label for i, label in enumerate(labels)}

In [24]:
# Convert label index (0, ..., 6) -> label ('O', 'PERS', ...)
def remap(x):
    return label_map[x]

vf = np.vectorize(remap)
true_labels = vf(out_label_ids)

In [25]:
true_labels

array(['O', 'O', 'I-LOC', ..., 'O', 'I-PER', 'O'], dtype='<U6')

In [26]:
true_labels.shape

(324919,)

In [27]:
representation

array([[ 0.26978433, -0.39845598,  0.40575168, ...,  0.01292597,
         0.24568917,  0.04700899],
       [-0.22996216, -0.19745646,  0.56731385, ...,  0.42715392,
         0.40921098, -0.06547163],
       [ 0.1031662 , -0.22714055,  0.72148806, ...,  0.29335234,
         0.70935506, -0.35829127],
       ...,
       [ 0.56735927,  0.4612093 ,  0.23810202, ...,  0.33974737,
        -0.7844839 ,  0.37409455],
       [-0.34889153, -0.24226657,  0.29281753, ...,  0.30067262,
         0.12375018,  0.5540694 ],
       [ 0.22828603, -0.7856831 , -0.08802494, ...,  0.24558005,
        -0.22777386,  0.10435171]], dtype=float32)

In [28]:
representation.shape

(324919, 768)

In [29]:
assert true_labels.shape[0] == representation.shape[0]

In [30]:
np.save(f"data/true_labels.{dataset}.npy", true_labels)

In [31]:
np.save(f"data/representation.{dataset}.npy", representation)

In [32]:
# Housekeeping
del preds, out_label_ids
gc.collect()

499

### Example of a simple classifier on top

Now that we've covered the hard part (converting words to a useful numerical representation), we can classify them as we would for iris flowers (e.g. with a Logistic Regression - here with cross-validation)!

*Note*: **yes**, these 3 lines are a satisfactory solution to Subproblem 1 (with performance metrics and comments obviously).

In [32]:
model_top = sk.linear_model.LogisticRegressionCV()

In [33]:
model_top.fit(X=representation, y=true_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [34]:
model_top.score(X=representation, y=true_labels)

0.999980515241952

### Loading the data: example

If you plan to use some Python in your project, here's how you can load the numpy data:

In [109]:
true_labels = np.load("data/true_labels.eng.train.npy")

In [42]:
true_labels

array(['I-ORG', 'O', 'I-MISC', ..., 'O', 'I-ORG', 'O'], dtype='<U6')

In [110]:
representation = np.load("data/representation.eng.train.npy")

In [43]:
representation

array([[ 0.26978433, -0.39845598,  0.40575168, ...,  0.01292597,
         0.24568917,  0.04700899],
       [-0.22996216, -0.19745646,  0.56731385, ...,  0.42715392,
         0.40921098, -0.06547163],
       [ 0.1031662 , -0.22714055,  0.72148806, ...,  0.29335234,
         0.70935506, -0.35829127],
       ...,
       [ 0.56735927,  0.4612093 ,  0.23810202, ...,  0.33974737,
        -0.7844839 ,  0.37409455],
       [-0.34889153, -0.24226657,  0.29281753, ...,  0.30067262,
         0.12375018,  0.5540694 ],
       [ 0.22828603, -0.7856831 , -0.08802494, ...,  0.24558005,
        -0.22777386,  0.10435171]], dtype=float32)

### Loading and converting to CSV: files get too big, you'll have to do this yourself and sample!

In [41]:
for file in files:
    loaded = np.load(f"data/representation.{file}.npy")
    n_samples = 10000 if file == "eng.train" else 2000
    samples = sample(list(range(loaded.shape[0])), n_samples)
    np.savetxt(f"data/representation.{file}.csv", loaded[samples,:], delimiter=",")
    true_labels = np.load(f"data/true_labels.{file}.npy")
    np.savetxt(f"data/true_labels.{file}.csv", true_labels[samples], delimiter=",", fmt='%s')