In [None]:
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics
import json
from transformers import pipeline
from tokenizers.pre_tokenizers import Whitespace

# Pretrained models

In [None]:
def plotPerplexityHist(data, ax=None):
    """Plot a normalised histogram of perplexity values with the mean marked.

    Parameters
    ----------
    data : sequence of numbers
        Perplexity values. Must be non-empty: ``statistics.mean`` raises
        ``statistics.StatisticsError`` on empty input.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used
        (``plt.gca()`` creates them if no figure exists yet).

    Returns
    -------
    matplotlib.axes.Axes
        The axes the histogram was drawn on.
    """
    if ax is None:
        ax = plt.gca()

    perplexities = pd.Series(data)
    perplexities.plot.hist(ax=ax, grid=True, density=True, bins=100, edgecolor='black')

    # Red vertical line marks the arithmetic mean of the sample.
    ax.axvline(statistics.mean(data), color="red")
    ax.set_title('Perplexity frequency')
    ax.set_xlabel('Perplexities')
    # density=True normalises the bars to a probability density, so the
    # y axis does not show raw frequencies.
    ax.set_ylabel('Density')
    return ax

## Before pretraining

In [None]:
# Load the perplexity values stored for the not-pretrained model.
with open("path/to/trained/model/not-pretrained_perplexity.json", "r", encoding="utf-8") as f:
    jsonfile = json.load(f)

# Summary statistics: mean with standard deviation, then median, min and max.
print(statistics.mean(jsonfile), " +- ", statistics.stdev(jsonfile))
for summarise in (statistics.median, min, max):
    print(summarise(jsonfile))

plotPerplexityHist(jsonfile)

## After pretraining

In [None]:
# Load the perplexity values stored for the pretrained model.
with open("path/to/trained/model/pretrained_perplexity.json", "r", encoding="utf-8") as f:
    jsonfile = json.load(f)

# Summary statistics: mean with standard deviation, then median, min and max.
print(statistics.mean(jsonfile), " +- ", statistics.stdev(jsonfile))
for summarise in (statistics.median, min, max):
    print(summarise(jsonfile))

plotPerplexityHist(jsonfile)

## Test on example texts

In [None]:
# Maps each original sentence to the same sentence with one word replaced by
# the <mask> placeholder. Insertion order of the pairs is preserved.
example_dataset = dict([
    (
        "Contrôle 1 an 1/2 après cystoprostatectomie radicale avec Bricker.",
        "Contrôle 1 an 1/2 après <mask> radicale avec Bricker.",
    ),
    (
        "A ce stade, il existe des ondes lentes diphasiques dans les deux régions frontales intermittentes.",
        "A ce stade, il existe des ondes lentes <mask> dans les deux régions frontales intermittentes.",
    ),
    (
        "Ordonnance bi-zone Prescriptions relatives au traitement de l'affection de longue durée.",
        "Ordonnance bi-zone <mask> relatives au traitement de l'affection de longue durée.",
    ),
    (
        "Le contrôle de la fistule huméro-basilique gauche est plutôt bon",
        "Le contrôle de la <mask> huméro-basilique gauche est plutôt bon",
    ),
    (
        "Déviation du dorsum nasal objectivée et subjective.",
        "Déviation du <mask> nasal objectivée et subjective.",
    ),
])

In [None]:
# One fill-mask pipeline per checkpoint, keyed by a short display name.
classifiers = {
    name: pipeline("fill-mask", checkpoint)
    for name, checkpoint in (
        ("not-pretrained", "path/to/base/model"),
        ("pretrained", "path/to/trained/model"),
    )
}

In [None]:
# For every example sentence, print the original, the masked version, and the
# tokens (with rounded scores) each fill-mask pipeline proposes for the mask.
for original_text, masked_text in example_dataset.items():
    print(original_text)
    print(masked_text)

    for classifier_name, fill_mask in classifiers.items():
        print(classifier_name+":")

        for prediction in fill_mask(masked_text):
            print(f'{prediction["token_str"]} ({round(prediction["score"], 2)!s})')

# Fine-tuned model

In [None]:
# Fine-tuning result tables (semicolon-separated CSV files), grouped first by
# base model name and then by training variant.
cross_results = {
    "camembert-base": {
        version: pd.read_csv(csv_path, sep=";")
        for version, csv_path in (
            ("not-pretrained-finetuned", "path/to/finetuned/model/eval-finetuning.csv"),
            ("pretrained-finetuned", "path/to/pretrained/and/finetuned/model/eval-finetuning.csv"),
        )
    }
}

In [None]:
# Print one row per entity label. For each score type, every result table that
# has a "<label>-<score>" column contributes one "mean ± stdev" cell, in the
# order the tables appear in cross_results.
print("label\t\tprecision\t\trecall\t\tf1-score")

labels = [
    "CodePostal", "Ville", "NomPrenom", "Voie", "IPP", "Date",
    "NoDossier", "Organisation", "SiteWeb", "EMail", "Localite", "Telephone",
]
frames = [frame for versions in cross_results.values() for frame in versions.values()]

for label in labels:
    print(label, end="")
    for score in ("precision", "recall", "f1"):
        key = f"{label}-{score}"
        for frame in frames:
            if key not in frame:
                continue
            column = frame[key]
            mean = round(statistics.mean(column), 3)
            spread = round(statistics.stdev(column), 3)
            print(f"\t\t{mean!s} ± {spread!s}", end="")
    print("")

# Risk of re-identification

In [None]:
# Read the validation JSON and keep the "file.contenu" field of every record.
raw_validset = []
with open("data/data-for-trf-validation.json", "r", encoding="utf-8") as f:
    print("loading json...")
    jsonfile = json.load(f)
    print("json loaded")
    for datum in tqdm(jsonfile):
        raw_validset.append(datum["file.contenu"])
print(len(raw_validset))

In [None]:
# Split long documents into overlapping windows of pre-tokens. Documents with
# at most `subtext_size` pre-tokens are kept whole. Longer ones are cut into
# windows of up to 2 * stride pre-tokens whose starts are `stride` apart.
pre_tokenizer = Whitespace()
embedding_dim = 512
subtext_size = embedding_dim // 2
stride = subtext_size // 2

chunked_validset = []
for seq in tqdm(raw_validset):
    tokenized_seq = pre_tokenizer.pre_tokenize_str(seq)
    token_count = len(tokenized_seq)

    if token_count <= subtext_size:
        chunked_validset.append(seq)
        continue

    for window_start in range(0, token_count - stride, stride):
        window = tokenized_seq[window_start:window_start + 2 * stride]
        # Each pre-token is indexed as (text, (start, end)); the character
        # offsets are used to slice the window out of the original string.
        first_char = window[0][1][0]
        last_char = window[-1][1][1]
        sub_seq = seq[first_char:last_char]
        if sub_seq:
            chunked_validset.append(sub_seq)
print(len(chunked_validset))

In [None]:
# Token-classification pipeline loaded from the fine-tuned checkpoint, using
# the "simple" aggregation strategy.
anon_model = pipeline(
    task="token-classification",
    model="path/to/finetuned/model",
    aggregation_strategy="simple",
)
# Set the tokenizer's maximum sequence length to 512.
anon_model.tokenizer.model_max_length = 512

In [None]:
def _mask_entities(text, entities, mask_token="<mask>"):
    """Replace the detected entities in ``text`` with ``mask_token``.

    Parameters
    ----------
    text : str
        The text the entities were detected in.
    entities : list of dict
        Entity dicts as returned by the token-classification pipeline. Each
        has a ``"word"`` key and, when the tokenizer provides offsets,
        ``"start"`` / ``"end"`` character positions.
    mask_token : str
        Placeholder inserted in place of every entity.

    Returns
    -------
    (str, list of dict)
        The masked text and the entities that were actually masked. When
        offsets are available the list is in left-to-right order, so the i-th
        entity corresponds to the i-th mask in the text.
    """
    have_offsets = all(
        entity.get("start") is not None and entity.get("end") is not None
        for entity in entities
    )

    if have_offsets:
        # Mask the exact detected spans. str.replace would hit the first
        # textual occurrence of the word, which may be a different place, or
        # nothing at all when the aggregated word differs from the raw text.
        pieces, masked_entities, cursor = [], [], 0
        for entity in sorted(entities, key=lambda e: e["start"]):
            if entity["start"] < cursor:
                # Overlaps a span that is already masked; skip it.
                continue
            pieces.append(text[cursor:entity["start"]])
            pieces.append(mask_token)
            masked_entities.append(entity)
            cursor = entity["end"]
        pieces.append(text[cursor:])
        return "".join(pieces), masked_entities

    # No offsets: fall back to replacing the first occurrence of each word,
    # keeping only the entities that were really found in the text.
    masked_text, masked_entities = text, []
    for entity in entities:
        word = entity["word"]
        if word and word in masked_text:
            masked_text = masked_text.replace(word, mask_token, 1)
            masked_entities.append(entity)
    return masked_text, masked_entities


# Anonymise every chunk and count the masks that were actually inserted.
anon_validset = []
nbMask = 0
for text in tqdm(chunked_validset):
    anon_text, masked_entities = _mask_entities(text, anon_model(text))
    anon_validset.append({"text": anon_text, "entities": masked_entities})
    nbMask += len(masked_entities)
nbMask

In [None]:
# Fill-mask pipeline used to guess the words hidden behind the masks.
deanon_model = pipeline(
    task="fill-mask",
    model="path/to/trainedmodel",
)
# Set the tokenizer's maximum sequence length to 512.
deanon_model.tokenizer.model_max_length = 512

In [None]:
# Count the masks for which the fill-mask model proposes the hidden entity word.
nbFound = 0
nbSkipped = 0  # texts the fill-mask pipeline could not process
for anon_text in tqdm(anon_validset):
    try:
        results = deanon_model(anon_text["text"])
    except Exception:
        # Best effort: skip texts the pipeline rejects (presumably mostly
        # texts without any mask token) but keep track of how many. Unlike a
        # bare `except:`, this does not swallow KeyboardInterrupt.
        nbSkipped += 1
        continue

    if not results:
        continue
    # A single result is a flat list of proposals; wrap it so that `results`
    # always holds one list of proposals per mask.
    if not isinstance(results[0], list):
        results = [results]

    # Assumes the i-th mask corresponds to the i-th stored entity. zip stops
    # at the shorter sequence instead of raising IndexError.
    for entity, proposals in zip(anon_text["entities"], results):
        word_to_find = entity["word"]
        target = word_to_find.strip().lower()
        for proposal in proposals:
            word_proposed = proposal["token_str"]
            if target == word_proposed.strip().lower():
                print("Word:", word_to_find, ", Proposed:", word_proposed)
                nbFound += 1
                # Count each mask at most once, even when several proposals
                # match case-insensitively.
                break

print("Texts skipped because the fill-mask pipeline failed:", nbSkipped)

In [None]:
# Number of case-insensitive matches between a masked entity word and a token
# proposed by the fill-mask model.
nbFound

In [None]:
# Matches divided by the number of entities counted during anonymisation.
# Raises ZeroDivisionError when nbMask is 0.
nbFound / nbMask