In [1]:
import pandas as pd
import re
import unicodedata
import json
from pathlib import Path
from transformers import AutoTokenizer, pipeline
from transformers import CamembertTokenizer

In [2]:
# Folder that holds the article .txt files to analyse.
# NOTE(review): this is an absolute, user-specific path; the notebook will find
# no files on another machine. Consider a configurable, relative data directory.
input_folder = Path(r"C:\Users\bauke\OneDrive - KU Leuven\Documents\Documenten\5 digital humanities\stage\articles-verbetering")

# Column-oriented accumulator: one entry per file in each list.
data = {"filename": [], "text": []}

# Path.glob yields entries in an arbitrary, filesystem-dependent order, so the
# paths are sorted to make the row order of `df` reproducible across runs.
for file_path in sorted(input_folder.glob("*.txt")):
    data["filename"].append(file_path.name)
    data["text"].append(file_path.read_text(encoding="utf-8"))

# One row per article: the file name and its raw, unmodified text.
df = pd.DataFrame(data)

print(df.head())


                                            filename  \
0  BE-KBR00_12126493_19340520_00_00_00_0_01_0001_...   
1  BE-KBR00_12126493_19340520_00_00_00_0_01_0001_...   
2  BE-KBR00_12126493_19340708_00_00_00_0_01_0024_...   
3  BE-KBR00_12126493_19340909_00_00_00_0_01_0002_...   
4  BE-KBR00_12126493_19341223_00_00_00_0_01_0001_...   

                                                text  
0  (Photo E. Delhaes,)\nLe Speaker humoristique\n...  
1  PROGRAMME\n13 heures — Concert de gala présent...  
2  Un prince des humoristes\nMarcel Antoine,\nle ...  
3  Radio qui rit\nAu hasard du VIe Salon\nPortrai...  
4  LES SUN'KIST'S\nChanteurs, danseurs et guitari...  


In [3]:
def preprocess_text(text):
    """Normalise raw article text into a single, whitespace-clean line.

    Steps, applied in this order:
      1. Unicode NFKC normalisation (compatibility characters such as
         ligatures are folded into their plain equivalents).
      2. Words hyphenated across a line break are glued back together:
         a "-" directly followed by a newline is removed when it sits
         between two word characters.
      3. Every run of whitespace (newlines included) becomes one space and
         the result is stripped at both ends.
      4. A space is inserted between a word character and a directly
         following ``.``, ``,``, ``;``, ``!`` or ``?``.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned text as a ``str``.
    """
    text = unicodedata.normalize("NFKC", text)
    # Zero-width lookarounds leave the neighbouring letters unconsumed, so a
    # word broken over several consecutive lines ("inter-\nnatio-\nnale") is
    # rejoined completely. A consuming pattern such as (\w+)-\n(\w+) swallows
    # the middle fragment and therefore misses the second break.
    text = re.sub(r"(?<=\w)-\n(?=\w)", "", text)
    # \s already matches "\n", so a separate newline pass is not needed.
    text = re.sub(r"\s+", " ", text).strip()
    # NOTE(review): this also separates punctuation inside tokens such as
    # decimals ("3.5" -> "3 .5") and initials ("E." -> "E ."); confirm that
    # this is what the downstream NER step wants.
    text = re.sub(r"(\w)([.,;!?])", r"\1 \2", text)
    return text

# Clean every article; the raw "text" column is left untouched.
df["cleaned_text"] = df["text"].map(preprocess_text)

In [6]:
# Model identifier shared by the tokenizer and the pipeline so that both are
# always loaded from the same checkpoint.
NER_MODEL_NAME = "Jean-Baptiste/camembert-ner"

tokenizer = CamembertTokenizer.from_pretrained(NER_MODEL_NAME)

# Token-classification pipeline; extract_entities below reads the
# "entity_group" and "word" keys of each result it returns.
ner_pipeline = pipeline(
    "ner",
    model=NER_MODEL_NAME,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

def extract_entities(text):
    """Run the NER pipeline on ``text`` and bucket the entities by type.

    Entities whose ``entity_group`` is ``PER``, ``LOC`` or ``ORG`` are kept;
    every other group is ignored. Within each bucket the words appear in the
    order the pipeline returned them, duplicates included.

    NOTE(review): the pipeline may truncate inputs longer than the model's
    maximum sequence length, in which case entities near the end of long
    articles would be missed. Confirm whether chunking is needed.

    Args:
        text: The text to analyse.

    Returns:
        A dict with the keys ``"people"``, ``"locations"`` and
        ``"organisations"``, each mapping to a list of entity strings.
    """
    # Pipeline label -> key in the returned dict (insertion order is the
    # order of the keys in the result).
    bucket_for_label = {
        "PER": "people",
        "LOC": "locations",
        "ORG": "organisations",
    }
    grouped = {bucket: [] for bucket in bucket_for_label.values()}

    for entity in ner_pipeline(text):
        bucket = bucket_for_label.get(entity["entity_group"])
        if bucket is not None:
            grouped[bucket].append(entity["word"])

    return grouped

# One dict of entity lists per article (keys: people / locations / organisations).
df["ner_results"] = df["cleaned_text"].map(extract_entities)

Device set to use cpu


In [7]:
# Nested export: file name -> raw text, cleaned text and the three entity lists.
json_output = {
    filename: {
        "text": raw_text,
        "cleaned_text": cleaned_text,
        "people": entities["people"],
        "locations": entities["locations"],
        "organisations": entities["organisations"],
    }
    for filename, raw_text, cleaned_text, entities in zip(
        df["filename"], df["text"], df["cleaned_text"], df["ner_results"]
    )
}

output_file = "ner_results3.json"
# ensure_ascii=False keeps accented characters readable in the written file.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(json_output, f, ensure_ascii=False, indent=4)

print(f"NER results saved to {output_file}")

NER results saved to ner_results3.json


In [11]:
# Flat export: one record per article, with each entity list de-duplicated
# and collapsed into a single comma-separated string.
flat_json_output = [
    {
        "filename": key,
        "text": value["text"],
        "cleaned_text": value["cleaned_text"],
        # dict.fromkeys removes duplicates while keeping first-appearance
        # order. Joining a bare set() gives an order that can differ from one
        # run to the next, which makes the exported file non-reproducible.
        "people": ", ".join(dict.fromkeys(value["people"])),
        "locations": ", ".join(dict.fromkeys(value["locations"])),
        "organisations": ", ".join(dict.fromkeys(value["organisations"])),
    }
    for key, value in json_output.items()
]

output_file2 = "flat_ner_results.json"
with open(output_file2, "w", encoding="utf-8") as f:
    json.dump(flat_json_output, f, indent=4, ensure_ascii=False)

# Report the file this cell actually wrote (output_file2, not the nested
# export's output_file).
print(f"flat NER results saved to {output_file2}")

flat NER results saved to ner_results3.json
