In [None]:
!python -m spacy download fr_core_news_lg

In [None]:
import spacy
import pandas as pd
import json
import os
import re
import unicodedata

# Load the French spaCy pipeline once at module level; extract_entities()
# calls this shared `nlp` object for every article.
# The "fr_core_news_lg" package must be installed before this line runs.
nlp = spacy.load("fr_core_news_lg")

# Folder that is scanned for the article ".txt" files.
# NOTE(review): machine-specific absolute Windows path — adjust it when
# running on another machine.
folder_path = r"C:\Users\bauke\OneDrive - KU Leuven\Documents\Documenten\5 digital humanities\stage\articles-verbetering"

# Function to read article content
def read_article(file_path):
    """Return the entire contents of the UTF-8 text file at *file_path*."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

# Preprocessing function 
def preprocess_text(text):
    """Return a normalised, lowercased, punctuation-free version of *text*.

    Steps, in order:
      1. Unicode NFKC normalisation (e.g. ligatures and full-width forms are
         folded into their compatibility equivalents).
      2. Lowercasing.
      3. Removal of every character that is neither a word character
         (letters, digits, underscore) nor whitespace.
      4. Collapsing of every whitespace run into a single space.
      5. Stripping of leading/trailing whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; an empty string if nothing but punctuation and
        whitespace was present.
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.lower()
    # Punctuation must be removed *before* whitespace is collapsed: deleting
    # a stand-alone mark such as " - " or " , " otherwise leaves two adjacent
    # spaces in the result.
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Function to extract named entities (people & places)
def extract_entities(text):
    """Extract the distinct person and place names spaCy finds in *text*.

    The text is run through the module-level ``nlp`` pipeline. Entities
    labelled ``"PER"`` are collected as people; entities labelled ``"LOC"``
    or ``"GPE"`` are collected as places.

    Args:
        text: The string to analyse.

    Returns:
        A dict with two keys, ``"people"`` and ``"places"``, each mapping to
        a list of unique entity strings in order of first appearance.
    """
    doc = nlp(text)
    people = [ent.text for ent in doc.ents if ent.label_ == "PER"]  # Person
    places = [ent.text for ent in doc.ents if ent.label_ in ("LOC", "GPE")]  # Locations
    # dict.fromkeys removes duplicates while keeping first-occurrence order.
    # list(set(...)) would yield an arbitrary order that can differ between
    # interpreter runs (string hash randomisation), making the exported
    # results non-reproducible.
    return {
        "people": list(dict.fromkeys(people)),
        "places": list(dict.fromkeys(places)),
    }

# Load articles into a DataFrame
# One result row per ".txt" article found directly inside folder_path.
data = []
# os.listdir returns entries in arbitrary order; sort so the CSV rows come
# out in the same order on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".txt"):
        continue
    file_path = os.path.join(folder_path, filename)
    text = read_article(file_path)
    clean_text = preprocess_text(text)
    # Run NER on the original text, not on clean_text: lowercasing and
    # stripping punctuation removes the capitalisation and sentence-boundary
    # cues the statistical model relies on to recognise names.
    entities = extract_entities(text)
    data.append({
        "filename": filename,
        "preprocessed_text": clean_text,
        "people": entities["people"],
        "places": entities["places"],
    })

# Explicit columns keep the CSV header intact even when no article was found.
df = pd.DataFrame(
    data,
    columns=["filename", "preprocessed_text", "people", "places"],
)

# The list-valued "people"/"places" cells are written as their string form.
df.to_csv("spacy_ner_results.csv", index=False)

