#### Step 1: Load input texts and ground truth labels

In [28]:
import json
import pandas as pd

In [None]:
# Input file choice: new_texts or new_texts_FN.
# The file is parsed as JSON even though it carries a .txt extension.
# It is opened explicitly as UTF-8 so that decoding does not depend on the
# platform's default locale encoding.
with open('new_texts.txt', 'r', encoding='utf-8') as f:
    input_texts = json.load(f)

#### Step 2: Run Presidio on input text files

In [None]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import NlpEngineProvider

# NLP engine settings: spaCy backend with the "en_core_web_trf" model
# registered for English.
english_model = {"lang_code": "en", "model_name": "en_core_web_trf"}
configuration = {"nlp_engine_name": "spacy", "models": [english_model]}

# Build the NLP engine from the settings above.
provider = NlpEngineProvider(nlp_configuration=configuration)
nlp_engine = provider.create_engine()

# English-only analyzer backed by that engine; used by the cells below.
analyzer = AnalyzerEngine(nlp_engine=nlp_engine, supported_languages=["en"])

In [32]:
def analyze_text(input_text):
    """Run the module-level ``analyzer`` over one text, looking for PERSON only.

    Args:
        input_text: The text to scan.

    Returns:
        The value returned by ``analyzer.analyze`` for this text. The call
        requests English-language analysis, restricts detection to the
        ``PERSON`` entity, and asks for the decision process to be included.
    """
    analysis_options = {
        "entities": ['PERSON'],
        "language": "en",
        "return_decision_process": True,
    }
    return analyzer.analyze(text=input_text, **analysis_options)


In [None]:
# Accumulate one row per detection: [file index, matched text, (start, end)].
detected_entities = []
for file_idx, text in enumerate(input_texts):
    print(f"Processing File {file_idx}")
    results_analyzed = analyze_text(text)

    # Each result carries start/end character offsets into the analyzed text;
    # slicing with them recovers the detected span.
    detected_entities.extend(
        [file_idx, text[hit.start:hit.end], (hit.start, hit.end)]
        for hit in results_analyzed
    )

# Total number of files: 260

In [None]:
# Tabulate the detections, one row per detected entity. The trailing bare
# name displays the frame as the notebook cell's output.
column_names = ['file_idx', 'entity_text', 'positions']
df = pd.DataFrame(data=detected_entities, columns=column_names)
df

In [None]:
from pathlib import Path

output_file = 'results/TSCC_detected_pre_FN.csv'

# to_csv does not create missing directories. Make sure the target folder
# exists so the detection results are not lost to a failed save.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# Write the detections without the DataFrame's index column.
df.to_csv(output_file, index=False)

print(f"CSV file saved as {output_file}")