In [10]:
from pathlib import Path
import sys
import pandas as pd
from pathlib import Path
import os

current_dir = Path(os.path.dirname(os.path.realpath("__file__")))
project_root = current_dir.parent.parent
text_files_dir = project_root / "text_files"

from ner_extractor import ExtractEntities, aggregate_entities

# Initialize the entity extractor
extractor = ExtractEntities(model="ctrlbuzz/bert-addresses")

# Filter files from 2018 onwards
file_paths = [os.path.join(text_files_dir, f) for f in os.listdir(text_files_dir) if f.startswith(("2018-", "2019-", "2020-", "2021-", "2022-", "2023-"))]

In [12]:
# Process each file and aggregate entities
dataframes = []
for file_path in file_paths:
    report_number = os.path.basename(file_path).split('.')[0]
    with open(file_path, 'r') as file:
        text = file.read()
    df = extractor.create_dataframe(report_number, text, counts=True)
    aggregated_df = aggregate_entities(df)
    dataframes.append(aggregated_df)

In [13]:
# Concatenate all dataframes into one
final_df = pd.concat(dataframes, ignore_index=True)
final_df

Unnamed: 0,entity,label,report_number
0,COPA,ORG,2018-1088074
1,Ernesto Guzman - Sanchez,PER,2018-1088074
2,GPA,PER,2018-1088074
3,Sanchez,PER,2018-1088074
4,C,ORG,2018-1088074
...,...,...,...
5537,C PD,ORG,2021-0000885
5538,Chan,PER,2021-0000885
5539,Zeman,PER,2021-0000885
5540,Al,PER,2021-0000885


In [15]:
from fuzzywuzzy import process
from transformers import pipeline

# Filtering for 'PER' and 'addr' labels
relevant_entities_df = final_df[final_df['label'].isin(['PER', 'addr'])]

# Load the CSV file with true labeled data
true_labels_df = pd.read_csv('../combined_entities.csv')

In [29]:
# Filter out entities based on character length and capitalization
filtered_df = relevant_entities_df[(relevant_entities_df['entity'].str.len() >= 4) &
                                   (relevant_entities_df['entity'].str.len() <= 27) &
                                   (~relevant_entities_df['entity'].str.isupper())]


In [30]:
filtered_df.head(20)

Unnamed: 0,entity,label,report_number
1,Ernesto Guzman - Sanchez,PER,2018-1088074
3,Sanchez,PER,2018-1088074
10,Lukasz Gorczynski,PER,2019-1092446
11,Gorc,PER,2019-1092446
12,Matthew Scott,PER,2019-1092446
14,Hoffman,PER,2019-1092446
15,Star White,PER,2019-1092446
16,Scott Star,PER,2019-1092446
17,Gregory,PER,2019-1092446
23,Ngitamiita,PER,2019-0002426


In [31]:
sampled_df = filtered_df.sample(n=500, random_state=1)  # Sample 500 rows

In [28]:
# Iterate over entities in final_df and find best match in true_labels_df
for index, row in sampled_df.iterrows():
    entity = row['entity']
    best_match = process.extractOne(entity, true_labels_df['Entity'])
    if best_match[1] > 90:  # If similarity score is above 90
        print(f"Entity {entity} matched with {best_match[0]} with score {best_match[1]}")


Entity Ernesto Guzman - Sanchez matched with Ernesto Guzman-Sanchez with score 96
Entity Lukasz Gorczynski matched with Lukasz Gorczynski with score 100
Entity Matthew Scott matched with Matthew Scott with score 100
Entity Hoffman matched with R Hoffman with score 95
Entity Gregory matched with R Gregory with score 95
Entity  W Belmont matched with W Belmont Ave with score 95
Entity Anthony Nicpo matched with Anthony Nicpon with score 96
Entity Brian Conliskp matched with Brian Conlisk with score 96
Entity Raul Alvarez matched with Raul Alvarez with score 100
Entity Harry Vazquez Jr matched with Harry Vazquez with score 95
Entity Robert Woods matched with Robert Woods with score 100
Entity Raymond Morris matched with Raymond Morris with score 100
Entity Michael VanBoldrik matched with Michael Vanboldrik with score 100
Entity Javier Celio matched with Javier Celio with score 100


KeyboardInterrupt: 

In [36]:
from fuzzywuzzy import fuzz

# Assume relevant_entities_df and true_labels_df are your dataframes
# Mapping final_df labels to true_labels_df labels
label_mapping = {'PER': 'PERSON', 'addr': 'LOC'}

matches = 0

for index, row in sampled_df.iterrows():
    entity = row['entity']
    label = row['label']
    max_score = 0
    
    for _, true_row in true_labels_df[true_labels_df['Label'] == label_mapping[label]].iterrows():
        score = fuzz.ratio(entity.lower(), true_row['Entity'].lower())
        max_score = max(max_score, score)

    if max_score >= 85:  # Assuming 85% as the threshold
        matches += 1

accuracy = matches / len(sampled_df) * 100
print(f"Accuracy: {accuracy}%")

Accuracy: 50.4%


In [37]:
matches = 0

for index, row in sampled_df.iterrows():
    entity = row['entity']
    label = row['label']
    max_score = 0
    
    for _, true_row in true_labels_df[true_labels_df['Label'] == label_mapping[label]].iterrows():
        score = fuzz.ratio(entity.lower(), true_row['Entity'].lower())
        max_score = max(max_score, score)

    if max_score >= 80:  # Assuming 80% as the threshold
        matches += 1

accuracy = matches / len(sampled_df) * 100
print(f"Accuracy: {accuracy}%")

Accuracy: 55.800000000000004%
