In [1]:
import pandas as pd
import spacy
import spacy_entity_linker

# spaCy pipeline used for named-entity recognition. Install it first with:
#   python -m spacy download en_core_web_sm
# NOTE(review): confirm whether en_core_web_sm or en_core_web_lg is the intended model.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

# Read the news-article table that the rest of the notebook processes
NEWS_CSV_PATH = "emission_rule_news_steven.csv"
df = pd.read_csv(NEWS_CSV_PATH)

# Function to filter out fully capitalized lines
def filter_capitalized_paragraphs(text):
    """Remove all-uppercase lines of three or more words from ``text``.

    The text is split on ``"\\n"``. A line is dropped when ``str.isupper()``
    is true for it and it contains more than two whitespace-separated words.
    The remaining lines are joined with a single space.

    Args:
        text: The string to clean.

    Returns:
        The surviving lines joined by ``" "``.
    """
    kept = []
    for segment in text.split("\n"):
        word_count = len(segment.split())
        # Short upper-case lines (one or two words) are kept on purpose
        if segment.isupper() and word_count > 2:
            continue
        kept.append(segment)
    return " ".join(kept)

# Drop all-caps boilerplate lines from every article. Missing text is replaced by ""
# first: a bare astype(str) would turn NaN into the literal string "nan", which
# would then be fed to the NER pipeline as if it were article text.
df['filtered_maintext'] = (
    df['maintext'].fillna("").astype(str).apply(filter_capitalized_paragraphs)
)

# Columns of the output table. Passing them explicitly keeps the CSV header
# present even when no ORG mention is found at all.
OUTPUT_COLUMNS = ["Article ID", "Label", "Pattern", "Entity", "Publish Date", "Article URL"]

# One record (dict) per ORG mention, accumulated across all articles
extracted_entities = []

# Built once and reused for every article: constructing the classifier inside
# the loop is loop-invariant work.
classifier = spacy_entity_linker.EntityClassifier.EntityClassifier()

for index, row in df.iterrows():
    article_id = row['id']  # Get the Article ID
    article_text = row['filtered_maintext']  # Get the filtered text
    publish_date = row['date_publish']
    url = row['news_url']

    # Run the spaCy pipeline on the article
    doc = nlp(article_text)

    for ent in doc.ents:
        # Only organisation mentions are linked
        if ent.label_ != "ORG":
            continue

        # Build a term candidate (a simple span) and fetch all entity candidates for it
        term_candidate = spacy_entity_linker.TermCandidate.TermCandidate(ent)
        entity_candidates = term_candidate.get_entity_candidates()

        # Select the best candidate; None marks a mention that could not be linked
        entity = classifier(entity_candidates) if len(entity_candidates) > 0 else None

        extracted_entities.append({
            "Article ID": article_id,  # Add the Article ID
            "Label": ent.label_,
            "Pattern": ent.text,
            "Entity": entity,
            "Publish Date": publish_date,
            "Article URL": url
        })

output_df = pd.DataFrame(extracted_entities, columns=OUTPUT_COLUMNS)

# Save the DataFrame to a CSV file
output_file_path = "emission_extracted_entities_org_linker.csv"
output_df.to_csv(output_file_path, index=False)

In [2]:
# Show the extracted-entity table (one row per ORG mention) as this cell's output
output_df

Unnamed: 0,Article ID,Label,Pattern,Entity,Publish Date,Article URL
0,32,ORG,U.S. Postal Service,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
1,32,ORG,USPS,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
2,32,ORG,SEIS,Seismic Experiment for Interior Structure,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
3,32,ORG,Next Generation Delivery Vehicles,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
4,32,ORG,SEIS,Seismic Experiment for Interior Structure,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...
...,...,...,...,...,...,...
59804,91445,ORG,the White House,White House,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...
59805,91445,ORG,Senate,United States Senate,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...
59806,91445,ORG,Administration,receivership,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...
59807,91445,ORG,EPA,United States Environmental Protection Agency,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...


In [3]:
# POST-PROCESS: retry entity linking, with a leading "the " removed,
# only on rows whose Entity is missing in the baseline output.

# Baseline output written by the extraction step
baseline_path = "emission_extracted_entities_org_linker.csv"
df0 = pd.read_csv(baseline_path)

# Pipeline and entity classifier used for the retry
nlp = spacy.load("en_core_web_sm")
classifier = spacy_entity_linker.EntityClassifier.EntityClassifier()

# Refinement: case-insensitive strip of leading "the "
def strip_leading_the(name: str) -> str:
    """Return ``name`` without surrounding whitespace and without a leading "the".

    The article is matched case-insensitively and must be a separate word:
    it has to be followed by at least one whitespace character and some
    further text. Any run of whitespace after the article is removed with it,
    so "the  White House" and "the\\tWhite House" both become "White House".

    Args:
        name: The mention text. Non-string values (e.g. NaN from a CSV
            round-trip) are returned unchanged.

    Returns:
        The cleaned string, or ``name`` itself when it is not a string.
    """
    if not isinstance(name, str):
        return name
    s = name.strip()
    # split(None, 1) splits on the first whitespace run, so the remainder
    # never starts with leftover spaces or tabs
    parts = s.split(None, 1)
    if len(parts) == 2 and parts[0].lower() == "the":
        return parts[1]
    return s

# Prepare a working copy and new columns.
# Note: this rebinds `df` to the entity table loaded from the baseline CSV.
df = df0.copy()
df["Cleaned Pattern"] = df["Pattern"]
df["Entity_TheFix"] = df["Entity"]

# Only work on rows that failed to link in baseline
mask = df["Entity"].isna()

# cleaned pattern -> linked entity (or None). Each distinct string is run
# through the pipeline once, however many rows share it.
relink_cache = {}

# NOTE(review): every unlinked row is retried, including rows whose pattern has
# no leading "the", so `improved` counts all successful retries rather than
# only those caused by the stripping itself.
improved = 0
for i, original in df.loc[mask, "Pattern"].items():
    cleaned = strip_leading_the(original)
    df.at[i, "Cleaned Pattern"] = cleaned

    # A pattern that read_csv parsed as a missing value (e.g. the text "NA")
    # is not a string and cannot be passed to nlp(); leave the row unlinked.
    if not isinstance(cleaned, str):
        continue

    if cleaned not in relink_cache:
        # Try linking the cleaned name: take the first ORG span the pipeline finds
        entity_fix = None
        doc = nlp(cleaned)
        for ent in doc.ents:
            if ent.label_ == "ORG":
                tc = spacy_entity_linker.TermCandidate.TermCandidate(ent)
                cands = tc.get_entity_candidates()
                entity_fix = classifier(cands) if cands else None
                break
        relink_cache[cleaned] = entity_fix

    entity_fix = relink_cache[cleaned]
    if entity_fix is not None:
        df.at[i, "Entity_TheFix"] = entity_fix
        improved += 1

# Print a tiny summary
baseline_unlinked = df0["Entity"].isna().sum()
thefix_unlinked   = df["Entity_TheFix"].isna().sum()
print(f"Baseline unlinked (Entity == None): {baseline_unlinked}")
print(f"Unlinked after 'the' fix (Entity_TheFix == None): {thefix_unlinked}")
print(f"Newly linked due to 'the' fix: {improved}")

# Save the comparison-enhanced file
df.to_csv("emission_extracted_entities_org_linker_thefix.csv", index=False)


Baseline unlinked (Entity == None): 17970
Unlinked after 'the' fix (Entity_TheFix == None): 13483
Newly linked due to 'the' fix: 4487


In [4]:
# Show the comparison table, including the "Cleaned Pattern" and "Entity_TheFix" columns
df

Unnamed: 0,Article ID,Label,Pattern,Entity,Publish Date,Article URL,Cleaned Pattern,Entity_TheFix
0,32,ORG,U.S. Postal Service,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...,U.S. Postal Service,
1,32,ORG,USPS,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...,USPS,
2,32,ORG,SEIS,Seismic Experiment for Interior Structure,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...,SEIS,Seismic Experiment for Interior Structure
3,32,ORG,Next Generation Delivery Vehicles,,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...,Next Generation Delivery Vehicles,
4,32,ORG,SEIS,Seismic Experiment for Interior Structure,2023-07-26 10:42:02,https://www.commondreams.org/opinion/usps-elec...,SEIS,Seismic Experiment for Interior Structure
...,...,...,...,...,...,...,...,...
59804,91445,ORG,the White House,White House,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...,the White House,White House
59805,91445,ORG,Senate,United States Senate,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...,Senate,United States Senate
59806,91445,ORG,Administration,receivership,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...,Administration,receivership
59807,91445,ORG,EPA,United States Environmental Protection Agency,2023-09-15 12:50:33,https://www.oann.com/newsroom/house-passes-bil...,EPA,United States Environmental Protection Agency
