In [1]:
# Install spacy-entity-linker if it is not importable yet, then import it.
try:
    import spacy_entity_linker
except ImportError:
    import sys
    # Run pip through sys.executable so the package is installed for the
    # interpreter behind this kernel rather than whichever `pip` is on PATH.
    !{sys.executable} -m pip install spacy-entity-linker
    import spacy_entity_linker

In [2]:
import pandas as pd
import spacy
import spacy_entity_linker

# Pipeline name and input file used by this cell.
# The pipeline has to be installed beforehand, e.g.:
#   python -m spacy download en_core_web_sm
# NOTE(review): an earlier note on this cell pointed at en_core_web_lg, while
# the code loads en_core_web_sm — confirm which model is intended.
SPACY_MODEL = "en_core_web_sm"
ENTITIES_CSV = "entities_duplicates_removed0709.csv"

# spaCy pipeline; its named entities are read through `doc.ents` further down.
nlp = spacy.load(SPACY_MODEL)

# Callable from spacy-entity-linker that is applied to an entity's candidates.
classifier = spacy_entity_linker.EntityClassifier.EntityClassifier()

# Input dataset; the linking code reads its "Pattern" column.
df = pd.read_csv(ENTITIES_CSV)

# A function that filtered out fully capitalised lines used to live here.
# The input CSV has since been replaced by a pre-filtered dataset that only
# contains the extracted names, so that step was removed.

LEADING_ARTICLE = "the "


def strip_leading_article(name):
    """Return ``name`` without a leading "the ".

    The prefix is matched case-insensitively because the patterns contain both
    "the ..." and "The ..." spellings; the rest of the string is left as is.
    """
    if name.lower().startswith(LEADING_ARTICLE):
        return name[len(LEADING_ARTICLE):]
    return name


def link_org_entities(cleaned_name):
    """Run NER on ``cleaned_name`` and link every ORG entity spaCy reports.

    Uses the same linking approach as spacy-entity-linker.ipynb: build a
    TermCandidate for the entity, fetch its entity candidates and let the
    classifier pick one.

    Returns:
        A list with one item per ORG entity: the classifier's result, or None
        when the entity has no candidates. The list is empty when no ORG
        entity was detected at all.
    """
    links = []
    for ent in nlp(cleaned_name).ents:
        if ent.label_ != "ORG":
            continue
        term_candidate = spacy_entity_linker.TermCandidate.TermCandidate(ent)
        entity_candidates = term_candidate.get_entity_candidates()
        if len(entity_candidates) > 0:
            links.append(classifier(entity_candidates))
        else:
            links.append(None)
    return links


# One output record per linked ORG entity, or a single unlinked record when
# nothing was detected.
linked_entities = []

# Only the "Pattern" column is needed, so iterate over it directly instead of
# materialising every row with iterrows().
for original_name in df["Pattern"]:
    if isinstance(original_name, str):
        cleaned_name = strip_leading_article(original_name)
        links = link_org_entities(cleaned_name)
    else:
        # Empty CSV cells are read as NaN (a float); keep the row as unlinked
        # instead of failing on .lower().
        cleaned_name = original_name
        links = []

    # Patterns without any detected ORG entity are still written out (with no
    # linked entity) so that no input row silently disappears from the result.
    for entity in links or [None]:
        linked_entities.append({
            "Original Pattern": original_name,
            "Cleaned Pattern": cleaned_name,
            "Linked Entity": entity
        })

output_df = pd.DataFrame(linked_entities)
output_df.to_csv("linking_results_with_the_removed.csv", index=False)
output_df.head()

Unnamed: 0,Original Pattern,Cleaned Pattern,Linked Entity
0,the Board of Environmental Protection,Board of Environmental Protection,
1,The Department of Environmental Protection,Department of Environmental Protection,environment ministry
2,the Sierra Club,Sierra Club,Sierra Club
3,the Maine Public Health Association,Maine Public Health Association,
4,the U.S. Environmental Protection Agency,U.S. Environmental Protection Agency,


In [3]:
# Inspect the linking results to check whether removing the leading "the"
# improved linking accuracy.
output_df

Unnamed: 0,Original Pattern,Cleaned Pattern,Linked Entity
0,the Board of Environmental Protection,Board of Environmental Protection,
1,The Department of Environmental Protection,Department of Environmental Protection,environment ministry
2,the Sierra Club,Sierra Club,Sierra Club
3,the Maine Public Health Association,Maine Public Health Association,
4,the U.S. Environmental Protection Agency,U.S. Environmental Protection Agency,
...,...,...,...
187,Federal Chamber of Automotive Industries,Federal Chamber of Automotive Industries,
188,FCAI,FCAI,
189,EV Council,EV Council,
190,S&P Global Research,S&P Global Research,


In [4]:
# Number of unlinked entries before the leading "the" was stripped.
# NOTE(review): this figure is hard-coded rather than computed from the data;
# update it (or derive it) whenever the input dataset changes.
ORIGINALLY_UNLINKED = 192

# A row counts as linked when its "Linked Entity" cell is not null.
is_linked = output_df["Linked Entity"].notna()
linked_count = is_linked.sum()
unlinked_count = (~is_linked).sum()

print(f"Originally unlinked: {ORIGINALLY_UNLINKED}")
print(f"Now linked after removing 'The': {linked_count}")
print(f"Still unlinked: {unlinked_count}")


Originally unlinked: 192
Now linked after removing 'The': 48
Still unlinked: 144
