In [None]:
!python3 -m spacy download xx_ent_wiki_sm
import spacy
import pandas as pd
from tqdm import tqdm
import json

def load_spacy_ner(model_s: str = "xx_ent_wiki_sm"):
    """Load a spaCy pipeline with every listed non-NER component disabled.

    Args:
        model_s: Name of the installed spaCy pipeline package to load.

    Returns:
        Whatever ``spacy.load`` returns for ``model_s``.
    """
    # Components that are switched off at load time; "ner" is deliberately not listed.
    disabled_components = ["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"]
    return spacy.load(model_s, disable=disabled_components)

# Name of the spaCy pipeline package handed to load_spacy_ner.
model_str = 'xx_ent_wiki_sm'
# Load the pipeline once at module level so it can be reused as the `nlp` argument elsewhere.
ner = load_spacy_ner(model_s=model_str)


Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m74.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import json
from collections import Counter
import pandas as pd
from bs4 import BeautifulSoup

def find_word_in_url (url: str, search_list: list) -> bool:
    """Tell whether any entry of ``search_list`` occurs as a substring of ``url``.

    Args:
        url: The URL to inspect.
        search_list: Substrings to look for (matching is case-sensitive).

    Returns:
        True as soon as one entry is found in ``url``; False otherwise,
        including when ``search_list`` is empty.
    """
    return any(fragment in url for fragment in search_list)

# remove social media and music sites
def filter_websites (data: list, socialMedia: list) -> list:
    """Keep only the pages whose URL contains none of the ``socialMedia`` fragments.

    Args:
        data: Page records; each must provide a ``'url'`` entry.
        socialMedia: URL substrings that mark a page for removal.

    Returns:
        A new list with the surviving page records, in their original order.
    """
    return [
        page
        for page in data
        # a page survives when no blocked fragment appears in its url
        if not find_word_in_url(url=page['url'], search_list=socialMedia)
    ]

def filter_on_name(data: list, nameDict: dict) -> list:
    """Keep the pages whose text mentions the name of the person they belong to.

    The person id is the integer part of the page key (e.g. ``"12.1"`` -> ``12``)
    and is looked up in ``nameDict``. The name comparison is case-insensitive.

    Pages are skipped (not raised on) when the id is unknown, when the looked-up
    name is not a non-blank string (e.g. NaN coming from a spreadsheet), or when
    the page has no string ``'full_text'``.

    Args:
        data: Page records; each must provide ``'key'`` and should provide
            ``'full_text'``.
        nameDict: Mapping from integer person id to the searched name.

    Returns:
        A new list of *copies* of the matching records, each with an added
        ``'name'`` entry. The input records are left untouched.

    Raises:
        KeyError: If a record has no ``'key'``.
        ValueError: If the part of the key before the first '.' is not an integer.
    """
    filt_data = []
    for website in data:
        # isolate e.g. "12.1" -> int(12); str() also accepts numeric keys like 12.1
        id_name = int(str(website['key']).split('.')[0])

        # retrieve search name; unknown ids simply cannot match
        name = nameDict.get(id_name)
        if not isinstance(name, str) or not name.strip():
            # a blank name would be a substring of every text, so never match on it
            continue

        full_text = website.get('full_text')
        if not isinstance(full_text, str):
            continue

        # if name in website text add to filtered results
        if name.lower() in full_text.lower():
            # copy instead of update() so the caller's records are not mutated
            filt_data.append({**website, 'name': name})

    return filt_data

def post_process_names_d (results: "list | None" = None) -> list:
    """Run the two page filters in sequence: blocked sites first, then name match.

    Reads the module-level ``social_medias`` (URL fragments to drop) and
    ``gt_id_name_d`` (person id -> name) at call time.

    Args:
        results: Page records to filter. ``None``, ``False`` or an empty list
            all mean "no pages".

    Returns:
        The records that survive both ``filter_websites`` and ``filter_on_name``;
        an empty list when there is nothing to filter.
    """
    # The former default (False) is not iterable and crashed in filter_websites;
    # treat every falsy input as an empty result set instead.
    if not results:
        return []

    # remove social medias
    results = filter_websites(data=results,
                              socialMedia=social_medias)

    # remove text that don't include name from searched person
    results = filter_on_name(data=results,
                             nameDict=gt_id_name_d)

    return results



In [None]:
def parse_fulltext_names(text: str, nlp) -> list[dict]:
    """Extract the person entities found in ``text``.

    The text is split on ``'.'`` and every non-blank fragment is run through
    ``nlp``; entities labelled ``"PER"`` are collected in order of appearance.

    Args:
        text: Raw text to analyse. Empty or ``None`` yields an empty list.
        nlp: Callable mapping a string to an object with an ``ents`` iterable
            whose items expose ``text`` and ``label_``. When it also offers a
            callable ``pipe`` attribute, that is used to process all fragments
            in one batched pass.

    Returns:
        One ``{'word': <entity text>, 'label': 'PER'}`` dict per person entity.
    """
    if not text:
        return []

    # NOTE: this is a naive sentence split; it also cuts at abbreviations and
    # initials. Blank fragments cannot contain entities, so they are dropped.
    fragments = [fragment for fragment in text.split('.') if fragment.strip()]

    # Batched processing when available, plain per-fragment calls otherwise.
    pipe = getattr(nlp, 'pipe', None)
    docs = pipe(fragments) if callable(pipe) else map(nlp, fragments)

    # The label check alone identifies persons; comparing the glossary wording
    # for "PER" as well added nothing and tied the result to that wording.
    return [
        {'word': ent.text, 'label': ent.label_}
        for doc in docs
        for ent in doc.ents
        if ent.label_ == "PER"
    ]

def sort_ner_results_to_df(raw_data: list, nlp) -> pd.DataFrame:
    """Tabulate the person names found on each page as source -> target rows.

    For every page, ``parse_fulltext_names`` is run on its ``'full_text'``.
    A found name becomes a row only when it contains a space and is not a
    substring of the page's own ``'name'``.

    Args:
        raw_data: Page records providing ``'url'``, ``'name'``, ``'full_text'``
            and ``'key'``.
        nlp: Forwarded unchanged to ``parse_fulltext_names``.

    Returns:
        DataFrame with the columns ``source_key``, ``source_name``,
        ``target_name``, ``score`` (always ``None``) and ``url``.
    """
    column_names = ('source_key', 'source_name', 'target_name', 'score', 'url')
    table = {column: [] for column in column_names}

    for page in tqdm(raw_data):
        page_url = page['url']
        person = page['name']
        body = page['full_text']
        page_key = page['key']

        for hit in parse_fulltext_names(text=body, nlp=nlp):
            candidate = hit['word']

            # drop mentions of the searched person and single-token names
            if candidate in person or ' ' not in candidate:
                continue

            table['source_key'].append(page_key)
            table['url'].append(str(page_url))
            table['source_name'].append(person)
            table['target_name'].append(candidate)
            table['score'].append(None)

    return pd.DataFrame(table)


In [None]:
from random import choices

path_web_jsonl = 'gtWebText2.jsonl'
path_gt_df = 'gt_contacts.xlsx'

# Ground-truth sheet; its `id` column is mapped to its `name` column.
gt_df = pd.read_excel(path_gt_df)
gt_id_name_d = dict(zip(gt_df['id'], gt_df['name']))


# URL fragments of sites whose pages are dropped before NER.
social_medias = ['marktplaats', 'spotify', 'tiktok', 'linkedin', 'facebook',
                 'music.apple', 'youtube', 'soundcloud', 'play.google', 'imdb']

# open scraped websites texts
with open(path_web_jsonl, encoding='utf-8-sig') as json_file:
    # One JSON object per line. Blank lines (e.g. a trailing newline at the end
    # of the file) are skipped because json.loads rejects them.
    results = [json.loads(json_str) for json_str in json_file if json_str.strip()]

# filter SM, music, and movies websites
filtered_pages_l = post_process_names_d(results=results)

# NOTE(review): `choices` samples WITH replacement and is unseeded, and
# `test_pages_l` is not read again in this cell - confirm it is still needed.
# Guarded because `choices` raises IndexError on an empty population.
test_pages_l = choices(filtered_pages_l, k=10) if filtered_pages_l else []

ner_df_spacy = sort_ner_results_to_df (raw_data=filtered_pages_l,
                                       nlp=ner)

print(type(ner_df_spacy))

print(ner_df_spacy.head(3))



100%|██████████| 1091/1091 [06:37<00:00,  2.74it/s]

<class 'pandas.core.frame.DataFrame'>
  source_key  source_name    target_name score  \
0       60.0  Tess Stepek  Subject Staff  None   
1       60.0  Tess Stepek   Hugo Damstra  None   
2       60.0  Tess Stepek  Amy Moerkerke  None   

                                                 url  
0  https://www.uu.nl/en/news/utrecht-students-rec...  
1  https://www.uu.nl/en/news/utrecht-students-rec...  
2  https://www.uu.nl/en/news/utrecht-students-rec...  





In [None]:
import re
import pandas as pd
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

def clean_string(s):
    """Remove every match of openpyxl's ``ILLEGAL_CHARACTERS_RE`` from a string.

    Args:
        s: Any cell value.

    Returns:
        The cleaned string when ``s`` is a ``str``; otherwise ``s`` itself,
        unchanged.
    """
    # non-string cells (numbers, None, ...) pass straight through
    if not isinstance(s, str):
        return s
    return ILLEGAL_CHARACTERS_RE.sub("", s)

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Use the new name when this pandas version provides it, otherwise fall back.
if hasattr(pd.DataFrame, "map"):
    df_clean = ner_df_spacy.map(clean_string)
else:
    df_clean = ner_df_spacy.applymap(clean_string)

# Write the cleaned table without the index column.
df_clean.to_excel('ner_persons_spacy.xlsx', index=False)

  df_clean = ner_df_spacy.applymap(clean_string)
