In [1]:
#imports

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import spacy
import re
from collections import Counter
# spaCy pipeline (medium English model); called below to find named entities in each text
NLP = spacy.load('en_core_web_md')
# module-level accumulator: name_ranker() appends every PERSON entity text it finds to this list
nameList = []
# toy corpus: three short texts that mention the same people under full names,
# partial names ("Mark", "Mr Drakeford") and near-miss variants ("Mark Rakeford", "Mark Cakeford")
corpus = [
    
    "Mark Drakeford (born 1954) is a Welsh Labour Party politician serving as First Minister of Wales and Leader of Welsh Labour since 2018. Mark Rakeford. He previously served in the Welsh Government as Cabinet Secretary for Finance from 2016 to 2018 and Minister for Health and Social Services from 2013 to 2016. Mark Drakeford was first elected as the Member of the Senedd (MS) for Cardiff West in 2011. Mark without his last name.",
    "Boris Johnson is a British politician and writer, serving as Prime Minister of the United Kingdom and Leader of the Conservative Party since July 2019. Boris Johnson is not unlike ex-PM Tony Blair.",
    "David Basch is my name. Do you know Boris Johnson? I am not famous. Her name is Jane Goodall. I like Mark Cakeford. His name is David Jones. Mr Drakeford. And there is another guy called Joe Swanson.Also Zach Richards."
]

In [2]:
#collect the person names found in a text (they are ranked in the following cell)

def name_ranker(text):
    """Record every PERSON entity found in ``text`` and hand ``text`` back.

    The text is run through the module-level spaCy pipeline ``NLP``; the
    surface form of each entity labelled ``PERSON`` is added to the
    module-level list ``nameList``.

    Parameters
    ----------
    text : str
        Raw text to scan.

    Returns
    -------
    str
        ``text`` itself, unchanged; the useful result is the side effect on
        ``nameList``.
    """
    person_names = [
        str(entity.text)
        for entity in NLP(text).ents
        if str(entity.label_) == "PERSON"
    ]
    nameList.extend(person_names)
    return text


            

In [3]:
#prepare df
df = pd.DataFrame(corpus, columns = ['texts'])
#start from an empty list so that re-running this cell does not double-count names
nameList.clear()
#run the name ranker over the corpus; it is called only for its side effect of
#filling nameList, so df itself is left untouched
for corpus_text in df['texts']:
    name_ranker(corpus_text)
#fit the data: every detected name is one "document" scored on word bigrams,
#so single-word names produce an all-zero row and do not influence the ranking
tfidf = TfidfVectorizer(ngram_range=(2, 2))
tfidf_matrix = tfidf.fit_transform(nameList)
#get_feature_names() was removed in scikit-learn 1.2; only fall back to it on old versions
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
#score the names
scored_names = pd.DataFrame(tfidf_matrix.toarray(), columns = feature_names)
#melt to long format: one (Word, Score) row per bigram per detected name
scored_names_listed = pd.melt(scored_names, var_name="Word", value_name="Score")
#total score per name, highest first; drop=True avoids keeping the stale index as a column
scored_names_sorted = (
    scored_names_listed
    .groupby('Word').Score.sum()
    .reset_index()
    .sort_values("Score", ascending=False)
    .reset_index(drop=True)
)
#number of highest-scoring names that are treated as "well known" and never redacted
TOP_N_NAMES = 3
#take the top names and store in list
top_names = scored_names_sorted['Word'].iloc[:TOP_N_NAMES].tolist()

In [4]:
#user defined names that must never be redacted (hard-coded here rather than read from a file)
user_def_names = ['jane goodall']
#add each name individually: append() would nest the whole list as one element.
#names already present are skipped so re-running the cell does not add duplicates
top_names.extend([name for name in user_def_names if name not in top_names])

In [5]:
def _flatten_names(names):
    """Yield every name in ``names`` as a string, descending into nested lists/tuples/sets."""
    for item in names:
        if isinstance(item, str):
            yield item
        elif isinstance(item, (list, tuple, set)):
            yield from _flatten_names(item)
        else:
            yield str(item)


def _pad_words(name):
    """Lower-case ``name``, collapse whitespace and wrap it in single spaces.

    The padding lets a plain ``in`` test match on whole words only.
    """
    return " " + " ".join(name.lower().split()) + " "


def named_entity_removal(text, allowed_names=None, nlp=None):
    """Replace PERSON entities in ``text`` with ``REDACTED[per]`` unless allowed.

    An entity is kept when, compared case-insensitively, it equals an allowed
    name or is a run of whole words inside a single allowed name. With
    "mark drakeford" allowed, "Mark" and "Drakeford" are kept, while "Ford" or
    "Mark Rakeford" are redacted. (The previous check was a raw substring test
    against ``str(top_names)``, which let fragments such as "Ford" through.)

    Parameters
    ----------
    text : str
        Text to redact.
    allowed_names : iterable, optional
        Names that must not be redacted. May contain nested lists. Defaults to
        the module-level ``top_names``.
    nlp : callable, optional
        Callable returning an object with an ``ents`` iterable whose items
        expose ``label_`` and ``text``. Defaults to the module-level ``NLP``.

    Returns
    -------
    str
        ``text`` with every occurrence of each non-allowed PERSON entity's
        text replaced by ``REDACTED[per]``.
    """
    if allowed_names is None:
        allowed_names = top_names
    if nlp is None:
        nlp = NLP

    padded_allowed = [_pad_words(name) for name in _flatten_names(allowed_names)]

    for ent in nlp(text).ents:
        if str(ent.label_) != "PERSON":
            continue
        name = str(ent.text)
        if not name.strip():
            # replacing an empty string would insert the marker between every character
            continue
        needle = _pad_words(name)
        if any(needle in allowed for allowed in padded_allowed):
            continue
        # str.replace is deliberate: it also redacts repeats of the name that the
        # entity recogniser may have missed elsewhere in the text
        text = text.replace(name, 'REDACTED[per]')

    return text

# rebuild the frame from the raw corpus so redaction always starts from the original texts
df = pd.DataFrame({'texts': corpus})


def _redact_row(row):
    """Return the redacted form of one row's 'texts' value."""
    return named_entity_removal(row['texts'])


# one redacted string per corpus entry
moddf = df.apply(_redact_row, axis=1)

In [6]:
# show each redacted text on its own line
for redacted_text in moddf:
    print(redacted_text)

Mark Drakeford (born 1954) is a Welsh Labour Party politician serving as First Minister of Wales and Leader of Welsh Labour since 2018. REDACTED[per]. He previously served in the Welsh Government as Cabinet Secretary for Finance from 2016 to 2018 and Minister for Health and Social Services from 2013 to 2016. Mark Drakeford was first elected as the Member of the Senedd (MS) for Cardiff West in 2011. Mark without his last name.
Boris Johnson is a British politician and writer, serving as Prime Minister of the United Kingdom and Leader of the Conservative Party since July 2019. Boris Johnson is not unlike ex-PM REDACTED[per].
David Basch is my name. Do you know Boris Johnson? I am not famous. Her name is Jane Goodall. I like REDACTED[per]. His name is REDACTED[per]. Mr Drakeford. And there is another guy called REDACTED[per].Also REDACTED[per].
