# Replacing all person names with UNKNOWN using NLTK

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk #module

# Fetch the NLTK resources used by the cells below. Packages that are already
# installed are reported as up-to-date in the log output.
nltk.download('punkt')  # tokenizer data; presumably what word_tokenize relies on
nltk.download('averaged_perceptron_tagger')  # tagger model; presumably what pos_tag relies on
nltk.download('maxent_ne_chunker')  # pre-trained Maximum Entropy (MaxEnt) named-entity chunker model
nltk.download('words')  # word-list corpus; presumably required by the NE chunker -- confirm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [2]:
def replace_names(text):
    """Return *text* with every PERSON chunk replaced by ``UNKNOWN``.

    The text is tokenized, POS-tagged and passed through NLTK's named-entity
    chunker. Each chunk labelled ``PERSON`` collapses to one ``UNKNOWN``
    placeholder; chunks with any other label, and plain tokens, keep their
    words. Tokens are re-joined with single spaces, so punctuation ends up
    space-separated and the original whitespace is not preserved.

    Args:
        text: The raw text to anonymise.

    Returns:
        The space-joined token string with person names masked.
    """
    tagged_tokens = pos_tag(word_tokenize(text))

    output_tokens = []
    for node in ne_chunk(tagged_tokens):
        # Tokens outside any entity arrive as plain (word, tag) tuples.
        if isinstance(node, tuple):
            output_tokens.append(node[0])
            continue

        # Anything else is an entity subtree carrying a label.
        if node.label() == 'PERSON':
            output_tokens.append('UNKNOWN')
            continue

        # Non-person entities keep their original words.
        output_tokens.extend([leaf[0] for leaf in node])

    return ' '.join(output_tokens)

In [3]:
# Sample news article: contains three person names (John Sam, Jane Doe,
# Emma Watson) alongside other proper nouns such as "New York City".
news_article = """
John Sam, a renowned scientist, won the Nobel Prize in Physics. He was born in New York City.
The CEO of the company, Jane Doe, announced record-breaking profits for the quarter.
Emma Watson, the famous actress, starred in the new blockbuster movie.
"""

# Mask the person names in the article and print the result
replaced_article = replace_names(news_article)
print(replaced_article)

UNKNOWN , a renowned scientist , won the Nobel Prize in Physics . He was born in New York City . The CEO of the company , UNKNOWN , announced record-breaking profits for the quarter . UNKNOWN , the famous actress , starred in the new blockbuster movie .


# Replacing all person names with UNKNOWN using spaCy

In [4]:
import spacy

# Load the "en_core_web_sm" English pipeline once; the resulting `nlp` object
# is the module-level global that replace_names below calls on its input.
nlp = spacy.load("en_core_web_sm")

In [5]:
def replace_names(text):
    """Return *text* with every PERSON entity replaced by ``UNKNOWN``.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    PERSON entity (however many tokens it spans) collapses to a single
    ``UNKNOWN`` placeholder; every other token is kept unchanged. Tokens are
    re-joined with single spaces, so punctuation ends up space-separated and
    the original whitespace is not preserved.

    Every mention is masked, including repeated mentions of the same name
    and a name that is the very last thing in the text.

    Args:
        text: The raw text to anonymise.

    Returns:
        The space-joined token string with person names masked.
    """
    doc = nlp(text)

    masked_tokens = []
    inside_person = False
    for token in doc:
        if token.ent_type_ != "PERSON":
            inside_person = False
            masked_tokens.append(token.text)
            continue

        # Emit the placeholder as soon as an entity starts, so a name at the
        # end of the text is never lost. The IOB check keeps two PERSON
        # entities that sit directly next to each other as two placeholders.
        if not inside_person or token.ent_iob_ == "B":
            masked_tokens.append("UNKNOWN")
        inside_person = True

    return " ".join(masked_tokens)

In [6]:
# Sample news article: contains three person names (John Smith, Jane Doe,
# Emma Watson) alongside other proper nouns such as "New York City".
news_article = """
John Smith, a renowned scientist, won the Nobel Prize in Physics. He was born in New York City.

The CEO of the company, Jane Doe, announced record-breaking profits for the quarter.

Emma Watson, the famous actress, starred in the new blockbuster movie.
"""

# Mask the person names in the article and print the result
replaced_article = replace_names(news_article)
print(replaced_article)


 UNKNOWN , a renowned scientist , won the Nobel Prize in Physics . He was born in New York City . 

 The CEO of the company , UNKNOWN , announced record - breaking profits for the quarter . 

 UNKNOWN , the famous actress , starred in the new blockbuster movie . 

