# Merging two tokens into one using spaCy

In [1]:
import spacy

# English spaCy pipeline to load (tokenizer, tagger, parser, NER)
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [2]:
# Example input for the spaCy section; it contains a two-word name ("Jack Will")
# for the cells below to find and merge.
sentence = "Jack Will is a software engineer."

In [3]:
# Run the spaCy pipeline over the input text
doc = nlp(sentence)

# Start with empty name parts; they are filled in once a suitable PERSON
# entity is found
first_name, last_name = "", ""

In [4]:
# Pick the first PERSON entity that has at least two whitespace-separated
# parts, and keep its first and last part as the name components
for ent in doc.ents:
    if ent.label_ != "PERSON":
        continue
    name_parts = ent.text.split()
    if len(name_parts) < 2:
        continue
    first_name, last_name = name_parts[0], name_parts[-1]
    break

In [5]:
# Merge first name and last name into a single token.
# Only do so when a name was actually found: with both parts still empty the
# search string would be a lone space, and replacing it would strip every
# space from the sentence.
if first_name and last_name:
    merged_token = first_name + last_name

    # Replace the "first last" occurrence with the merged token in the original sentence
    modified_sentence = sentence.replace(first_name + " " + last_name, merged_token)
else:
    # No two-part PERSON entity was detected; leave the sentence untouched
    merged_token = ""
    modified_sentence = sentence

print("Original sentence:", sentence)
print("Modified sentence:", modified_sentence)

Original sentence: Jack Will is a software engineer.
Modified sentence: JackWill is a software engineer.


# Merging two tokens into one using NLTK

In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Fetch the NLTK data packages used by this section.
# NOTE(review): 'punkt' is presumably what word_tokenize needs and
# 'averaged_perceptron_tagger' what pos_tag needs; newer NLTK releases may
# expect differently named packages — confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
# Example input for the NLTK section. This rebinds `sentence`, replacing the
# string assigned for the spaCy section earlier in the notebook.
sentence = "John Doe is a software engineer."

In [8]:
# Tokenize the sentence into word and punctuation tokens
tokens = word_tokenize(sentence)

# Perform PoS tagging; the result is consumed below as (word, tag) pairs,
# one per token
pos_tags = pos_tag(tokens)

In [9]:
# Merge each run of consecutive proper nouns into a single token (without NER).
#
# The merged list is built from `pos_tags` instead of deleting from `tokens`
# while indexing by position in `pos_tags`: deleting shifts every later token
# left, so after the first merge the two lists no longer line up and a second
# name in the sentence gets glued to the wrong word.
merged_tokens = []
previous_was_proper_noun = False
for word, tag in pos_tags:
    is_proper_noun = tag in ('NNP', 'NNPS')  # proper nouns (singular and plural)
    if is_proper_noun and previous_was_proper_noun:
        # Continuation of a name: append to the token started by the previous word
        merged_tokens[-1] += word
    else:
        merged_tokens.append(word)
    previous_was_proper_noun = is_proper_noun

# Keep `tokens` pointing at the merged result, as later cells may expect
tokens = merged_tokens

# Construct the modified sentence
modified_sentence = " ".join(tokens)

In [10]:
# Show the input next to the result of the merge
for label, text in (
    ("Original sentence:", sentence),
    ("Modified sentence:", modified_sentence),
):
    print(label, text)

Original sentence: John Doe is a software engineer.
Modified sentence: JohnDoe is a software engineer .
