In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the /kaggle/input directory tree and print the full path of
# every file found. os.walk yields (directory path, sub-directory names, file
# names) for each directory; the sub-directory names are not needed here.
# With os.walk's default error handling a missing directory yields nothing,
# so in that case the loop prints nothing rather than raising.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import re
import string

# Sample input: a single English sentence that is passed to tokenize() below.
text = "Barack Obama and Angela Merkel met in Berlin for the G20 Summit organized by the United Nations."

# Tokenization: extract word tokens with a regex (hyphenated words stay together; punctuation is dropped)
# One token = a run of word characters, optionally extended by further
# hyphen-joined runs (so "Jean-Luc" stays a single token).
_TOKEN_PATTERN = re.compile(r"\b\w+(?:-\w+)*\b")


def tokenize(text):
    """Return the word tokens found in ``text``, in order of appearance.

    Punctuation and whitespace are never part of a token; a hyphen is kept
    only when it joins two runs of word characters.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings (empty when ``text`` contains no word
        characters).
    """
    return _TOKEN_PATTERN.findall(text)

# Tokenize the sample sentence and show the result; `tokens` is read again by
# the classification cell further down.
tokens = tokenize(text)
print(tokens)


['Barack', 'Obama', 'and', 'Angela', 'Merkel', 'met', 'in', 'Berlin', 'for', 'the', 'G20', 'Summit', 'organized', 'by', 'the', 'United', 'Nations']


In [2]:
# Example sets - replace these with your loaded sets
# classify_token looks names up by exact (case-sensitive) match, while
# stopwords are compared case-insensitively.
first_names = {"Barack", "Angela", "John", "Narendra", "Michael"}
last_names = {"Obama", "Merkel", "Smith", "Modi", "Johnson"}
stopwords = {"and", "the", "in", "for", "by", "of", "on", "at", "a", "an"}

def classify_token(token):
    """Label one token using the notebook-level lookup sets.

    The checks run in a fixed order and the first hit wins:

    1. ``"Stopword"``   - the token equals a stopword, ignoring case.
    2. ``"First Name"`` - the token is in ``first_names`` (exact match).
    3. ``"Last Name"``  - the token is in ``last_names`` (exact match).
    4. ``"Other"``      - none of the above.

    Args:
        token: The token string to classify.

    Returns:
        One of the four label strings listed above.
    """
    lowered = token.lower()
    # Both sides are lower-cased, so stopword entries may be in any case.
    if any(lowered == word.lower() for word in stopwords):
        return "Stopword"
    if token in first_names:
        return "First Name"
    if token in last_names:
        return "Last Name"
    return "Other"

# Apply classification
# Pair every token with its label, then print one "token: label" line per pair.
classified_tokens = [(token, classify_token(token)) for token in tokens]
for token, classification in classified_tokens:
    print(f"{token}: {classification}")


Barack: First Name
Obama: Last Name
and: Stopword
Angela: First Name
Merkel: Last Name
met: Other
in: Stopword
Berlin: Other
for: Stopword
the: Stopword
G20: Other
Summit: Other
organized: Other
by: Stopword
the: Stopword
United: Other
Nations: Other


In [6]:
def name_recognition_with_context(tokens, first_names, last_names, stopwords):
    """Label tokens with a name/stopword category and collect multi-word names.

    The function makes three passes over ``tokens``:

    1. Dictionary lookup - each token becomes ``'First Name'``,
       ``'Last Name'``, ``'Stopword'`` or ``'Other'`` (checked in that order).
       Name lookups are exact-match; the stopword lookup ignores case.
    2. Context adjustment - an ``'Other'`` token that starts with an
       upper-case character and sits directly next to a token labelled
       ``'First Name'`` or ``'Last Name'`` in pass 1 is relabelled
       ``'Possible Name'``.
    3. Chunking - every run of two or more consecutive tokens labelled
       ``'First Name'``, ``'Last Name'`` or ``'Possible Name'`` is joined with
       single spaces and reported as a multi-word name.

    Args:
        tokens: Iterable of token strings. It is copied into a list, so
            one-shot iterators are accepted. Empty strings are allowed.
        first_names: Container of known first names (``in`` is used).
        last_names: Container of known last names (``in`` is used).
        stopwords: Iterable of stopword strings, in any letter case.

    Returns:
        A 2-tuple ``(labelled, multiword_names)`` where ``labelled`` is a list
        of ``(token, label)`` tuples in input order and ``multiword_names`` is
        a list of the joined multi-word name strings in order of appearance.
    """
    # Materialise once: the token sequence is traversed several times below.
    tokens = list(tokens)
    # Lower-case the stopwords once so the comparison is case-insensitive on
    # both sides (a set containing "The" must still match the token "the").
    stopwords_lower = {word.lower() for word in stopwords}

    def classify_token(token):
        """Pass-1 dictionary lookup for a single token."""
        if token in first_names:
            return 'First Name'
        elif token in last_names:
            return 'Last Name'
        elif token.lower() in stopwords_lower:
            return 'Stopword'
        else:
            return 'Other'

    # Step 1: Initial classification
    classifications = [classify_token(t) for t in tokens]

    # Step 2: Context-aware adjustment
    # Neighbours are read from the pass-1 labels, never from `adjusted`, so a
    # 'Possible Name' cannot itself promote the next token.
    adjusted = classifications.copy()
    last_index = len(tokens) - 1
    for i, token in enumerate(tokens):
        # token[:1] (not token[0]) so an empty-string token is simply treated
        # as "not capitalised" instead of raising IndexError.
        if classifications[i] != 'Other' or not token[:1].isupper():
            continue

        neighbors = []
        if i > 0:
            neighbors.append(classifications[i - 1])
        if i < last_index:
            neighbors.append(classifications[i + 1])

        if 'First Name' in neighbors or 'Last Name' in neighbors:
            adjusted[i] = 'Possible Name'

    # Step 3: Multi-word name detection
    name_labels = {'First Name', 'Last Name', 'Possible Name'}
    multiword_names = []
    current_chunk = []

    for token, label in zip(tokens, adjusted):
        if label in name_labels:
            current_chunk.append(token)
        else:
            # A non-name token closes the current run; keep it only if it
            # spans more than one token.
            if len(current_chunk) > 1:
                multiword_names.append(' '.join(current_chunk))
            current_chunk = []
    # Flush a run that extends to the very last token.
    if len(current_chunk) > 1:
        multiword_names.append(' '.join(current_chunk))

    return list(zip(tokens, adjusted)), multiword_names


In [7]:
# Demo inputs for name_recognition_with_context. The token list is the same
# one that tokenize(text) printed earlier in the notebook.
# NOTE: this cell rebinds the notebook-level names `tokens`, `first_names`,
# `last_names`, `stopwords` and `classified_tokens`, replacing the values
# assigned in earlier cells.
tokens = ['Barack', 'Obama', 'and', 'Angela', 'Merkel', 'met', 'in', 'Berlin', 'for', 'the', 'G20', 'Summit', 'organized', 'by', 'the', 'United', 'Nations']
first_names = {'Barack', 'Angela'}
last_names = {'Obama', 'Merkel'}
stopwords = {'and', 'in', 'for', 'the', 'by'}

# Returns the (token, label) pairs plus every run of 2+ adjacent name tokens.
classified_tokens, multiword_names = name_recognition_with_context(tokens, first_names, last_names, stopwords)

print(classified_tokens)
print("Multi-word names found:", multiword_names)


[('Barack', 'First Name'), ('Obama', 'Last Name'), ('and', 'Stopword'), ('Angela', 'First Name'), ('Merkel', 'Last Name'), ('met', 'Other'), ('in', 'Stopword'), ('Berlin', 'Other'), ('for', 'Stopword'), ('the', 'Stopword'), ('G20', 'Other'), ('Summit', 'Other'), ('organized', 'Other'), ('by', 'Stopword'), ('the', 'Stopword'), ('United', 'Other'), ('Nations', 'Other')]
Multi-word names found: ['Barack Obama', 'Angela Merkel']
