In [None]:
import re
from collections import Counter

# Passage to be summarised
text = """
Natural Language Processing is a field of Artificial Intelligence.
It helps computers understand human language.
Tokenization is the first step in NLP.
Embeddings convert words into numbers.
NLP is widely used in chatbots and translation systems.
"""

# Step 1 - sentence tokenization: cut on '.', '!' or '?' and keep only the
# pieces that still contain text once surrounding whitespace is removed.
sentences = []
for fragment in re.split(r'[.!?]', text):
    trimmed = fragment.strip()
    if trimmed:
        sentences.append(trimmed)

# Step 2 - word tokenization + cleaning: lower-case the passage and pull out
# every run of word characters.
words = re.findall(r'\w+', text.lower())

# Step 3 - word frequency (features): occurrences of each word in the passage.
word_freq = Counter(words)

# Step 4 - sentence scoring: a sentence's score is the sum of the
# passage-wide frequencies of the words it contains.
sentence_scores = {
    candidate: sum(
        word_freq[token] for token in re.findall(r'\w+', candidate.lower())
    )
    for candidate in sentences
}

# Step 5 - summary: the two best-scoring sentences, highest score first.
# The sort is stable, so equally scored sentences keep their passage order.
ranked = sorted(sentence_scores.items(), key=lambda pair: pair[1], reverse=True)
summary_sentences = [pair[0] for pair in ranked[:2]]

# Report
print("Original Text:\n", text)
print("\nSummary:\n")

for line in summary_sentences:
    print(f"- {line}")


Original Text:
 
Natural Language Processing is a field of Artificial Intelligence.
It helps computers understand human language.
Tokenization is the first step in NLP.
Embeddings convert words into numbers.
NLP is widely used in chatbots and translation systems.


Summary:

- NLP is widely used in chatbots and translation systems
- Natural Language Processing is a field of Artificial Intelligence


In [None]:
# Simple NER using Text Classification approach
import re

sentence = "Balaji studies at IIT Madras in India"


def tokenize(text):
    """Split `text` into word tokens, leaving punctuation out.

    A plain ``str.split()`` keeps punctuation glued to the neighbouring word
    ("India." or "Balaji,"), and such a token can never match an entry of the
    entity dictionaries below. Extracting runs of word characters avoids
    that; it is the same ``\\w+`` pattern the summarisation and
    question-answering cells of this notebook tokenize with.

    Returns a list of tokens in their original order and original casing.
    """
    return re.findall(r"\w+", text)


# Tokenization
words = tokenize(sentence)

# Entity dictionaries (training knowledge)
persons = {"Balaji"}
organizations = {"IIT"}
locations = {"India", "Madras"}


def classify_word(word):
    """Return the entity label of a single token.

    The token is compared, case-sensitively, against the `persons`,
    `organizations` and `locations` sets in that order; the first set that
    contains it decides the label. A token found in none of them is labelled
    "OTHER".
    """
    # The sets are read at call time, so entries added to them after this
    # definition are still honoured.
    for label, vocabulary in (
        ("PERSON", persons),
        ("ORGANIZATION", organizations),
        ("LOCATION", locations),
    ):
        if word in vocabulary:
            return label
    return "OTHER"


print("Sentence:", sentence)
print("\nNamed Entities:\n")

for word in words:
    label = classify_word(word)
    # Left-align the token in a 10-character column so the labels line up.
    print(f"{word:10} -> {label}")


Sentence: Balaji studies at IIT Madras in India

Named Entities:

Balaji     -> PERSON
studies    -> OTHER
at         -> OTHER
IIT        -> ORGANIZATION
Madras     -> LOCATION
in         -> OTHER
India      -> LOCATION


In [None]:
import re
from collections import Counter

# Knowledge base: the paragraph the answer is looked up in
text = """
NLP stands for Natural Language Processing.
It helps computers understand human language.
Tokenization is the first step in NLP.
Embeddings convert words into numbers.
"""

# The question to answer
question = "What is tokenization?"

# Sentence tokenization: cut on '.', '!' or '?' and drop the blank pieces.
sentences = []
for chunk in re.split(r'[.!?]', text):
    chunk = chunk.strip()
    if chunk:
        sentences.append(chunk)


def text_to_features(text):
    """Return a bag-of-words `Counter` for `text`.

    The text is lower-cased and every run of word characters counts as one
    word, so punctuation and letter case do not affect the features.
    """
    return Counter(re.findall(r'\w+', text.lower()))


# Bag of words of the question
question_features = text_to_features(question)

# Score every sentence by its word overlap with the question. The `&` of two
# Counters keeps, per word, the smaller of the two counts, so summing its
# values gives the number of shared word occurrences.
scores = {
    candidate: sum((question_features & text_to_features(candidate)).values())
    for candidate in sentences
}

# The answer is the highest-scoring sentence; on a tie the earliest one wins.
best_answer = max(scores.items(), key=lambda pair: pair[1])[0]

print(f"Question: {question}")
print("\nAnswer:", best_answer, sep="\n")


Question: What is tokenization?

Answer:
Tokenization is the first step in NLP
