In [1]:
# -----------------------------------------
# TASK 1: Tokenization & Basic Preprocessing
# -----------------------------------------

import re

text = (
    "Artificial Intelligence is changing the world. "
    "NLP helps machines understand language. "
    "Students learn tokenization, POS tagging, and NER. "
    "Preprocessing improves model performance. "
    "Practice makes perfect!"
)

# Sentence Tokenization (simple split)
sentences = re.split(r'(?<=[.!?])\s+', text.strip())

# Word Tokenization
word_tokens = []
for sent in sentences:
    toks = re.findall(r"\b\w+'?\w*\b", sent)
    word_tokens.extend(toks)

# Lowercasing & punctuation removal (already clean tokens)
cleaned_tokens = [w.lower() for w in word_tokens]

print("Original Text:\n", text)
print("\nSentences:")
for s in sentences:
    print("-", s)

print("\nWord Tokens:", word_tokens
)
print("Cleaned Tokens:", cleaned_tokens)


Original Text:
 Artificial Intelligence is changing the world. NLP helps machines understand language. Students learn tokenization, POS tagging, and NER. Preprocessing improves model performance. Practice makes perfect!

Sentences:
- Artificial Intelligence is changing the world.
- NLP helps machines understand language.
- Students learn tokenization, POS tagging, and NER.
- Preprocessing improves model performance.
- Practice makes perfect!

Word Tokens: ['Artificial', 'Intelligence', 'is', 'changing', 'the', 'world', 'NLP', 'helps', 'machines', 'understand', 'language', 'Students', 'learn', 'tokenization', 'POS', 'tagging', 'and', 'NER', 'Preprocessing', 'improves', 'model', 'performance', 'Practice', 'makes', 'perfect']
Cleaned Tokens: ['artificial', 'intelligence', 'is', 'changing', 'the', 'world', 'nlp', 'helps', 'machines', 'understand', 'language', 'students', 'learn', 'tokenization', 'pos', 'tagging', 'and', 'ner', 'preprocessing', 'improves', 'model', 'performance', 'practice'

In [2]:
# -----------------------------------------
# TASK 2: POS Tagging (NLTK or fallback)
# -----------------------------------------

import re

# Using the cleaned tokens from Task 1
tokens = cleaned_tokens

pos_tags = []
nltk_warning = None
used_nltk = False

# Try NLTK first
try:
    import nltk

    try:
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('taggers/averaged_perceptron_tagger')
    except LookupError:
        nltk_warning = "NLTK models not found, using fallback POS tagger."

    if nltk_warning is None:
        pos_tags = nltk.pos_tag(tokens)
        used_nltk = True

except Exception as e:
    nltk_warning = f"NLTK not available: {e}"

# Fallback POS Tagging (rule-based)
if not used_nltk:
    common_verbs = {
        "is","are","was","were","be","helps",
        "help","learn","learns","changing","makes","improves"
    }

    pos_tags = []
    for w in tokens:
        if w in common_verbs or w.endswith("ing") or w.endswith("ed"):
            pos_tags.append((w, "VB"))
        elif w.endswith("ly"):
            pos_tags.append((w, "RB"))
        elif w.endswith("al") or w.endswith("ive") or w.endswith("ous"):
            pos_tags.append((w, "JJ"))
        else:
            pos_tags.append((w, "NN"))

# Counting
nouns = sum(1 for _,t in pos_tags if t.startswith("NN"))
verbs = sum(1 for _,t in pos_tags if t.startswith("VB"))
adjectives = sum(1 for _,t in pos_tags if t.startswith("JJ"))

print("POS Tags:")
for p in pos_tags:
    print(p)

print(f"\nCounts -> Nouns: {nouns}, Verbs: {verbs}, Adjectives: {adjectives}")

if nltk_warning:
    print("\nNOTE:", nltk_warning)


POS Tags:
('artificial', 'JJ')
('intelligence', 'NN')
('is', 'VB')
('changing', 'VB')
('the', 'NN')
('world', 'NN')
('nlp', 'NN')
('helps', 'VB')
('machines', 'NN')
('understand', 'NN')
('language', 'NN')
('students', 'NN')
('learn', 'VB')
('tokenization', 'NN')
('pos', 'NN')
('tagging', 'VB')
('and', 'NN')
('ner', 'NN')
('preprocessing', 'VB')
('improves', 'VB')
('model', 'NN')
('performance', 'NN')
('practice', 'NN')
('makes', 'VB')
('perfect', 'NN')

Counts -> Nouns: 16, Verbs: 8, Adjectives: 1

NOTE: NLTK models not found, using fallback POS tagger.


In [3]:
# -----------------------------------------
# TASK 3: Named Entity Recognition (NER)
# -----------------------------------------

import re

text_for_ner = (
    "Artificial Intelligence is changing the world. "
    "NLP helps machines understand language. "
    "Students learn tokenization, POS tagging, and NER."
)

ner_results = []
used_spacy = False
spacy_warning = None

# Try spaCy first
try:
    import spacy
    try:
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text_for_ner)
        for ent in doc.ents:
            ner_results.append((ent.text, ent.label_))
        used_spacy = True
    except Exception as e:
        spacy_warning = f"spaCy model not available: {e}"

except Exception as e:
    spacy_warning = f"spaCy not installed: {e}"

# Fallback NER if spaCy unavailable
if not used_spacy:
    fallback_entities = []

    # Simple rule: extract Capitalized words or phrases
    caps = re.findall(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\b', text_for_ner)

    org_list = {"NLP", "Artificial Intelligence"}

    for c in caps:
        if c in org_list:
            fallback_entities.append((c, "ORG"))
        elif c == "Students":
            fallback_entities.append((c, "PERSON"))
        else:
            fallback_entities.append((c, "MISC"))

    # Remove duplicates
    ner_results = list(dict.fromkeys(fallback_entities))

print("Named Entities Found:")
for ent in ner_results:
    print(ent)

if spacy_warning:
    print("\nNOTE:", spacy_warning)


Named Entities Found:
('Artificial Intelligence', 'ORG')
('Students', 'PERSON')

NOTE: spaCy model not available: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.


In [4]:
# -----------------------------------------
# TASK 4: Byte Pair Encoding (BPE)
# -----------------------------------------

def bpe_merge_steps(sentence, iterations=3):
    # Split into words, then split words into chars + </w>
    words = sentence.split()
    tokens = [" ".join(list(w)) + " </w>" for w in words]
    tokens = [t.split() for t in tokens]  # list of lists

    merge_history = []

    for it in range(iterations):
        # Count all adjacent pairs
        pair_freq = {}
        for token in tokens:
            for i in range(len(token) - 1):
                pair = (token[i], token[i+1])
                pair_freq[pair] = pair_freq.get(pair, 0) + 1

        if not pair_freq:
            break

        # Find most frequent pair
        best_pair = max(pair_freq, key=pair_freq.get)
        freq = pair_freq[best_pair]

        # Merge step
        new_tokens = []
        for token in tokens:
            new_token = []
            i = 0
            while i < len(token):
                # If a pair matches, merge them
                if i < len(token) - 1 and token[i] == best_pair[0] and token[i+1] == best_pair[1]:
                    new_token.append(token[i] + token[i+1])
                    i += 2
                else:
                    new_token.append(token[i])
                    i += 1
            new_tokens.append(new_token)

        merge_history.append({
            "iteration": it + 1,
            "merged_pair": best_pair,
            "frequency": freq,
            "tokens_after_merge": [" ".join(t) for t in new_tokens]
        })

        tokens = new_tokens

    # Initial token list
    initial = [" ".join(list(w)) + " </w>" for w in sentence.split()]

    return {"initial": initial, "history": merge_history}


# ----- Run BPE on sample text -----
sentence = "lower newer slower"
bpe_result = bpe_merge_steps(sentence, iterations=3)

print("Initial Tokens:")
print(bpe_result["initial"])

print("\nBPE Merge History:")
for h in bpe_result["history"]:
    print(f"\nIteration {h['iteration']}:")
    print("Merged Pair:", h["merged_pair"])
    print("Frequency:", h["frequency"])
    print("Tokens After Merge:", h["tokens_after_merge"])


Initial Tokens:
['l o w e r </w>', 'n e w e r </w>', 's l o w e r </w>']

BPE Merge History:

Iteration 1:
Merged Pair: ('w', 'e')
Frequency: 3
Tokens After Merge: ['l o we r </w>', 'n e we r </w>', 's l o we r </w>']

Iteration 2:
Merged Pair: ('we', 'r')
Frequency: 3
Tokens After Merge: ['l o wer </w>', 'n e wer </w>', 's l o wer </w>']

Iteration 3:
Merged Pair: ('wer', '</w>')
Frequency: 3
Tokens After Merge: ['l o wer</w>', 'n e wer</w>', 's l o wer</w>']


In [5]:
# -----------------------------------------
# TASK 5: Minimum Edit Distance (MED)
# -----------------------------------------

def min_edit_distance(a, b):
    la = len(a)
    lb = len(b)

    # Create DP matrix
    dp = [[0] * (lb + 1) for _ in range(la + 1)]

    # Base cases
    for i in range(la + 1):
        dp[i][0] = i
    for j in range(lb + 1):
        dp[0][j] = j

    # Fill matrix
    for i in range(1, la + 1):
        for j in range(1, lb + 1):
            cost = 0 if a[i-1] == b[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,      # deletion
                dp[i][j-1] + 1,      # insertion
                dp[i-1][j-1] + cost  # substitution
            )

    # Backtrace operations
    i, j = la, lb
    operations = []

    while i > 0 or j > 0:
        if i > 0 and dp[i][j] == dp[i-1][j] + 1:
            operations.append(("delete", a[i-1]))
            i -= 1
        elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
            operations.append(("insert", b[j-1]))
            j -= 1
        else:
            if a[i-1] == b[j-1]:
                operations.append(("match", a[i-1]))
            else:
                operations.append(("substitute", (a[i-1], b[j-1])))
            i -= 1
            j -= 1

    operations.reverse()
    return dp, dp[la][lb], operations


# ---- Example Run ----
word1 = "sunday"
word2 = "saturday"

dp_matrix, distance, ops = min_edit_distance(word1, word2)

print("Word 1:", word1)
print("Word 2:", word2)
print("\nMinimum Edit Distance:", distance)

print("\nOperations:")
for op in ops:
    print(op)

print("\nDP Matrix:")
for row in dp_matrix:
    print(row)


Word 1: sunday
Word 2: saturday

Minimum Edit Distance: 3

Operations:
('match', 's')
('insert', 'a')
('insert', 't')
('match', 'u')
('substitute', ('n', 'r'))
('match', 'd')
('match', 'a')
('match', 'y')

DP Matrix:
[0, 1, 2, 3, 4, 5, 6, 7, 8]
[1, 0, 1, 2, 3, 4, 5, 6, 7]
[2, 1, 1, 2, 2, 3, 4, 5, 6]
[3, 2, 2, 2, 3, 3, 4, 5, 6]
[4, 3, 3, 3, 3, 4, 3, 4, 5]
[5, 4, 3, 4, 4, 4, 4, 3, 4]
[6, 5, 4, 4, 5, 5, 5, 4, 3]
