# 1. Summarize the paper on BPE

The paper explores encoding rare and unknown words as sequences of subword units. The idea is grounded in the observation that certain word categories, such as names, compounds, and loanwords, can be translated more effectively through units smaller than words. The authors adapt a data compression method called Byte Pair Encoding (BPE) for word segmentation. In its original form, BPE iteratively replaces the most frequent pair of bytes in a sequence with a single, unused byte; for word segmentation, the authors merge characters or character sequences instead of bytes. Unlike the output of other compression methods, BPE symbol sequences are still interpretable as subword units, which allows the system to create and translate new words from these units. The authors use an attentional encoder-decoder (RNNSearch) as the network for Neural Machine Translation (NMT), train it with different vocabularies, and find that the BPE approach produces competitive results. The main argument is that traditional word-level NMT models struggle with the translation of rare and unknown words, and that subword models improve the translation of these words.






# 2. In regular expressions, what does \d, \D, \w, \W, \s, \S, {n}, {n,m}, {n,}, {,m} mean?



*   \d: Matches any digit (0-9).
*   \D: Matches any non-digit character.


*   \w: Matches any word character (a-z, A-Z, 0-9, and underscore _).

*   \W: Matches any non-word character (anything other than a letter, digit, or underscore).

*   \s: Matches any whitespace character (space, tab, newline, etc.).

*   \S: Matches any non-whitespace character.

*   {n}: Matches exactly n occurrences of the preceding character or group.
*   {n,m}: Matches at least n and at most m occurrences of the preceding character or group.


*   {n,}: Matches at least n occurrences of the preceding character or group.


*   {,m}: Matches at most m occurrences (from 0 up to m) of the preceding character or group.









# 3. Create a Python based tokenizer in NLTK that replaces contractions (I’m, You’re, didn’t) into the expanded forms (I am, You are, did not).

In [None]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Mapping from each supported contraction to its expanded form.
contractions = {
    "I'm": "I am",
    "you're": "you are",
    "didn't": "did not",
    # Add more contractions and their expansions as needed
}


def expand_contractions(text, mapping=None):
    """Replace every known contraction in ``text`` with its expanded form.

    Matching is done on the raw string, on word boundaries and ignoring case.
    When the matched contraction starts with an upper-case letter, the first
    letter of the expansion is upper-cased too ("You're" -> "You are").
    Typographic apostrophes (U+2019) are converted to ASCII apostrophes first,
    so "I’m" is treated like "I'm"; this conversion applies to the whole text.

    :param text: the raw input string
    :param mapping: contraction -> expansion dict; defaults to the
        module-level ``contractions`` table
    :return: ``text`` with all known contractions expanded
    """
    import re  # local import: this cell's import block does not provide ``re``

    if mapping is None:
        mapping = contractions
    if not mapping:
        return text

    text = text.replace("\u2019", "'")
    lookup = {key.lower(): value for key, value in mapping.items()}
    # Longest keys first so a longer contraction wins over one that is its prefix.
    alternation = "|".join(
        re.escape(key) for key in sorted(lookup, key=len, reverse=True)
    )
    pattern = re.compile(r"\b(?:" + alternation + r")\b", re.IGNORECASE)

    def _expand(match):
        found = match.group(0)
        expansion = lookup.get(found.lower(), found)
        if expansion and found[0].isupper():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    return pattern.sub(_expand, text)


def tokenizer(text):
    """Tokenize ``text`` into words with contractions expanded.

    The contractions are expanded on the raw string *before* tokenization.
    ``word_tokenize`` splits "I'm" into "I" and "'m", so looking the
    individual tokens up in ``contractions`` (as the previous version did)
    never finds a match.

    :param text: the raw input string
    :return: list of tokens produced by ``word_tokenize``
    """
    return word_tokenize(expand_contractions(text))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Example usage: run tokenizer() on a sentence that contains all three
# contractions listed in `contractions` and print the returned token list.
text = "I'm glad you're here. I didn't know that."
expanded_text = tokenizer(text)
print(expanded_text)


['I', "'m", 'glad', 'you', "'re", 'here', '.', 'I', 'did', "n't", 'know', 'that', '.']


# 4. Implement the BPE algorithm with the following interface

Your normalization should expand the contractions you implemented in problem 3.

In [None]:
class Tokenizer:
    """Interface skeleton for the BPE tokenizer requested in this problem.

    Every method is an empty placeholder that accepts its arguments and
    returns ``None``; nothing is stored on the instance.
    """

    def __init__(self, vocab_size):
        """Placeholder constructor; ``vocab_size`` is accepted but not stored."""

    def normalize(self, text):
        """Placeholder for text normalization; returns ``None``."""
        return None

    def train(self, corpus):
        """Placeholder for learning the vocabulary from ``corpus``; returns ``None``."""
        return None

    def encode(self, text):
        """Placeholder for turning ``text`` into tokens; returns ``None``."""
        return None

    def decode(self, text):
        """Placeholder for turning tokens back into text; returns ``None``."""
        return None

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
from itertools import groupby

nltk.download('punkt')


class Tokenizer:
    """Byte Pair Encoding (BPE) subword tokenizer.

    Training starts from a vocabulary of single characters plus an
    end-of-word marker and repeatedly merges the most frequent pair of
    adjacent symbols into a new symbol until ``vocab_size`` entries exist or
    nothing is left to merge. The learned merges are replayed, in order, to
    encode new text.

    Attributes set by the class:
      vocab      -- dict mapping symbol (str) -> integer id
      inv_vocab  -- dict mapping integer id -> symbol (str)
      merges     -- list of (left, right) symbol pairs in the order learned
    """

    # Appended to every word so merges can tell word-final symbols apart and
    # so decode() can recover the word boundaries.
    END_OF_WORD = "</w>"
    # Id 0 is reserved for symbols that were never seen during training.
    UNKNOWN = "<unk>"

    def __init__(self, vocab_size):
        """Create an untrained tokenizer.

        :param vocab_size: target number of vocabulary entries (including the
            unknown symbol). If the corpus alphabet alone is larger than this,
            no merges are learned and the vocabulary is just the alphabet.
        """
        self.vocab_size = vocab_size
        self.vocab = {self.UNKNOWN: 0}
        self.inv_vocab = {0: self.UNKNOWN}
        self.merges = []

    @staticmethod
    def _expand_contractions(text, mapping):
        """Return ``text`` with every contraction in ``mapping`` expanded.

        Matching happens on the raw string (word boundaries, case-insensitive)
        and keeps a leading capital letter. Typographic apostrophes (U+2019)
        are converted to ASCII apostrophes first.
        """
        import re  # local import: ``re`` is not among this cell's imports

        if not mapping:
            return text
        text = text.replace("\u2019", "'")
        lookup = {key.lower(): value for key, value in mapping.items()}
        # Longest keys first so a longer contraction wins over its prefix.
        alternation = "|".join(
            re.escape(key) for key in sorted(lookup, key=len, reverse=True)
        )
        pattern = re.compile(r"\b(?:" + alternation + r")\b", re.IGNORECASE)

        def expand(match):
            found = match.group(0)
            expansion = lookup.get(found.lower(), found)
            if expansion and found[0].isupper():
                expansion = expansion[0].upper() + expansion[1:]
            return expansion

        return pattern.sub(expand, text)

    def normalize(self, text):
        """Expand contractions in ``text`` and split it into word tokens.

        The expansion uses the module-level ``contractions`` table and is done
        before tokenizing, because ``word_tokenize`` splits "I'm" into "I" and
        "'m", which a per-token lookup can never match.

        :return: list of word tokens
        """
        return word_tokenize(self._expand_contractions(text, contractions))

    @staticmethod
    def _merge_pair(symbols, pair):
        """Return ``symbols`` with each adjacent occurrence of ``pair`` fused.

        Scans left to right; overlapping occurrences are merged greedily, so
        ("a", "a", "a") with pair ("a", "a") becomes ("aa", "a").
        """
        merged = []
        i = 0
        while i < len(symbols):
            if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
                merged.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        return tuple(merged)

    def _learn(self, words):
        """Learn the BPE vocabulary and merge list from an iterable of words."""
        # Each distinct word is stored once, as a tuple of symbols, with its count.
        word_freqs = defaultdict(int)
        for word in words:
            word_freqs[tuple(word) + (self.END_OF_WORD,)] += 1

        # Base vocabulary: unknown symbol, then every initial symbol in sorted
        # order so the ids are reproducible across runs.
        self.vocab = {self.UNKNOWN: 0}
        for symbol in sorted({s for seq in word_freqs for s in seq}):
            self.vocab.setdefault(symbol, len(self.vocab))
        self.merges = []

        while len(self.vocab) < self.vocab_size:
            pair_counts = defaultdict(int)
            for seq, freq in word_freqs.items():
                for pair in zip(seq, seq[1:]):
                    pair_counts[pair] += freq
            if not pair_counts:
                break  # every word is already a single symbol

            # Ties are broken by first occurrence (dict insertion order).
            best = max(pair_counts, key=pair_counts.get)
            self.merges.append(best)
            # setdefault: two different pairs can concatenate to the same string.
            self.vocab.setdefault(best[0] + best[1], len(self.vocab))

            merged_freqs = defaultdict(int)
            for seq, freq in word_freqs.items():
                merged_freqs[self._merge_pair(seq, best)] += freq
            word_freqs = merged_freqs

        self.inv_vocab = {index: symbol for symbol, index in self.vocab.items()}

    def train(self, corpus):
        """Learn the vocabulary from ``corpus`` and print its final size.

        :param corpus: iterable of raw text strings
        """
        words = [word for text in corpus for word in self.normalize(text)]
        self._learn(words)
        print("Vocabulary Size:", len(self.vocab))

    def _encode_word(self, word):
        """Return the list of vocabulary ids for a single word."""
        symbols = tuple(word) + (self.END_OF_WORD,)
        for pair in self.merges:
            if len(symbols) == 1:
                break
            symbols = self._merge_pair(symbols, pair)
        unknown_id = self.vocab.get(self.UNKNOWN, 0)
        return [self.vocab.get(symbol, unknown_id) for symbol in symbols]

    def encode(self, text):
        """Normalize ``text`` and convert it to a flat list of vocabulary ids.

        Symbols that were not seen during training map to the unknown id.
        """
        encoded_text = []
        for word in self.normalize(text):
            encoded_text.extend(self._encode_word(word))
        return encoded_text

    def decode(self, text):
        """Convert a sequence of vocabulary ids back into a list of words.

        Ids that are not in the vocabulary are skipped. The result is a list
        of strings, so ``' '.join(tokenizer.decode(ids))`` rebuilds the text.

        :param text: iterable of integer ids as produced by ``encode``
        :return: list of decoded words
        """
        joined = "".join(
            self.inv_vocab[token] for token in text if token in self.inv_vocab
        )
        return [word for word in joined.split(self.END_OF_WORD) if word]


# Create a mapping of contractions and their expansions.
# Tokenizer.normalize reads this module-level name, so it has to be defined
# before train()/encode() are called below.
contractions = {
    "I'm": "I am",
    "you're": "you are",
    "didn't": "did not",
    # Add more contractions and their expansions as needed
}

# Example usage: a three-sentence training corpus, one per known contraction.
corpus = [
    "I'm happy",
    "you're awesome",
    "didn't know that"
]

# NOTE: this rebinds the name `tokenizer`, which problem 3 used for a function.
tokenizer = Tokenizer(vocab_size=10)
# train() prints the resulting vocabulary size.
tokenizer.train(corpus)

# NOTE(review): "did'nt" looks like a misspelling of "didn't"; as written it
# is not a key of `contractions` -- confirm whether that is intentional.
text = "you're awesome, I did'nt know that"
encoded_text = tokenizer.encode(text)
decoded_text = tokenizer.decode(encoded_text)

# decode() returns a list of strings, hence the join for display.
print("Original Text:", text)
print("Encoded Text:", encoded_text)
print("Decoded Text:", ' '.join(decoded_text))


Vocabulary Size: 18
Original Text: you're awesome, I did'nt know that
Encoded Text: [2, 3, 1, 3, 1, 3, 3, 2, 3, 1, 3, 2, 3, 1, 2, 1, 2, 3, 2, 3, 1, 2, 3, 2, 3, 2, 3, 3]
Decoded Text: m h ' h ' h h m h ' h m h ' m ' m h m h ' m h m h m h h


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 5. Using the package https://www.nltk.org/api/nltk.chat.html#module-nltk.chat, you will create a regular expression based chatbot that answers the following questions.  

    a. What’s the temperature today? (check for the temperature at weather channel)
    b. What’s my zip code? (based on the person’s location)
    c. How much is $19.99 in <currency>? (You can search Google for “exchange rate between us and <currency>”)
    d. What’s the definition of <word>? (you will need to look for the definition at Wikipedia or dictionary)
    
The chatbot structure from NLTK uses static string responses, you have to modify it (as in the example below) to allow for functional objects that can parse the web, for example.

Note that you will need to process HTML to create the answers, and you will use Beautiful Soup to do that. 

https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.html?highlight=select

In [None]:
pip install geocoder

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting ratelim (from geocoder)
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.chat.util import Chat, reflections
import random
import requests
from bs4 import BeautifulSoup
import geocoder
from nltk.tokenize import word_tokenize
import re

def get_temp(prompt):
    """Return a sentence with the current temperature scraped from weather.com.

    The location is fixed by the hard-coded URL below; ``prompt`` (the user's
    input) is accepted for the handler interface but not used.

    :param prompt: the user's input string (unused)
    :return: the answer, or an apology when the page cannot be fetched or the
        temperature element is not present in it
    """
    location_url = "https://weather.com/weather/today/l/4fb2eb4b20edcb606887ba1528d1e7f0ca27f832876ed59016bbbf08547ad493"
    try:
        # A timeout keeps the chat loop from hanging on an unresponsive server.
        response = requests.get(location_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return "Sorry, I could not reach the weather service."

    soup = BeautifulSoup(response.content, 'html.parser')
    # The class name is specific to the page markup at the time of writing;
    # find() returns None when it is absent, which must not be dereferenced.
    temperature_tag = soup.find('span', {'class': 'CurrentConditions--tempValue--MHmYY'})
    if temperature_tag is None:
        return "Sorry, I could not find the current temperature."
    temperature = temperature_tag.get_text()
    return f"The current temperature in Santa Clara is {temperature}"

def get_zip_code(prompt):
    """Return a sentence with the postal code that geocoder reports for this IP.

    :param prompt: the user's input string (unused)
    :return: the answer, or an apology when no postal code is available
    """
    g = geocoder.ip('me')
    zipcode = g.postal

    # Without this check a failed lookup would answer "ZIP code is None."
    if not zipcode:
        return "Sorry, I could not determine your ZIP code."
    return f"The current ZIP code is {zipcode}."

def google_search_url(query):
    """Build the Google search URL for ``query``.

    The query is percent-encoded with ``quote_plus`` (spaces become ``+``), so
    characters such as ``$``, ``&``, ``#`` and ``?`` cannot break the URL.

    :param query: the free-text search query
    :return: the search URL as a string
    """
    from urllib.parse import quote_plus  # local import: not imported at file level

    return f"https://www.google.com/search?q={quote_plus(query)}"

def currency_convert(prompt):
    """Answer a currency question by scraping the Google result for ``prompt``.

    The whole prompt is used as the search query. The answer is read from a
    ``div`` with the result-card class nested inside another ``div`` of the
    same class.

    :param prompt: the user's input string, used verbatim as the search query
    :return: the answer, or an apology when the page cannot be fetched or the
        expected element is not present in it
    """
    prompt_url = google_search_url(prompt)
    try:
        # A timeout keeps the chat loop from hanging on an unresponsive server.
        response = requests.get(prompt_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        return "Sorry, I could not reach the search service."

    soup = BeautifulSoup(response.content, 'html.parser')
    result_class = {'class': 'BNeawe iBp4i AP7Wnd'}
    # find() returns None when the markup differs; guard both lookups.
    outer = soup.find('div', result_class)
    inner = outer.find('div', result_class) if outer is not None else None
    if inner is None:
        return "Sorry, I could not find the converted amount."
    amount = inner.get_text()
    return f"The correct amount is {amount}"

def get_word(prompt):
    """Return the word the user wants defined.

    Scans the tokens of ``prompt`` from the end and returns the first one that
    does not start with closing punctuation or with "mean"/"meaning".

    :param prompt: the user's input string
    :return: the selected token, or an empty string when every token is filler
        (or the prompt is empty) -- the previous version raised IndexError
    """
    filler = re.compile(r"[\.\?\!);:]|mean(ing)?")
    for token in reversed(word_tokenize(prompt)):
        if not filler.match(token):
            return token
    return ""

def get_definition(prompt):
    """Look up the word named in ``prompt`` and print its short definitions.

    The definitions are printed one per line; the returned string is the
    closing sentence shown by the chatbot.

    :param prompt: the user's input string
    :return: a closing sentence on success, otherwise an apology
    """
    import os  # local imports: neither is imported at file level
    from urllib.parse import quote

    not_found = "Sorry, the word could not be found in the dictionary."

    word = get_word(prompt)
    if not word:
        return "Sorry, I could not tell which word you want defined."

    # SECURITY NOTE(review): an API key is hard-coded in source. It is kept as
    # the fallback so existing behavior is unchanged, but it should be rotated
    # and supplied through the DICTIONARY_API_KEY environment variable.
    api_key = os.environ.get("DICTIONARY_API_KEY", "3ac25ed7-e378-4c27-aaa4-1128d7d12335")
    url = f"https://dictionaryapi.com/api/v3/references/collegiate/json/{quote(word, safe='')}?key={api_key}"
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return "Sorry, the dictionary service could not be reached."
    if response.status_code != 200:
        return not_found

    try:
        entries = response.json()
    except ValueError:
        return not_found
    # The payload may be an empty list or a list of plain strings rather than
    # entry dicts; only a dict entry carries "shortdef".
    first = entries[0] if isinstance(entries, list) and entries else None
    definitions = first.get("shortdef") if isinstance(first, dict) else None
    if not definitions:
        return not_found

    print(f"The definition(s) of {word} is/are: ")
    for i, definition in enumerate(definitions):
        print(f"{i+1}. {definition}")
    return "I hope this answers your question."

# Pattern table for the chatbot. Each entry pairs a regular expression with a
# tuple of handler callables. The replacement respond() defined below tries
# the entries in order and calls the first handler of the first pattern that
# matches, so earlier entries take priority over later ones.
responses = (
    (
        # Greetings ("hello...", "good <word>") or any input mentioning
        # "temperature" -> weather lookup.
        r"(hello(.*))|(good [a-zA-Z]+)|((.*)[Tt]emperature(.*))",
        (
            get_temp,
        ),
    ),
    (
        # Inputs mentioning "zip code", "zip", "location" or "where".
        r"((.*)[zZ]ip [cC]ode(.*))|(.*)[zZ]ip(.*)|(.*)[lL]ocation(.*)|(.*)[wW]here(.*)",
        (
            get_zip_code,
        ),
    ),
    (
        # Any input containing a number (the "$" prefix and "%" suffix are
        # optional) or the word "dollar(s)". NOTE: because the first match
        # wins, a definition question that contains a digit lands here.
        r"(.*)\$?\d+(\.\d+)?%?(.*)|(.*)[dD]ollars?(.*)",
        (
            currency_convert,
        ),
    ),
    (
        # Inputs mentioning "define"/"definition", "word" or "mean"/"meaning".
        r"(.*)defin(e|ition)(.*)|(.*)[Ww]ord(.*)|(.*)mean(ing)?(.*)",
        (
            get_definition,
        ),
    )
)

def respond(self, str):
    """
    Generate a response to the user input.

    Walks ``self._pairs`` in order and answers with the first entry of the
    response tuple belonging to the first pattern that matches. A callable
    entry is invoked with the user input; a plain string entry is used as is.
    When no pattern matches, a fallback sentence is returned instead of
    ``None`` (which the chat loop would print literally).

    The parameter keeps its original name ``str`` (shadowing the builtin)
    so existing keyword callers continue to work.

    :type str: str
    :param str: The string to be mapped
    :rtype: str
    """

    # check each pattern
    for (pattern, response) in self._pairs:
        match = pattern.match(str)

        # did the pattern match?
        if not match:
            continue

        handler = response[0]
        resp = handler(str) if callable(handler) else handler
        resp = self._wildcards(resp, match)  # process wildcards

        # fix munged punctuation at the end
        if resp[-2:] == "?.":
            resp = resp[:-2] + "."
        if resp[-2:] == "??":
            resp = resp[:-2] + "?"
        return resp

    return "Sorry, I don't know how to answer that."

# Build the chatbot from the pattern table and the reflections map imported
# from nltk.chat.util.
chatbot = Chat(responses, reflections)
# Replace Chat.respond on the class (so for every Chat instance) with the
# module-level respond() above, which calls handler functions instead of
# returning static strings.
Chat.respond = respond

def chat():
    """Print the welcome banner, then start the chatbot's interactive loop."""
    rule = "*" * 75
    for line in (rule, "Chatbot!".center(75), rule, "Welcome."):
        print(line)

    chatbot.converse()
    
chat()

***************************************************************************
                                  Chatbot!                                 
***************************************************************************
Welcome.
>What’s the temperature today?
The current temperature in Santa Clara is 72°
>What’s my zip code?
The current ZIP code is 048508.
>How much is $19.99 in INR?
The correct amount is 1,226.98 Indian Rupee
>What’s the definition of love?
The definition(s) of love is/are: 
1. strong affection for another arising out of kinship or personal ties
2. attraction based on sexual desire : affection and tenderness felt by lovers
3. affection based on admiration, benevolence, or common interests
I hope this answers your question.


KeyboardInterrupt: ignored

# 6. Implement the spell checker for cell phones

Have you ever tried to type something quickly and, because the keys on a cell phone keyboard are much smaller than your finger, ended up typing the neighboring letters instead?

   a) You will implement the spell checker from the site: https://norvig.com/spell-correct.html

   b) You will change your code so that replacements only occur between neighboring letters on the keyboard. For example, in the picture, the letter 'u' can be replaced by 'y' or 'i'.

![Keyboard iPhone](keyboard.png)

In [None]:
import re
from collections import Counter

def words(text):
    """Return every run of word characters in ``text``, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the reference corpus. The file is opened in
# a `with` block so the handle is closed as soon as the text has been read
# (the previous one-liner never closed it).
with open('/content/drive/MyDrive/ELEN523/Lab 2/big.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of ``word`` in the corpus.

    ``N`` defaults to the total word count, computed once when the function
    is defined. Words missing from ``WORDS`` count as zero occurrences.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate spelling of ``word`` with the highest probability."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible corrections for ``word``, closest group first.

    Preference order: the word itself if known, then known words one edit
    away, then known words two edits away, and finally the word unchanged.
    Later groups are only computed when the earlier ones are empty.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(words):
    """Return the set of entries of ``words`` that are present in ``WORDS``."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from ``word``.

    An edit is a single-character deletion, a swap of two adjacent
    characters, a replacement by a lower-case letter, or an insertion of a
    lower-case letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # insertion of each letter at this position
        for ch in alphabet:
            results.add(head + ch + tail)

        if not tail:
            continue

        # deletion of the character at this position
        results.add(head + tail[1:])

        # replacement of the character at this position
        for ch in alphabet:
            results.add(head + ch + tail[1:])

        # transposition of this character with the next one
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Return a generator over all strings two simple edits away from ``word``.

    The first round of edits is computed when the function is called; the
    second round is produced lazily as the generator is consumed. Duplicates
    are not removed.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

In [None]:
# Spot-check the corrector on three misspellings; the recorded run below
# printed "you", "girl" and "vacation" for them.
print(correction('ypu'))
print(correction('jirl'))
print(correction('vacarion'))

you
girl
vacation


In [None]:
# Mount Google Drive in the Colab runtime at /content/drive. The corpus file
# read when building WORDS lives under this mount point, so this cell has to
# be run before that one.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 7. Implement the Weighted Minimum Edit Distance

In this problem, you will implement the weighted minimum edit distance algorithm of slide 94.

You will consider the following.

- delete and insertion cost is 1
- substitution cost is 1 if the two letters are adjacent on the keyboard, as in problem 6.b
- substitution cost is 2 if the letter is directly below or above, or two characters to the right or left (for example, in the keyboard of 6.b, replacing a 'u' by a 't' or an 'o' would have a cost of 2)
- substitution cost is infinity if that does not apply

Run the algorithm for the two words: 

- 'caft' vs 'cat' 
- 'coffee' vs 'voffrt'

In [None]:
# Keyboard layout used for substitution costs, one string per row. Columns are
# plain indexes into each row; the physical stagger between rows is ignored.
_KEYBOARD_ROWS = ('1234567890', 'qwertyuiop', 'asdfghjkl', 'zxcvbnm')

# Lookup table key -> (row, column), built once instead of on every call.
_KEY_POSITIONS = {
    key: (row_index, col_index)
    for row_index, row in enumerate(_KEYBOARD_ROWS)
    for col_index, key in enumerate(row)
}


def keyboard_distance(char1, char2):
    """
    Compute the substitution cost between two characters from their position
    on a QWERTY keyboard.

    Costs follow the rules stated for this problem:

    - 0 if both characters are the same key
    - 1 if the keys are direct left/right neighbours in the same row
    - 2 if the keys are two apart in the same row, or directly above/below
      each other (same column index, neighbouring rows)
    - infinity otherwise, including characters that are not on the layout
      (upper-case letters, punctuation); these previously raised
      UnboundLocalError

    :param char1: first character
    :param char2: second character
    :return: 0, 1, 2 or ``float('inf')``
    """
    pos1 = _KEY_POSITIONS.get(char1)
    pos2 = _KEY_POSITIONS.get(char2)
    if pos1 is None or pos2 is None:
        return float('inf')

    row_diff = abs(pos1[0] - pos2[0])
    col_diff = abs(pos1[1] - pos2[1])

    if row_diff == 0 and col_diff <= 2:
        # Same row: the column distance (0, 1 or 2) is exactly the cost.
        return col_diff
    if row_diff == 1 and col_diff == 0:
        return 2  # Directly above or below
    return float('inf')


def weighted_edit_distance(word1, word2):
    """
    Compute the weighted minimum edit distance between two words.

    Deleting or inserting a character costs 1. Substituting one character for
    a different one costs ``keyboard_distance`` of the two characters;
    matching characters cost nothing.
    """
    n = len(word2)

    # Row 0 of the table: turning the empty prefix of word1 into each prefix
    # of word2 takes one insertion per character.
    previous = list(range(n + 1))

    for i, ch1 in enumerate(word1, start=1):
        # Column 0: turning the first i characters into "" takes i deletions.
        current = [i]
        for j, ch2 in enumerate(word2, start=1):
            if ch1 == ch2:
                substitution = 0  # No operation needed
            else:
                substitution = keyboard_distance(ch1, ch2)
            best = min(
                previous[j] + 1,  # Deletion
                current[j - 1] + 1,  # Insertion
                previous[j - 1] + substitution  # Substitution
            )
            current.append(best)
        previous = current

    return previous[n]


# Example usage: run the algorithm on the two word pairs requested in the
# problem statement and print each result.
word1 = 'caft'
word2 = 'cat'
distance = weighted_edit_distance(word1, word2)
print(f"Weighted Edit Distance between '{word1}' and '{word2}': {distance}")

# Second pair; `word1`, `word2` and `distance` are simply reused.
word1 = 'coffee'
word2 = 'voffrt'
distance = weighted_edit_distance(word1, word2)
print(f"Weighted Edit Distance between '{word1}' and '{word2}': {distance}")


Weighted Edit Distance between 'caft' and 'cat': 1
Weighted Edit Distance between 'coffee' and 'voffrt': 4
