# Tokenization

In [8]:
import nltk
# Fetch the NLTK data this notebook relies on: the Punkt tokenizer models
# (needed for tokenization) and the stop-word corpus (needed to remove
# stop words).
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt_tab to /home/dsm/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dsm/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import nltk

sentence = "Don't you love NLP? It's amazing!"

# Tokenize the same text at two granularities: whole sentences and
# individual word/punctuation tokens. The two calls are independent.
sentences = nltk.sent_tokenize(sentence)
words = nltk.word_tokenize(sentence)

# Show the word tokens first, then the sentences, one list per line.
print(words, sentences, sep="\n")

['Do', "n't", 'you', 'love', 'NLP', '?', 'It', "'s", 'amazing', '!']
["Don't you love NLP?", "It's amazing!"]


# Text Preprocessing

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

text = "I love Natural Language Processing!"

# Lower-case the text before tokenizing: the membership test against the
# stop-word set below is case-sensitive.
tokens = word_tokenize(text.lower())

# Build the stop-word collection as a set so each lookup is a hash test.
stop_words = set(stopwords.words('english'))

# Keep every token that is not a stop word. Punctuation is not filtered
# here, so '!' survives.
filtered = [token for token in tokens if token not in stop_words]

print(filtered)
# ['love', 'natural', 'language', 'processing', '!']

['love', 'natural', 'language', 'processing', '!']


## Context-Free Grammar

In [None]:
import nltk

# Define some formal rules (formal grammar).
# Non-terminals: S = sentence, NP = noun phrase, VP = verb phrase,
# AP = adjective phrase, PP = prepositional phrase; A, D, N, P, V expand to
# the quoted terminal words.
grammar = nltk.CFG.fromstring("""
    S -> NP VP

    AP -> A | A AP
    NP -> N | D NP | AP NP | N PP
    PP -> P NP
    VP -> V | V NP | V NP PP

    A -> "big" | "blue" | "small" | "dry" | "wide"
    D -> "the" | "a" | "an"
    N -> "she" | "city" | "car" | "street" | "dog" | "binoculars"
    P -> "on" | "over" | "before" | "below" | "with"
    V -> "saw" | "walked"
""")

parser = nltk.ChartParser(grammar)

# sentence = input("Sentence: ").split()
sentence = "she saw a small dog".split()

try:
    # Only the first parse tree is shown. `next(..., None)` also covers the
    # case where every word is known to the grammar but no derivation exists:
    # the parser then yields nothing, which a plain `for` loop would skip
    # silently without printing any message.
    tree = next(iter(parser.parse(sentence)), None)
except ValueError:
    # Raised when the sentence contains a word the grammar does not cover.
    tree = None

if tree is None:
    print("No parse tree possible.")
else:
    tree.pretty_print()
    # NOTE: draw() opens a separate Tk window and blocks the cell until that
    # window is closed; it needs a graphical display to be available.
    tree.draw()

         S                   
  _______|___                 
 |           VP              
 |    _______|____            
 |   |            NP         
 |   |    ________|____       
 |   |   |             NP    
 |   |   |         ____|___   
 NP  |   |        AP       NP
 |   |   |        |        |  
 N   V   D        A        N 
 |   |   |        |        |  
she saw  a      small     dog



## One-Hot Encoding

In [28]:
# Small example vocabulary used for the one-hot encodings; a word's position
# in this list is the index of its "hot" entry.
vocabulary = [
    "cat",
    "dog",
    "bird",
]

def one_hot_encode(word, vocab):
    """Return the one-hot encoding of `word` with respect to `vocab`.

    The result is a list of `len(vocab)` integers. It holds a 1 at the
    position of the first occurrence of `word` in `vocab` and 0 everywhere
    else. If `word` is not in `vocab`, every entry is 0.
    """
    size = len(vocab)

    # Unknown words map to the all-zero vector.
    if word not in vocab:
        return [0] * size

    hot_position = vocab.index(word)
    return [1 if position == hot_position else 0 for position in range(size)]

# Examples: encode every word of the vocabulary in turn, so the demo stays in
# sync with `vocabulary` instead of repeating one print line per word.
# Expected output for the three-word vocabulary:
#   'cat' encoded: [1, 0, 0]
#   'dog' encoded: [0, 1, 0]
#   'bird' encoded: [0, 0, 1]
for vocab_word in vocabulary:
    print(f"'{vocab_word}' encoded: {one_hot_encode(vocab_word, vocabulary)}")

'cat' encoded: [1, 0, 0]
'dog' encoded: [0, 1, 0]
'bird' encoded: [0, 0, 1]
