# NLP Practice
#### Made by: ALI ALSINAN

## Step 1: Tokenization and Indexing of Given Corpus 

In [1]:
# 1) Define a corpus

# Three short documents with overlapping words; the last one repeats "like".
corpus = ["I like NLP", "I like ML", "I like like ML"]

In [2]:
# 2) Tokenization

# Lowercase each document, then split on whitespace to get its tokens
tokenized_corpus = [sentence.lower().split() for sentence in corpus]

print("Tokenized corpus:")
for tokens in tokenized_corpus:
    print(tokens)

Tokenized corpus:
['i', 'like', 'nlp']
['i', 'like', 'ml']
['i', 'like', 'like', 'ml']


In [3]:
# 3) Build vocabulary

# Flatten all documents into a single list of tokens (repeats included)
vocab_with_duplicates = [token for tokens in tokenized_corpus for token in tokens]

# Keep one copy of each token; sorting gives a deterministic order
vocab = sorted(set(vocab_with_duplicates))
print(vocab)


['i', 'like', 'ml', 'nlp']


In [4]:
# 4) Build word2idx

# Map every vocabulary word to its position in the sorted vocabulary list
word2idx = {word: index for index, word in enumerate(vocab)}

print("word2idx:", word2idx)

word2idx: {'i': 0, 'like': 1, 'ml': 2, 'nlp': 3}


In [5]:
# 5) Combined tokenization + vocabulary + word2idx

# Recap of the three objects built above, printed one per line
for label, value in (
    ("Tokenized corpus:", tokenized_corpus),
    ("Vocab:", vocab),
    ("word2idx:", word2idx),
):
    print(label, value)

Tokenized corpus: [['i', 'like', 'nlp'], ['i', 'like', 'ml'], ['i', 'like', 'like', 'ml']]
Vocab: ['i', 'like', 'ml', 'nlp']
word2idx: {'i': 0, 'like': 1, 'ml': 2, 'nlp': 3}


## Step 2: Bag-of-Words Representation

In [6]:
# 1) Bag of Words Vector Function

import numpy as np

def BOW_vector(tokens, word2idx, *, skip_unknown=False):
    """Count how often each vocabulary word occurs in one document.

    Parameters
    ----------
    tokens : iterable
        Tokens of a single document; each token is looked up in ``word2idx``.
    word2idx : dict
        Maps each vocabulary word to its position in the output vector.
        Positions must be valid indices into an array of length
        ``len(word2idx)``.
    skip_unknown : bool, default False
        When False, a token that is not a key of ``word2idx`` raises
        ``KeyError`` (the original behaviour). When True, such tokens are
        ignored, which allows vectorizing documents that contain words
        outside the vocabulary.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``len(word2idx)``; the entry at
        ``word2idx[w]`` is the number of times ``w`` appears in ``tokens``.

    Raises
    ------
    KeyError
        If ``skip_unknown`` is False and a token is missing from ``word2idx``.
    """
    vector = np.zeros(len(word2idx), dtype=int)
    for word in tokens:
        # Out-of-vocabulary tokens have no slot in the vector; drop them only
        # when the caller explicitly asked for it.
        if skip_unknown and word not in word2idx:
            continue
        vector[word2idx[word]] += 1
    return vector


In [7]:
# 2) Finding Bag-of-Words Matrix

# One count vector per document, stacked as the rows of a single 2-D array
BOWs = np.vstack([BOW_vector(doc_tokens, word2idx) for doc_tokens in tokenized_corpus])

print("Vocab order:", vocab)
print("BoW matrix:\n", BOWs)

Vocab order: ['i', 'like', 'ml', 'nlp']
BoW matrix:
 [[1 1 0 1]
 [1 1 1 0]
 [1 2 1 0]]
