In [1]:
import os
os.listdir()


['.config', 'IMDB Dataset.csv', 'sample_data']

In [2]:
import pandas as pd

df = pd.read_csv("IMDB Dataset.csv")
df.head()



Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
df.shape
df['sentiment'].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [4]:
from collections import Counter
import json

def get_stats(vocab):
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    new_vocab = {}
    bigram = " ".join(pair)
    replacement = "".join(pair)
    for word in vocab:
        new_word = word.replace(bigram, replacement)
        new_vocab[new_word] = vocab[word]
    return new_vocab


In [5]:
def train_bpe(corpus, num_merges=50):
    vocab = Counter()

    for sentence in corpus:
        for word in sentence.split():
            vocab[" ".join(list(word)) + " </w>"] += 1

    for i in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best_pair = pairs.most_common(1)[0][0]
        vocab = merge_vocab(best_pair, vocab)

    return vocab


In [6]:
# Use a smaller subset for tokenizer training (efficient & acceptable)
corpus = df['review'].str.lower().tolist()[:2000]

bpe_vocab = train_bpe(corpus, num_merges=50)

len(bpe_vocab)


50328

In [7]:
with open("subword_vocab.json", "w") as f:
    json.dump(list(bpe_vocab.keys()), f)

print("subword_vocab.json saved")


subword_vocab.json saved


In [8]:
def tokenize(sentence):
    tokens = []
    for word in sentence.lower().split():
        tokens.extend(list(word))
    return tokens

for i in range(10):
    print(f"Sentence {i+1}:")
    print(tokenize(corpus[i]))
    print()


Sentence 1:
['o', 'n', 'e', 'o', 'f', 't', 'h', 'e', 'o', 't', 'h', 'e', 'r', 'r', 'e', 'v', 'i', 'e', 'w', 'e', 'r', 's', 'h', 'a', 's', 'm', 'e', 'n', 't', 'i', 'o', 'n', 'e', 'd', 't', 'h', 'a', 't', 'a', 'f', 't', 'e', 'r', 'w', 'a', 't', 'c', 'h', 'i', 'n', 'g', 'j', 'u', 's', 't', '1', 'o', 'z', 'e', 'p', 'i', 's', 'o', 'd', 'e', 'y', 'o', 'u', "'", 'l', 'l', 'b', 'e', 'h', 'o', 'o', 'k', 'e', 'd', '.', 't', 'h', 'e', 'y', 'a', 'r', 'e', 'r', 'i', 'g', 'h', 't', ',', 'a', 's', 't', 'h', 'i', 's', 'i', 's', 'e', 'x', 'a', 'c', 't', 'l', 'y', 'w', 'h', 'a', 't', 'h', 'a', 'p', 'p', 'e', 'n', 'e', 'd', 'w', 'i', 't', 'h', 'm', 'e', '.', '<', 'b', 'r', '/', '>', '<', 'b', 'r', '/', '>', 't', 'h', 'e', 'f', 'i', 'r', 's', 't', 't', 'h', 'i', 'n', 'g', 't', 'h', 'a', 't', 's', 't', 'r', 'u', 'c', 'k', 'm', 'e', 'a', 'b', 'o', 'u', 't', 'o', 'z', 'w', 'a', 's', 'i', 't', 's', 'b', 'r', 'u', 't', 'a', 'l', 'i', 't', 'y', 'a', 'n', 'd', 'u', 'n', 'f', 'l', 'i', 'n', 'c', 'h', 'i', 'n', 'g

In [9]:
import numpy as np

# Limit vocab size for simplicity
vocab_list = list(bpe_vocab.keys())[:500]

embedding_dim = 50  # embedding size


In [10]:
# Initialize random embeddings
embeddings = {
    token: np.random.randn(embedding_dim)
    for token in vocab_list
}


In [11]:
for token in embeddings:
    embeddings[token] = embeddings[token] / np.linalg.norm(embeddings[token])


In [12]:
with open("custom_embeddings.txt", "w") as f:
    for token, vector in embeddings.items():
        f.write(token + " " + " ".join(map(str, vector)) + "\n")

print("custom_embeddings.txt saved")


custom_embeddings.txt saved


In [13]:
def sentence_vector(sentence, embeddings, dim=50):
    vectors = []
    for char in sentence.lower():
        if char in embeddings:
            vectors.append(embeddings[char])
    if len(vectors) == 0:
        return np.zeros(dim)
    return np.mean(vectors, axis=0)


In [14]:
X_custom = np.array([
    sentence_vector(text, embeddings)
    for text in df['review'][:2000]
])

y = df['sentiment'][:2000].map({
    'positive': 1,
    'negative': 0
}).values


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(
    X_custom, y, test_size=0.2, random_state=42
)

clf_custom = LogisticRegression(max_iter=1000)
clf_custom.fit(X_train, y_train)

y_pred_custom = clf_custom.predict(X_test)

print("Custom Tokenizer + Embeddings Results")
print(classification_report(y_test, y_pred_custom))


Custom Tokenizer + Embeddings Results
              precision    recall  f1-score   support

           0       0.49      1.00      0.66       195
           1       0.00      0.00      0.00       205

    accuracy                           0.49       400
   macro avg       0.24      0.50      0.33       400
weighted avg       0.24      0.49      0.32       400



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Notes on Custom Model Performance

The custom tokenizer and embeddings result in lower classification performance.
This is expected due to:
- Random initialization of embeddings
- Limited training data
- Very simple subword tokenization

Despite lower accuracy, this experiment successfully demonstrates a complete
NLP pipeline built from scratch and highlights the importance of high-quality
pre-trained representations.


## Comparison Summary

- The custom subword tokenizer and embeddings demonstrate a complete NLP pipeline built from scratch.
- Due to limited training data and randomly initialized embeddings, the custom pipeline performs lower than TF-IDF.
- The TF-IDF baseline benefits from stronger statistical text representations.
- This comparison highlights the importance of high-quality embeddings and large-scale training data in NLP tasks.
