In [5]:
import pandas as pd

df = pd.read_csv(
    "IMDB Dataset.csv",
    engine="python",
    encoding="utf-8",
    on_bad_lines="skip"
)

df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.shape
df['sentiment'].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,13064
negative,13014


In [7]:
from collections import Counter
import json

def get_stats(vocab):
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols)-1):
            pairs[(symbols[i], symbols[i+1])] += freq
    return pairs

def merge_vocab(pair, vocab):
    new_vocab = {}
    bigram = " ".join(pair)
    replacement = "".join(pair)
    for word in vocab:
        new_word = word.replace(bigram, replacement)
        new_vocab[new_word] = vocab[word]
    return new_vocab


In [8]:
def train_bpe(corpus, num_merges=50):
    vocab = Counter()
    for sentence in corpus:
        for word in sentence.split():
            vocab[" ".join(list(word)) + " </w>"] += 1

    for _ in range(num_merges):
        pairs = get_stats(vocab)
        if not pairs:
            break
        best = pairs.most_common(1)[0][0]
        vocab = merge_vocab(best, vocab)

    return vocab


In [9]:
corpus = df['review'].str.lower().tolist()[:2000]
bpe_vocab = train_bpe(corpus)

len(bpe_vocab)


50328

In [10]:
with open("subword_vocab.json", "w") as f:
    json.dump(list(bpe_vocab.keys()), f)

print("subword_vocab.json saved")


subword_vocab.json saved


In [11]:
def tokenize(sentence):
    tokens = []
    for word in sentence.lower().split():
        tokens.extend(list(word))
    return tokens

for i in range(10):
    print(tokenize(corpus[i]))


['o', 'n', 'e', 'o', 'f', 't', 'h', 'e', 'o', 't', 'h', 'e', 'r', 'r', 'e', 'v', 'i', 'e', 'w', 'e', 'r', 's', 'h', 'a', 's', 'm', 'e', 'n', 't', 'i', 'o', 'n', 'e', 'd', 't', 'h', 'a', 't', 'a', 'f', 't', 'e', 'r', 'w', 'a', 't', 'c', 'h', 'i', 'n', 'g', 'j', 'u', 's', 't', '1', 'o', 'z', 'e', 'p', 'i', 's', 'o', 'd', 'e', 'y', 'o', 'u', "'", 'l', 'l', 'b', 'e', 'h', 'o', 'o', 'k', 'e', 'd', '.', 't', 'h', 'e', 'y', 'a', 'r', 'e', 'r', 'i', 'g', 'h', 't', ',', 'a', 's', 't', 'h', 'i', 's', 'i', 's', 'e', 'x', 'a', 'c', 't', 'l', 'y', 'w', 'h', 'a', 't', 'h', 'a', 'p', 'p', 'e', 'n', 'e', 'd', 'w', 'i', 't', 'h', 'm', 'e', '.', '<', 'b', 'r', '/', '>', '<', 'b', 'r', '/', '>', 't', 'h', 'e', 'f', 'i', 'r', 's', 't', 't', 'h', 'i', 'n', 'g', 't', 'h', 'a', 't', 's', 't', 'r', 'u', 'c', 'k', 'm', 'e', 'a', 'b', 'o', 'u', 't', 'o', 'z', 'w', 'a', 's', 'i', 't', 's', 'b', 'r', 'u', 't', 'a', 'l', 'i', 't', 'y', 'a', 'n', 'd', 'u', 'n', 'f', 'l', 'i', 'n', 'c', 'h', 'i', 'n', 'g', 's', 'c',

In [12]:
tokenized_corpus = [list(text.lower()) for text in corpus]


In [13]:
from collections import Counter

vocab_counter = Counter([t for sent in tokenized_corpus for t in sent])
vocab = {w:i for i,(w,_) in enumerate(vocab_counter.most_common(500))}
index_to_word = {i:w for w,i in vocab.items()}


In [14]:
import torch

embedding_dim = 50
embeddings = torch.randn(len(vocab), embedding_dim, requires_grad=True)

optimizer = torch.optim.SGD([embeddings], lr=0.05)
loss_fn = torch.nn.MSELoss()


In [19]:
import torch

vocab_size = len(vocab)
co_matrix = torch.zeros(vocab_size, vocab_size)

window_size = 2
max_sentences = 300

for sentence in tokenized_corpus[:max_sentences]:
    indexed = [vocab[t] for t in sentence if t in vocab]
    for i, center in enumerate(indexed):
        for j in range(max(0, i-window_size), min(len(indexed), i+window_size+1)):
            if i != j:
                co_matrix[center, indexed[j]] += 1

# Train embeddings using matrix factorization idea
embedding_dim = 50
embeddings = torch.randn(vocab_size, embedding_dim, requires_grad=True)
optimizer = torch.optim.SGD([embeddings], lr=0.1)

epochs = 10
for epoch in range(epochs):
    optimizer.zero_grad()
    loss = torch.mean((embeddings @ embeddings.T - co_matrix) ** 2)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")


Epoch 1/10, Loss: 1140024.3750
Epoch 2/10, Loss: 1119276.1250
Epoch 3/10, Loss: 748455.8750
Epoch 4/10, Loss: 5392300.5000
Epoch 5/10, Loss: 33372665856.0000
Epoch 6/10, Loss: 7293276729451577606144.0000
Epoch 7/10, Loss: inf
Epoch 8/10, Loss: inf
Epoch 9/10, Loss: nan
Epoch 10/10, Loss: nan


In [20]:
with open("custom_embeddings.txt", "w") as f:
    for idx, word in index_to_word.items():
        vec = embeddings[idx].detach().numpy()
        f.write(word + " " + " ".join(map(str, vec)) + "\n")

print("custom_embeddings.txt saved")


custom_embeddings.txt saved


In [21]:
import numpy as np

def sentence_vector(text):
    vecs = []
    for ch in text.lower():
        if ch in vocab:
            vecs.append(embeddings[vocab[ch]].detach().numpy())
    return np.mean(vecs, axis=0) if vecs else np.zeros(50)


In [22]:
X_custom = np.array([sentence_vector(t) for t in corpus])
y = df['sentiment'][:2000].map({'positive':1,'negative':0}).values


In [24]:

X_custom = np.nan_to_num(X_custom)


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X_custom, y, test_size=0.2)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

print("Custom embeddings results")
print(classification_report(y_test, clf.predict(X_test)))


Custom embeddings results
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       204
           1       0.49      1.00      0.66       196

    accuracy                           0.49       400
   macro avg       0.24      0.50      0.33       400
weighted avg       0.24      0.49      0.32       400



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(corpus)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2)

clf2 = LogisticRegression(max_iter=1000)
clf2.fit(X_train, y_train)

print("TF-IDF baseline")
print(classification_report(y_test, clf2.predict(X_test)))


TF-IDF baseline
              precision    recall  f1-score   support

           0       0.87      0.82      0.84       210
           1       0.81      0.86      0.84       190

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

