In [1]:
import torch
import torchtext
import torchdata

#torchtext.disable_torchtext_deprecation_warning()
print("Torch version:", torch.__version__)
print("TorchText version:", torchtext.__version__)
print("CUDA available:", torch.cuda.is_available())

print(torchdata.__version__)


Torch version: 2.3.1+cu118
TorchText version: 0.18.0+cpu
CUDA available: True
0.9.0


In [2]:
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# --- Step 1: Tokenizer
tokenizer = get_tokenizer('basic_english')

# --- Step 2: Load IMDB dataset
train_iter, test_iter = IMDB(split=('train', 'test'))

# --- Step 3: Build vocabulary from train data
def yield_tokens(data_iter):
    for _, text in data_iter:  # ignore labels
        yield tokenizer(text)

TEXT = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
TEXT.set_default_index(TEXT["<unk>"])

# --- Step 4: Reset iterators
train_iter, test_iter = IMDB(split=('train', 'test'))

# --- Step 5: Prepare data for Word2Vec (token IDs)
def collate_batch(batch):
    texts = [torch.tensor(TEXT(tokenizer(text)), dtype=torch.long) for _, text in batch]
    return texts

train_data = list(train_iter)
test_data  = list(test_iter)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=8, shuffle=True, collate_fn=collate_batch)
test_loader  = torch.utils.data.DataLoader(test_data, batch_size=8, shuffle=False, collate_fn=collate_batch)

# --- Step 6: Inspect one batch
for texts in train_loader:
    print("Batch size:", len(texts))
    print("Example token IDs:", texts[0][:10])  # first 10 tokens of first text
    break


################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



Batch size: 8
Example token IDs: tensor([  10,    8,   15,  223,   37,   93,    2,    5, 2288, 1463])


In [3]:
# Load IMDB iterators
train_iter, test_iter = IMDB(split=('train', 'test'))

# Convert iterators to lists
train_data = list(train_iter)
eval_data  = list(test_iter)

# Training data
train_texts  = [text for _, text in train_data]
train_labels = [label for label, _ in train_data]

# Evaluation (test) data
eval_texts  = [text for _, text in eval_data]
eval_labels = [label for label, _ in eval_data]

# Quick check
print("Sample train review:", train_texts[0][:200], "...")  # print first 200 chars
print("Sample train label:", train_labels[0])
print("Sample eval review:", eval_texts[0][:200], "...")
print("Sample eval label:", eval_labels[0])


Sample train review: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ev ...
Sample train label: 1
Sample eval review: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Bab ...
Sample eval label: 1


In [4]:
from collections import Counter
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")
min_word_freq = 20  # example threshold

# Count word frequencies
counter = Counter()
for text in train_texts:
    counter.update(tokenizer(text))

# Keep only tokens that meet frequency threshold
filtered_tokens = [tok for tok, freq in counter.items() if freq >= min_word_freq]

# Build vocab
vocab = build_vocab_from_iterator([filtered_tokens], specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # default for unknown tokens

# Test
# Print vocab info
print("Vocabulary size:", len(vocab))
unk_index = vocab["<unk>"]
print("Index of <unk>:", unk_index)

# First 20 tokens
print("First 20 tokens:", vocab.get_itos()[:20])  # get_itos() works too
# or simply: print(vocab.itos[:20])


Vocabulary size: 13351
Index of <unk>: 0
First 20 tokens: ['<unk>', '!', '#1', '#2', '$', '$1', '$10', '$100', '$2', '$3', '$4', '$5', '&', "'", '(', ')', '*', '**', '***', '****']


In [5]:
sample_review = "this movie was absolutely amazing"
tokens = tokenizer(sample_review)  # ['this', 'movie', 'was', 'absolutely', 'amazing']

# Convert all tokens to IDs at once
ids = vocab.lookup_indices(tokens)  # <- expects a list of tokens

print("Tokens:", tokens)
print("IDs:", ids)


Tokens: ['this', 'movie', 'was', 'absolutely', 'amazing']
IDs: [12012, 7895, 12913, 286, 622]


In [6]:
vocab_size=vocab.__len__()
print(vocab_size)

13351


In [7]:
window_size = 4  # context window size of 9
max_seq_len = 256
max_norm = 1
embed_dim = 300
batch_size = 16

unk_index = 0  # default index for unknown words

# Text pipeline: map tokens → indices
text_pipeline = lambda tokens: vocab(tokens)  # vocab is callable now

sample_text = "Hello World"
tokens = tokenizer(sample_text)  # ['Hello', 'World']

# Map tokens to IDs
sample_ids = text_pipeline(tokens)

print("Tokens:", tokens)
print("Token IDs:", sample_ids)
print("Type:", type(sample_ids))



Tokens: ['hello', 'world']
Token IDs: [5679, 13190]
Type: <class 'list'>


In [8]:
import torch

window_size = 2  # context window
max_seq_len = 256
unk_index = 0    # index of <unk> in vocab

# Text pipeline using new torchtext Vocab API
def text_pipeline(text):
    tokens = tokenizer(text)
    return [vocab[token] if token in vocab else unk_index for token in tokens]

# Skip-gram collate function
def collate_skipgram(batch, text_pipeline, window_size=4, max_seq_len=256):
    batch_input_word, batch_target_words = [], []

    for review in batch:
        # Convert review (string) → token ids
        review_tokens_ids = text_pipeline(review)

        # Skip short reviews
        if len(review_tokens_ids) < window_size * 2 + 1:
            continue

        # Truncate if needed
        if max_seq_len:
            review_tokens_ids = review_tokens_ids[:max_seq_len]

        # Sliding window
        for idx in range(len(review_tokens_ids)):
            input_word = review_tokens_ids[idx]

            # Context window boundaries
            start = max(idx - window_size, 0)
            end   = min(idx + window_size + 1, len(review_tokens_ids))

            # Context words = all words in window except the input word itself
            context_words = [review_tokens_ids[i] for i in range(start, end) if i != idx]

            # Append pairs
            batch_input_word.extend([input_word] * len(context_words))
            batch_target_words.extend(context_words)

    # Convert to tensors
    batch_input_word = torch.tensor(batch_input_word, dtype=torch.long)
    batch_target_words = torch.tensor(batch_target_words, dtype=torch.long)

    return batch_input_word, batch_target_words


In [9]:
inputs, targets = collate_skipgram(
    ["this movie was absolutely fantastic and i loved it"],
    text_pipeline,
    window_size=4
)

print("Inputs shape:", inputs.shape)
print("Targets shape:", targets.shape)

if inputs.size(0) > 0:
    print("Example input word:", inputs[0].item())
    print("Example target word:", targets[0].item())
else:
    print("No training pairs generated")


Inputs shape: torch.Size([52])
Targets shape: torch.Size([52])
Example input word: 12012
Example target word: 7895


In [11]:
import torch
import torch.nn as nn

# --- SkipGram model
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, max_norm=1):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            max_norm=max_norm
        )
        self.linear = nn.Linear(
            in_features=embed_dim,
            out_features=vocab_size
        )

    def forward(self, x):
        """
        x shape: [batch_size]
        """
        x = self.embeddings(x)  # [batch_size, embed_dim]
        x = self.linear(x)      # [batch_size, vocab_size]
        return x

# --- Example: instantiate model
vocab_size = len(vocab)  # your vocab object
model = SkipGram(vocab_size, embed_dim=300).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


In [12]:
import torch
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- 1. Convert all text to token IDs
mapped_train_data = [text_pipeline(text) for text in train_texts]
mapped_eval_data  = [text_pipeline(text) for text in eval_texts]

# --- 2. Precompute skip-gram pairs
def generate_skipgram_pairs(data, window_size=4):
    inputs, targets = [], []
    for review_ids in data:
        if len(review_ids) < window_size * 2 + 1:
            continue
        for idx in range(len(review_ids) - window_size * 2):
            seq = review_ids[idx: idx + window_size * 2 + 1]
            input_word = seq[window_size]
            context = seq[:window_size] + seq[window_size+1:]
            for target_word in context:
                inputs.append(input_word)
                targets.append(target_word)
    return torch.tensor(inputs, dtype=torch.long), torch.tensor(targets, dtype=torch.long)

train_inputs, train_targets = generate_skipgram_pairs(mapped_train_data, window_size=4)
eval_inputs, eval_targets   = generate_skipgram_pairs(mapped_eval_data, window_size=4)

# --- 3. Create TensorDataset & DataLoader
traindl_skipgram = DataLoader(
    TensorDataset(train_inputs, train_targets),
    batch_size=128,
    shuffle=True,
    num_workers=4
)

evaldl_skipgram = DataLoader(
    TensorDataset(eval_inputs, eval_targets),
    batch_size=128,
    shuffle=False,
    num_workers=4
)

# --- 4. Move model to GPU
model = SkipGram(len(vocab), embed_dim=300).to(device)

# --- 5. Training example
for xb, yb in traindl_skipgram:
    xb, yb = xb.to(device), yb.to(device)
    out = model(xb)
    print("Batch on device:", out.device)
    break


Batch on device: cuda:0


In [13]:
def text_pipeline(text):
    return vocab(tokenizer(text))

def collate_skipgram(batch, text_pipeline, window_size=4, max_seq_len=256):
    batch_input_word, batch_target_words = [], []

    for _, review in batch: # The batch is a list of (label, text) tuples
        review_tokens_ids = text_pipeline(review)
        
        # Skip short reviews
        if len(review_tokens_ids) < window_size * 2 + 1:
            continue
            
        # Truncate if needed
        if max_seq_len:
            review_tokens_ids = review_tokens_ids[:max_seq_len]

        for idx in range(len(review_tokens_ids)):
            input_word = review_tokens_ids[idx]
            
            # Context window boundaries
            start = max(idx - window_size, 0)
            end   = min(idx + window_size + 1, len(review_tokens_ids))
            
            # Context words = all words in window except the input word itself
            context_words = [review_tokens_ids[i] for i in range(start, end) if i != idx]
            
            # Append pairs
            batch_input_word.extend([input_word] * len(context_words))
            batch_target_words.extend(context_words)

    # Convert to tensors
    if not batch_input_word:
        return torch.tensor([]), torch.tensor([])
        
    batch_input_word = torch.tensor(batch_input_word, dtype=torch.long)
    batch_target_words = torch.tensor(batch_target_words, dtype=torch.long)
    
    return batch_input_word, batch_target_words

In [14]:
# --- SkipGram model
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, max_norm=1):
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            max_norm=max_norm
        )
        self.linear = nn.Linear(
            in_features=embed_dim,
            out_features=vocab_size
        )
    def forward(self, x):
        x = self.embeddings(x)
        x = self.linear(x)
        return x

# --- Create DataLoaders using the collate function
batch_size = 128
traindl_skipgram = DataLoader(
    train_iter,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=partial(collate_skipgram, text_pipeline=text_pipeline, window_size=4)
)

evaldl_skipgram = DataLoader(
    test_iter,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=partial(collate_skipgram, text_pipeline=text_pipeline, window_size=4)
)

# Test the DataLoader and model
vocab_size = len(vocab)
model = SkipGram(vocab_size, embed_dim=300).to(device)

for xb, yb in traindl_skipgram:
    print(f"Batch shapes: {xb.shape}, {yb.shape}")
    out = model(xb.to(device))
    print("Model output shape:", out.shape)
    break

NameError: name 'partial' is not defined