# TRAIN AND LEARN OF word_language_model

## Model code

## Uploading files from Google Disk

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
from __future__ import unicode_literals, print_function, division
import os
from io import open
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
import time
import random
import argparse
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from collections import Counter
from typing import Dict, List, Union, Tuple, Optional

### data.py

In [29]:
class Dictionary(object):
    def __init__(self) -> None:
        self.word2idx: Dict[str, int] = {}
        self.idx2word: List[str] = []
        self.word2count: Counter = Counter()

    def add_word(self, word: str) -> int:
        self.word2count[word] += 1
        return self.word2idx.get(word, -1)

    def finalize(self, min_freq: int = 5) -> None:
        for word, count in self.word2count.items():
            if count >= min_freq and word not in self.word2idx:
                self.idx2word.append(word)
                self.word2idx[word] = len(self.idx2word) - 1
        self.word2idx['<unk>'] = len(self.idx2word)
        self.idx2word.append('<unk>')

    def __len__(self) -> int:
        return len(self.idx2word)

class Corpus(object):
  """Corpus class for loading and tokenizing text data."""
  def __init__(self, path: str, min_freq: int = 5) -> None:
      self.dictionary: Dictionary = Dictionary()
      self.min_freq: int = min_freq
      self.train: torch.Tensor = self.tokenize(os.path.join(path, 'train.txt'))
      self.valid: torch.Tensor = self.tokenize(os.path.join(path, 'valid.txt'))
      self.test: torch.Tensor = self.tokenize(os.path.join(path, 'test.txt'))

  def tokenize(self, path: str) -> torch.Tensor:
      """Tokenizes a text file."""
      assert os.path.exists(path)
      # Add words to the dictionary
      with open(path, 'r', encoding="utf8") as f:
          for line in f:
              words: List[str] = line.split() + ['<eos>']
              for word in words:
                  self.dictionary.add_word(word)
      self.dictionary.finalize(self.min_freq)
      # Tokenize file content
      with open(path, 'r', encoding="utf8") as f:
          idss: List[torch.Tensor] = []
          for line in f:
              words: List[str] = line.split() + ['<eos>']
              ids: List[int] = []
              for word in words:
                  idx = self.dictionary.word2idx.get(word, self.dictionary.word2idx['<unk>'])
                  ids.append(idx)
              idss.append(torch.tensor(ids, dtype=torch.int64))
          ids_tensor: torch.Tensor = torch.cat(idss)

      return ids_tensor

### model.py

In [30]:
# ===============================
# MODEL ARCHITECTURES (model.py)
# ===============================

class RNNModel(nn.Module):
    """RNN-based language model (LSTM/GRU/RNN)."""

    def __init__(
        self,
        rnn_type: str,
        ntoken: int,
        ninp: int,
        nhid: int,
        nlayers: int,
        dropout: float = 0.5,
        tie_weights: bool = False
    ) -> None:
        super(RNNModel, self).__init__()
        self.ntoken: int = ntoken
        self.rnn_type: str = rnn_type
        self.nhid: int = nhid
        self.nlayers: int = nlayers

        self.drop: nn.Dropout = nn.Dropout(dropout)
        self.encoder: nn.Embedding = nn.Embedding(ntoken, ninp)

        if rnn_type in ['LSTM', 'GRU']:
            self.rnn: nn.Module = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity: str = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError as e:
                raise ValueError(
                    "Invalid option for `--model`. "
                    "Options are ['LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU']"
                ) from e
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)

        self.decoder: nn.Linear = nn.Linear(nhid, ntoken)

        # Tie weights
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self) -> None:
        """Initialize weights."""
        initrange: float = 0.1
        nn.init.uniform_(self.encoder.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, input: torch.Tensor, hidden: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        emb: torch.Tensor = self.drop(self.encoder(input))
        output: torch.Tensor
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded: torch.Tensor = self.decoder(output)
        decoded = decoded.view(-1, self.ntoken)
        return F.log_softmax(decoded, dim=1), hidden

    def init_hidden(self, bsz: int) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Initialize hidden state."""
        weight: torch.Tensor = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (
                weight.new_zeros(self.nlayers, bsz, self.nhid),
                weight.new_zeros(self.nlayers, bsz, self.nhid)
            )
        else:
            return weight.new_zeros(self.nlayers, bsz, self.nhid)

# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens in the sequence.
        The positional encodings have the same dimension as the embeddings, so that the two can be summed.
        Here, we use sine and cosine functions of different frequencies.
    .. math:
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    """Positional encoding for Transformer."""

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000) -> None:
        super(PositionalEncoding, self).__init__()
        self.dropout: nn.Dropout = nn.Dropout(p=dropout)

        pe: torch.Tensor = torch.zeros(max_len, d_model)
        position: torch.Tensor = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term: torch.Tensor = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Transformer):
    """Transformer-based language model."""

    def __init__(
        self,
        ntoken: int,
        ninp: int,
        nhead: int,
        nhid: int,
        nlayers: int,
        dropout: float = 0.5
    ) -> None:
        super(TransformerModel, self).__init__(
            d_model=ninp,
            nhead=nhead,
            dim_feedforward=nhid,
            num_encoder_layers=nlayers
        )
        self.model_type: str = 'Transformer'
        self.src_mask: Optional[torch.Tensor] = None
        self.pos_encoder: PositionalEncoding = PositionalEncoding(ninp, dropout)

        self.input_emb: nn.Embedding = nn.Embedding(ntoken, ninp)
        self.ninp: int = ninp
        self.decoder: nn.Linear = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz: int) -> torch.Tensor:
        """Generate mask for causal attention."""
        return torch.log(torch.tril(torch.ones(sz, sz)))

    def init_weights(self) -> None:
        """Initialize weights."""
        initrange: float = 0.1
        nn.init.uniform_(self.input_emb.weight, -initrange, initrange)
        nn.init.zeros_(self.decoder.bias)
        nn.init.uniform_(self.decoder.weight, -initrange, initrange)

    def forward(self, src: torch.Tensor, has_mask: bool = True) -> torch.Tensor:
        if has_mask:
            device: torch.device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask: torch.Tensor = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.input_emb(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output: torch.Tensor = self.encoder(src, mask=self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [31]:
# Label smoothing loss
class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing: float = 0.0) -> None:
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing: float = smoothing
        self.confidence: float = 1.0 - smoothing

    def forward(self, output: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        log_probs: torch.Tensor = output
        n_classes: int = log_probs.size(-1)
        with torch.no_grad():
            true_dist: torch.Tensor = torch.zeros_like(log_probs)
            true_dist.fill_(self.smoothing / (n_classes - 1))
            true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))

### Test train on original data

In [48]:
# ===============================
# TRAINING & EVALUATION (main.py)
# ===============================

def get_lr(step: float, d_model: float, warmup_steps: int) -> float:
    lr: float = d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)
    return lr

def batchify(data: torch.Tensor, bsz: int, device: torch.device) -> torch.Tensor:
    """Divide data into batches."""
    nbatch: int = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)


def get_batch(source: torch.Tensor, i: int, bptt: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Get a batch of data."""
    seq_len: int = min(bptt, len(source) - 1 - i)
    data: torch.Tensor = source[i:i+seq_len]
    target: torch.Tensor = source[i+1:i+1+seq_len].view(-1)
    return data, target


def repackage_hidden(h: Union[torch.Tensor, Tuple]) -> Union[torch.Tensor, Tuple]:
    """Detach hidden state from history."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

def top_k_sampling(logits: torch.Tensor, k: int, temperature: float = 1.0) -> Tuple[torch.Tensor, torch.Tensor]:
    values, indices = torch.topk(logits, k)
    values = values.div(temperature).exp()
    values = values / values.sum()
    return torch.multinomial(values, 1), indices

def evaluate(
    model: nn.Module,
    data_source: torch.Tensor,
    criterion: nn.Module,
    bptt: int,
    ntokens: int,
    eval_batch_size: int,
    is_transformer: bool
) -> float:
    """Evaluate the model."""
    model.eval()
    total_loss: float = 0.0
    if not is_transformer:
        hidden = model.init_hidden(eval_batch_size)

    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data: torch.Tensor
            targets: torch.Tensor
            data, targets = get_batch(data_source, i, bptt)

            if is_transformer:
                output: torch.Tensor = model(data)
                output = output.view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)

            total_loss += len(data) * criterion(output, targets).item()

    return total_loss / (len(data_source) - 1)


def train_epoch(
    model: nn.Module,
    train_data: torch.Tensor,
    criterion: nn.Module,
    optimizer: optim.Optimizer,
    epoch: int,
    bptt: int,
    ntokens: int,
    batch_size: int,
    clip: float,
    log_interval: int,
    is_transformer: bool,
    use_optimizer: bool = True,
    use_warmup: bool = False,
    step: int = 0,
    d_model: int = 512,
    warmup_steps: int = 4000,
    dry_run: bool = False
) -> int:
    """Train for one epoch."""
    model.train()
    total_loss: float = 0.0
    start_time: float = time.time()

    if not is_transformer:
        hidden = model.init_hidden(batch_size)

    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data: torch.Tensor
        targets: torch.Tensor
        data, targets = get_batch(train_data, i, bptt)

        optimizer.zero_grad()

        if is_transformer:
            output: torch.Tensor = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)

        loss: torch.Tensor = criterion(output, targets)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        if use_warmup:
            for param_group in optimizer.param_groups:
                param_group['lr'] = get_lr(step + 1, d_model, warmup_steps)
        if use_optimizer:
            optimizer.step()
        else:
            for p in model.parameters():
                p.data.add_(p.grad, alpha=-optimizer.param_groups[0]['lr'])

        total_loss += loss.item()
        step += 1

        if batch % log_interval == 0 and batch > 0:
            cur_loss: float = total_loss / log_interval
            elapsed: float = time.time() - start_time
            print(
                f'| epoch {epoch:3d} | {batch:5d}/{len(train_data) // bptt:5d} batches | '
                f'lr {optimizer.param_groups[0]["lr"]:02.6f} | ms/batch {elapsed * 1000 / log_interval:5.2f} | '
                f'loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}'
            )
            total_loss = 0
            start_time = time.time()
        if dry_run:
            break
    return step

# model_type: str = 'Transformer',
#     data_path: str = '/content/drive/MyDrive/data_word_train/wikitext-2',
#     emsize: int = 512,
#     nhid: int = 2048,
#     nlayers: int = 6,
#     lr: float = 0.0001,
#     clip: float = 0.25,
#     epochs: int = 60,
#     batch_size: int = 32,
#     bptt: int = 50,
#     dropout: float = 0.1,
#     tied: bool = False,
#     nhead: int = 8,
#     log_interval: int = 200,
#     save_path: str = 'model_1.pt',
#     onnx_export: str = '',
#     dry_run: bool = False,
#     accel: bool = True,
#     use_optimizer: bool = True,
#     optimizer: Optional[optim.Optimizer] = None,
#     criterion: Optional[nn.Module] = None,
#     use_label_smoothing: bool = True,
#     label_smoothing: float = 0.1,
#     use_warmup: bool = True,
#     warmup_steps: int = 4000,
#     min_freq: int = 5


def train_model(
    model_type: str = 'LSTM', # RNN_TANH, RNN_RELU, LSTM, GRU, Transformer
    data_path: str = '/content/drive/MyDrive/data_word_train/wikitext-2',
    emsize: int = 200,
    nhid: int = 200,
    nlayers: int = 2,
    lr: float = 0.001,
    clip: float = 0.25,
    epochs: int = 40,
    batch_size: int = 20,
    bptt: int = 35,
    dropout: float = 0.2,
    tied: bool = False,
    nhead: int = 2,
    log_interval: int = 200,
    save_path: str = 'model.pt',
    onnx_export: str = '',
    dry_run: bool = False,
    accel: bool = True,
    use_optimizer: bool = True,
    optimizer: Optional[optim.Optimizer] = None,
    criterion: Optional[nn.Module] = None,
    use_label_smoothing: bool = False,
    label_smoothing: float = 0.1,
    use_warmup: bool = False,
    warmup_steps: int = 4000,
    min_freq: int = 5
) -> None:
    """Main training function."""

    if data_path == '/content/drive/MyDrive/data_word_train/wikitext-2':
      if not os.path.exists(data_path):
          print("Downloading Wikitext-2 dataset...")
          !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -P /content/
          !unzip /content/wikitext-2-v1.zip -d /content/data_word_train/
          !mkdir -p /content/drive/MyDrive/data_word_train/
          !mv /content/data_word_train/wikitext-2 /content/drive/MyDrive/data_word_train/
          print("Wikitext-2 dataset moved to Google Drive")

    # Set device
    device: torch.device = torch.device('cuda' if accel and torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load data
    corpus: Corpus = Corpus(data_path, min_freq=min_freq)
    print(f"Vocabulary size: {len(corpus.dictionary)}")

    eval_batch_size: int = 10
    train_data: torch.Tensor = batchify(corpus.train, batch_size, device)
    val_data: torch.Tensor = batchify(corpus.valid, eval_batch_size, device)
    test_data: torch.Tensor = batchify(corpus.test, eval_batch_size, device)

    # Build model
    ntokens: int = len(corpus.dictionary)
    is_transformer: bool = model_type == 'Transformer'

    if is_transformer:
        model: nn.Module = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)
    else:
        model = RNNModel(model_type, ntokens, emsize, nhid, nlayers, dropout, tied).to(device)

    # Loss and optimizer (Adam with weight_decay as in Transformer paper)
    # optimizer: optim.Adam = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    criterion: nn.Module = criterion if criterion is not None else (LabelSmoothingLoss(smoothing=label_smoothing) if use_label_smoothing else nn.NLLLoss())
    optimizer: optim.Optimizer = optimizer if optimizer is not None else optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # criterion = LabelSmoothingLoss(smoothing=args.label_smoothing)
    # optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(0.9, 0.98), eps=1e-9)

    scheduler: optim.lr_scheduler.ReduceLROnPlateau = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=2
    ) if not use_warmup else None

    # Training loop
    best_val_loss: Optional[float] = None
    global_step: int = 0

    try:
        for epoch in range(1, epochs + 1):
            epoch_start_time: float = time.time()

            global_step = train_epoch(
                model, train_data, criterion, optimizer, epoch,
                bptt, ntokens, batch_size, clip, log_interval, is_transformer,
                use_optimizer, use_warmup, global_step, emsize, warmup_steps
            )

            val_loss: float = evaluate(
                model, val_data, criterion, bptt, ntokens,
                eval_batch_size, is_transformer
            )

            print('-' * 89)
            print(
                f'| end of epoch {epoch:3d} | time: {time.time() - epoch_start_time:5.2f}s | '
                f'valid loss {val_loss:5.2f} | valid ppl {math.exp(val_loss):8.2f}'
            )
            print('-' * 89)

            # Save best model
            if not best_val_loss or val_loss < best_val_loss:
                with open(save_path, 'wb') as f:
                    torch.save(model, f)
                best_val_loss = val_loss

            # Learning rate scheduling
            # lr /= 4.0
            if use_warmup:
                print(f"Current learning rate: {optimizer.param_groups[0]['lr']:.6f}")
            else:
                scheduler.step(val_loss)
                print(f"Current learning rate: {optimizer.param_groups[0]['lr']:.6f}")

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')
    safe_globals: List = [
        TransformerModel, PositionalEncoding,
        torch.nn.modules.dropout.Dropout, torch.nn.modules.linear.Linear,
        torch.nn.modules.transformer.TransformerEncoder,
        torch.nn.modules.transformer.TransformerEncoderLayer,
        torch.nn.modules.activation.MultiheadAttention,
        torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
        torch.nn.modules.normalization.LayerNorm, torch.nn.functional.relu
    ]
    with torch.serialization.safe_globals(safe_globals):
        with open(save_path, 'rb') as f:
            model = torch.load(f, map_location=device)
    test_loss: float = evaluate(
        model, test_data, criterion, bptt, ntokens, eval_batch_size, is_transformer
    )
    print('=' * 89)
    print(
        f'| End of training | test loss {test_loss:5.2f} | '
        f'test ppl {math.exp(test_loss):8.2f}'
    )
    print('=' * 89)


In [46]:
# ===============================
# TEXT GENERATION (generate.py)
# ===============================

def generate_text(
    checkpoint: str = 'model.pt',
    data_path: str = '/content/drive/MyDrive/data_word_train/wikitext-2',
    outf: str = 'generated.txt',
    words: int = 1000,
    temperature: float = 1.0,
    top_k: int = 40,
    seed: int = 1111,
    log_interval: int = 100,
    accel: bool = True,
    min_freq: int = 5,
    use_top_k: bool = False
) -> None:
    """Generate text from trained model."""

    torch.manual_seed(seed)
    device: torch.device = torch.device('cuda' if accel and torch.cuda.is_available() else 'cpu')

    # Load model
    safe_globals: List = [
        TransformerModel, PositionalEncoding,
        torch.nn.modules.dropout.Dropout, torch.nn.modules.linear.Linear,
        torch.nn.modules.transformer.TransformerEncoder,
        torch.nn.modules.transformer.TransformerEncoderLayer,
        torch.nn.modules.activation.MultiheadAttention,
        torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
        torch.nn.modules.normalization.LayerNorm, torch.nn.functional.relu
    ]
    with torch.serialization.safe_globals(safe_globals):
        with open(checkpoint, 'rb') as f:
            model: nn.Module = torch.load(f, map_location=device)
    model.eval()

    # Load corpus
    corpus: Corpus = Corpus(data_path, min_freq=min_freq)
    print(f"Vocabulary size: {len(corpus.dictionary)}")
    ntokens: int = len(corpus.dictionary)

    is_transformer: bool = hasattr(model, 'model_type') and model.model_type == 'Transformer'
    if not is_transformer:
        hidden = model.init_hidden(1)

    input: torch.Tensor = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

    with open(outf, 'w') as outfile:
        with torch.no_grad():
            for i in range(words):
                if is_transformer:
                    output: torch.Tensor = model(input, False)
                    if use_top_k:
                        word_weights: torch.Tensor = output[-1].squeeze().cpu()
                        prob, top_indices = top_k_sampling(word_weights, top_k, temperature)
                        word_idx: int = top_indices[prob.item()].item()
                    else:
                        word_weights: torch.Tensor = output[-1].squeeze().div(temperature).exp().cpu()
                        word_idx: int = torch.multinomial(word_weights, 1)[0].item()
                    word_tensor: torch.Tensor = torch.Tensor([[word_idx]]).long().to(device)
                    input = torch.cat([input, word_tensor], 0)

                else:
                    output, hidden = model(input, hidden)
                    word_weights = output.squeeze().div(temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0].item()
                    input.fill_(word_idx)

                word: str = corpus.dictionary.idx2word[word_idx]
                if word == '@-@':
                  word = ' '
                outfile.write(word + ('\n' if i % 20 == 19 else ' '))

                if i % log_interval == 0:
                    print(f'| Generated {i}/{words} words')

In [None]:
# Example 1: Train on WikiText-2
print("Training LSTM on WikiText-2...")
train_model(
    model_type='LSTM',
    data_path='/content/drive/MyDrive/data_word_train/wikitext-2',
    emsize=400,
    nhid=400,
    nlayers=4,
    epochs=60,
    lr=0.001
)

# Example 2: Generate text
print("\nGenerating text...")
generate_text(
    checkpoint='model.pt',
    data_path='/content/drive/MyDrive/data_word_train/wikitext-2',
    words=1000,
    temperature=1.0
)

!cat generated.txt

# Example 3: Train on custom names dataset
# First, create the data files (see instructions below)
print("\nTraining on custom names dataset...")
# train_model(
#     model_type='LSTM',
#     data_path='./data/names',
#     emsize=128,
#     nhid=128,
#     nlayers=2,
#     epochs=20,
#     lr=0.001
# )

Training LSTM on WikiText-2...
Using device: cuda
Vocabulary size: 25251
| epoch   1 |   200/ 2983 batches | lr 0.001000 | ms/batch 20.54 | loss  7.12 | ppl  1234.27
| epoch   1 |   400/ 2983 batches | lr 0.001000 | ms/batch 20.50 | loss  6.91 | ppl  1003.33
| epoch   1 |   600/ 2983 batches | lr 0.001000 | ms/batch 20.53 | loss  6.91 | ppl   999.71
| epoch   1 |   800/ 2983 batches | lr 0.001000 | ms/batch 20.65 | loss  6.90 | ppl   995.20
| epoch   1 |  1000/ 2983 batches | lr 0.001000 | ms/batch 20.60 | loss  6.92 | ppl  1013.25
| epoch   1 |  1200/ 2983 batches | lr 0.001000 | ms/batch 20.76 | loss  6.57 | ppl   713.70
| epoch   1 |  1400/ 2983 batches | lr 0.001000 | ms/batch 20.80 | loss  6.31 | ppl   552.32
| epoch   1 |  1600/ 2983 batches | lr 0.001000 | ms/batch 20.79 | loss  6.25 | ppl   516.65
| epoch   1 |  1800/ 2983 batches | lr 0.001000 | ms/batch 20.81 | loss  6.13 | ppl   460.14
| epoch   1 |  2000/ 2983 batches | lr 0.001000 | ms/batch 20.83 | loss  6.08 | ppl   439.

In [None]:
print("Training Transformer on WikiText-2...")
train_model(
    model_type='Transformer',
    data_path='/content/drive/MyDrive/data_word_train/wikitext-2',
    emsize=512,
    nhid=2048,
    nlayers=6,
    epochs=60,
    lr=0.0001,
    batch_size = 32,
    bptt = 50,
    dropout = 0.1,
    nhead = 8,
    save_path='model_1.pt'
)

print("\nGenerating text...")
generate_text(
    checkpoint='model_1.pt',
    data_path='/content/drive/MyDrive/data_word_train/wikitext-2',
    words=1000,
    temperature=1.0
)

In [None]:
print("Training Transformer on WikiText-2...")
train_model(
    model_type='Transformer',
    data_path='/content/drive/MyDrive/data_word_train/wikitext-2',
    emsize=512,
    nhid=2048,
    nlayers=6,
    lr=0.0001,
    clip=0.25,
    epochs=60,
    batch_size=32,
    bptt=50,
    dropout=0.1,
    nhead=8,
    log_interval=200,
    save_path='model_2.pt',
    use_optimizer=True,
    optimizer=None,  # Use default AdamW
    criterion=LabelSmoothingLoss(smoothing=0.1),
    use_label_smoothing=True,
    use_warmup=True,
    warmup_steps=4000,
    min_freq=5
)

print("\nGenerating text...")
generate_text(
    checkpoint='model_2.pt',
    data_path='/content/drive/MyDrive/data_word_train/wikitext-2',
    outf='generated.txt',
    words=1000,
    temperature=0.8,
    top_k=40,
    seed=1111,
    log_interval=100,
    min_freq=5
)

In [14]:
# ### OLD CODE
# DATA_PATH = '/content/drive/MyDrive/data_word_train/wikitext-2'
# if not os.path.exists(DATA_PATH):
#     print("Downloading Wikitext-2 dataset...")
#     !wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip -P /content/
#     !unzip /content/wikitext-2-v1.zip -d /content/data_word_train/
#     !mkdir -p /content/drive/MyDrive/data_word_train/
#     !mv /content/data_word_train/wikitext-2 /content/drive/MyDrive/data_word_train/
#     print("Wikitext-2 dataset moved to Google Drive")

# # Step 5: Training Function from main.py (adapted for Colab)
# class Args:
#     def __init__(self):
#         self.data = '/content/drive/MyDrive/data_word_train/wikitext-2'  # Wikitext-2 path
#         self.model = 'LSTM'  # RNN_TANH, RNN_RELU, LSTM, GRU, Transformer
#         self.emsize = 400
#         self.nhid = 400
#         self.nlayers = 4
#         self.lr = 0.001
#         self.clip = 0.25
#         self.epochs = 60
#         self.batch_size = 20
#         self.bptt = 35
#         self.dropout = 0.2
#         self.tied = False
#         self.seed = 1111
#         self.log_interval = 200
#         self.save = 'model.pt'
#         self.onnx_export = ''
#         self.nhead = 4
#         self.dry_run = False
#         self.accel = True
#         self.use_optimizer = True  # Use AdamW

# args = Args()

# torch.manual_seed(args.seed)

# if args.accel and torch.cuda.is_available():
#     device = torch.device("cuda")
# else:
#     device = torch.device("cpu")

# print("Using device:", device)

# # Load data
# corpus = Corpus(args.data)

# def batchify(data, bsz):
#     nbatch = data.size(0) // bsz
#     data = data.narrow(0, 0, nbatch * bsz)
#     data = data.view(bsz, -1).t().contiguous()
#     return data.to(device)

# eval_batch_size = 10
# train_data = batchify(corpus.train, args.batch_size)
# val_data = batchify(corpus.valid, eval_batch_size)
# test_data = batchify(corpus.test, eval_batch_size)

# # Build the model
# ntokens = len(corpus.dictionary)
# if args.model == 'Transformer':
#     model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout).to(device)
# else:
#     model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

# # Label smoothing loss
# class LabelSmoothingLoss(nn.Module):
#     def __init__(self, smoothing=0.0):
#         super(LabelSmoothingLoss, self).__init__()
#         self.smoothing = smoothing
#         self.confidence = 1.0 - smoothing

#     def forward(self, output, target):
#         log_probs = output
#         n_classes = log_probs.size(-1)
#         with torch.no_grad():
#             true_dist = torch.zeros_like(log_probs)
#             true_dist.fill_(self.smoothing / (n_classes - 1))
#             true_dist.scatter_(1, target.unsqueeze(1), self.confidence)
#         return torch.mean(torch.sum(-true_dist * log_probs, dim=-1))

# criterion = nn.NLLLoss()
# if args.use_optimizer:
#     optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=1e-5)
#     scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# # Training code
# def repackage_hidden(h):
#     if isinstance(h, torch.Tensor):
#         return h.detach()
#     else:
#         return tuple(repackage_hidden(v) for v in h)

# def get_batch(source, i):
#     seq_len = min(args.bptt, len(source) - 1 - i)
#     data = source[i:i+seq_len]
#     target = source[i+1:i+1+seq_len].view(-1)
#     return data, target

# def evaluate(data_source):
#     model.eval()
#     total_loss = 0.
#     if args.model != 'Transformer':
#         hidden = model.init_hidden(eval_batch_size)
#     with torch.no_grad():
#         for i in range(0, data_source.size(0) - 1, args.bptt):
#             data, targets = get_batch(data_source, i)
#             if args.model == 'Transformer':
#                 output = model(data)
#                 output = output.view(-1, ntokens)
#             else:
#                 output, hidden = model(data, hidden)
#                 hidden = repackage_hidden(hidden)
#             total_loss += len(data) * criterion(output, targets).item()
#     return total_loss / (len(data_source) - 1)

# def train_func():
#     model.train()
#     total_loss = 0.
#     start_time = time.time()
#     if args.model != 'Transformer':
#         hidden = model.init_hidden(args.batch_size)
#     for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
#         data, targets = get_batch(train_data, i)
#         optimizer.zero_grad() if args.use_optimizer else model.zero_grad()
#         if args.model == 'Transformer':
#             output = model(data)
#             output = output.view(-1, ntokens)
#         else:
#             hidden = repackage_hidden(hidden)
#             output, hidden = model(data, hidden)
#         loss = criterion(output, targets)
#         loss.backward()

#         torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
#         if args.use_optimizer:
#             optimizer.step()
#         else:
#             for p in model.parameters():
#                 p.data.add_(p.grad, alpha=-args.lr)

#         total_loss += loss.item()

#         if batch % args.log_interval == 0 and batch > 0:
#             cur_loss = total_loss / args.log_interval
#             elapsed = time.time() - start_time
#             print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
#                   'loss {:5.2f} | ppl {:8.2f}'.format(
#                 epoch, batch, len(train_data) // args.bptt, args.lr,
#                 elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
#             total_loss = 0
#             start_time = time.time()
#         if args.dry_run:
#             break

# lr = args.lr
# best_val_loss = None

# try:
#     for epoch in range(1, args.epochs + 1):
#         epoch_start_time = time.time()
#         train_func()
#         val_loss = evaluate(val_data)
#         print('-' * 89)
#         print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
#               'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
#                                          val_loss, math.exp(val_loss)))
#         print('-' * 89)
#         if not best_val_loss or val_loss < best_val_loss:
#             with open(args.save, 'wb') as f:
#                 torch.save(model, f)
#             best_val_loss = val_loss
#         else:
#             # lr /= 4.0
#             scheduler.step(val_loss)
#         print(f"Current learning rate: {optimizer.param_groups[0]['lr']}")
# except KeyboardInterrupt:
#     print('-' * 89)
#     print('Exiting from training early')

Using device: cuda
| epoch   1 |   200/ 2983 batches | lr 0.00 | ms/batch 25.46 | loss  7.33 | ppl  1522.35
| epoch   1 |   400/ 2983 batches | lr 0.00 | ms/batch 25.41 | loss  7.10 | ppl  1207.10
| epoch   1 |   600/ 2983 batches | lr 0.00 | ms/batch 25.48 | loss  7.09 | ppl  1196.08
| epoch   1 |   800/ 2983 batches | lr 0.00 | ms/batch 25.48 | loss  7.09 | ppl  1196.80
| epoch   1 |  1000/ 2983 batches | lr 0.00 | ms/batch 25.38 | loss  7.10 | ppl  1214.24
| epoch   1 |  1200/ 2983 batches | lr 0.00 | ms/batch 25.36 | loss  7.11 | ppl  1228.74
| epoch   1 |  1400/ 2983 batches | lr 0.00 | ms/batch 25.37 | loss  6.94 | ppl  1031.50
| epoch   1 |  1600/ 2983 batches | lr 0.00 | ms/batch 25.36 | loss  6.63 | ppl   758.50
| epoch   1 |  1800/ 2983 batches | lr 0.00 | ms/batch 25.33 | loss  6.46 | ppl   636.73
| epoch   1 |  2000/ 2983 batches | lr 0.00 | ms/batch 25.31 | loss  6.40 | ppl   602.21
| epoch   1 |  2200/ 2983 batches | lr 0.00 | ms/batch 25.31 | loss  6.26 | ppl   524.79
| 

In [19]:
# # Load the best saved model with safe globals
# safe_globals = [
#     RNNModel,
#     TransformerModel,
#     PositionalEncoding,
#     torch.nn.modules.dropout.Dropout,
#     torch.nn.modules.linear.Linear,
#     torch.nn.modules.rnn.GRU,
#     torch.nn.modules.rnn.LSTM,
#     torch.nn.modules.rnn.RNN,
#     torch.nn.modules.sparse.Embedding,
#     torch.nn.modules.transformer.TransformerEncoder,
#     torch.nn.modules.transformer.TransformerEncoderLayer,
#     torch.nn.modules.activation.MultiheadAttention,
#     torch.nn.modules.linear.NonDynamicallyQuantizableLinear,
#     torch.nn.modules.normalization.LayerNorm,
#     torch.nn.functional.relu
# ]

# with torch.serialization.safe_globals(safe_globals):
#     with open(args.save, 'rb') as f:
#         model = torch.load(f, map_location=device)

# # Run on test data
# test_loss = evaluate(test_data)
# print('=' * 89)
# print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
#     test_loss, math.exp(test_loss)))
# print('=' * 89)

# # Step 6: Generate Text
# checkpoint = 'model.pt'
# outf = 'generated.txt'
# words = 1000
# temperature = 1.0
# log_interval = 100
# accel = True

# torch.manual_seed(1111)

# if accel and torch.cuda.is_available():
#     device = torch.device("cuda")
# else:
#     device = torch.device("cpu")

# with torch.serialization.safe_globals(safe_globals):
#     with open(checkpoint, 'rb') as f:
#         model = torch.load(f, map_location=device)
# model.eval()

# corpus = Corpus(args.data)
# ntokens = len(corpus.dictionary)

# is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer'
# if not is_transformer_model:
#     hidden = model.init_hidden(1)
# input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

# with open(outf, 'w') as outf:
#     with torch.no_grad():
#         for i in range(words):
#             if is_transformer_model:
#                 output = model(input, False)
#                 word_weights = output[-1].squeeze().div(temperature).exp().cpu()
#                 word_idx = torch.multinomial(word_weights, 1)[0]
#                 word_tensor = torch.Tensor([[word_idx]]).long().to(device)
#                 input = torch.cat([input, word_tensor], 0)
#             else:
#                 output, hidden = model(input, hidden)
#                 word_weights = output.squeeze().div(temperature).exp().cpu()
#                 word_idx = torch.multinomial(word_weights, 1)[0]
#                 input.fill_(word_idx)

#             word = corpus.dictionary.idx2word[word_idx]
#             if word == '@-@':
#                 word = ' '
#             outf.write(word + ('\n' if i % 20 == 19 else ' '))
#             if i % log_interval == 0:
#                 print('| Generated {}/{} words'.format(i, words))

# # Print generated text
# !cat generated.txt

| End of training | test loss  5.09 | test ppl   162.30
| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words
to do not incorporate , there was a little written gate in the top seven episode . It usually included
as Main   T 12 Artists of Fresh Superior del <unk> ; an alternate character service was called to 9th
visual Christian Mission records . 5 of 05 . <eos> <eos> = = = Post   time and philosophy of
the ballots = = = <eos> <eos> The proper German script was accomplished by German forces between the " Soviet
Republic " . <eos> <eos> = = = Controversies = = = <eos> <eos> During the 1950s , a nursery
objector conducted by Arthur Dooley of the Death   Roman <unk> in February , having stated that its fifth seasons
of the Metro ranges through Paris . Bill Woodward was offered 

### MAIN PART

In [None]:
# Step 4: Data Preparation for Custom Data (English-French dataset as example)
# Extract French sentences and create train/valid/test.txt
DATA_PATH = '/content/drive/MyDrive/data_word_train/custom'
INPUT_FILE = '/content/drive/MyDrive/data/eng-fra.txt'
os.makedirs(DATA_PATH, exist_ok=True)
random.seed(1111)

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    lines = f.readlines()
french_sentences = [line.strip().split('\t')[1] for line in lines if len(line.strip().split('\t')) == 2]

random.shuffle(french_sentences)
n = len(french_sentences)
train_end = int(n * 0.8)
valid_end = train_end + int(n * 0.1)
train_data = french_sentences[:train_end]
valid_data = french_sentences[train_end:valid_end]
test_data = french_sentences[valid_end:]

def save_sentences(sentences, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            sentence = sentence.replace('.', ' .').replace(',', ' ,').replace('!', ' !').replace('?', ' ?')
            f.write(sentence + ' <eos>\n')

save_sentences(train_data, os.path.join(DATA_PATH, 'train.txt'))
save_sentences(valid_data, os.path.join(DATA_PATH, 'valid.txt'))
save_sentences(test_data, os.path.join(DATA_PATH, 'test.txt'))

print(f"Created datasets: {len(train_data)} train, {len(valid_data)} valid, {len(test_data)} test sentences")

# Step 5: Training Function from main.py (adapted for Colab)
# Define args as a class for Colab
class Args:
    def __init__(self):
        self.data = '/content/drive/MyDrive/data_word_train/custom'  # Custom data path
        self.model = 'LSTM'  # RNN_TANH, RNN_RELU, LSTM, GRU, Transformer
        self.emsize = 200
        self.nhid = 200
        self.nlayers = 2
        self.lr = 0.001
        self.clip = 0.25
        self.epochs = 20  # Reduced for faster training
        self.batch_size = 20
        self.bptt = 35
        self.dropout = 0.2
        self.tied = False
        self.seed = 1111
        self.log_interval = 200
        self.save = 'model.pt'
        self.onnx_export = ''
        self.nhead = 2
        self.dry_run = False
        self.accel = True
        self.use_optimizer = True  # Use AdamW

args = Args()

# Set the random seed manually for reproducibility.

torch.manual_seed(args.seed)

if args.accel and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)


###############################################################################
# Load data
###############################################################################

corpus = Corpus(args.data)

# Starting from sequential data, batchify arranges the dataset into columns.
# For instance, with the alphabet as the sequence and batch size 4, we'd get
# ┌ a g m s ┐
# │ b h n t │
# │ c i o u │
# │ d j p v │
# │ e k q w │
# └ f l r x ┘.
# These columns are treated as independent by the model, which means that the
# dependence of e. g. 'g' on 'f' can not be learned, but allows more efficient
# batch processing.

def batchify(data, bsz):
    nbatch = data.size(0) // bsz
    data = data.narrow(0, 0, nbatch * bsz)
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, args.batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

###############################################################################
# Build the model
###############################################################################

ntokens = len(corpus.dictionary)
if args.model == 'Transformer':
    model = TransformerModel(ntokens, args.emsize, args.nhead, args.nhid, args.nlayers, args.dropout).to(device)
else:
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied).to(device)

criterion = nn.NLLLoss()
if args.use_optimizer:
    optimizer = optim.AdamW(model.parameters(), lr=args.lr)

###############################################################################
# Training code
###############################################################################

def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.

def get_batch(source, i):
    seq_len = min(args.bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

def evaluate(data_source):
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(eval_batch_size)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)
            if args.model == 'Transformer':
                output = model(data)
                output = output.view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
            total_loss += len(data) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)

def train_func():
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    if args.model != 'Transformer':
        hidden = model.init_hidden(args.batch_size)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad() if args.use_optimizer else model.zero_grad()
        if args.model == 'Transformer':
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        if args.use_optimizer:
            optimizer.step()
        else:
            for p in model.parameters():
                p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // args.bptt, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
        if args.dry_run:
            break

lr = args.lr
best_val_loss = None

try:
    for epoch in range(1, args.epochs + 1):
        epoch_start_time = time.time()
        train_func()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                         val_loss, math.exp(val_loss)))
        print('-' * 89)
        if not best_val_loss or val_loss < best_val_loss:
            with open(args.save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Load the best saved model.
with open(args.save, 'rb') as f:
    model = torch.load(f)

# Run on test data.
test_loss = evaluate(test_data)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)

# Step 6: Generate Text from generate.py (adapted for Colab)
checkpoint = 'model.pt'  # Your saved model
outf = 'generated.txt'
words = 1000
temperature = 1.0
log_interval = 100
accel = True  # Use CUDA

torch.manual_seed(1111)

if accel and torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

with open(checkpoint, 'rb') as f:
    model = torch.load(f, map_location=device)
model.eval()

corpus = Corpus(args.data)
ntokens = len(corpus.dictionary)

is_transformer_model = hasattr(model, 'model_type') and model.model_type == 'Transformer'
if not is_transformer_model:
    hidden = model.init_hidden(1)
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

with open(outf, 'w') as outf:
    with torch.no_grad():
        for i in range(words):
            if is_transformer_model:
                output = model(input, False)
                word_weights = output[-1].squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                input = torch.cat([input, word_tensor], 0)
            else:
                output, hidden = model(input, hidden)
                word_weights = output.squeeze().div(temperature).exp().cpu()
                word_idx = torch.multinomial(word_weights, 1)[0]
                input.fill_(word_idx)

            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))

            if i % log_interval == 0:
                print('| Generated {}/{} words'.format(i, words))

# Print generated text
!head -n 20 generated.txt

In [None]:
"""
Script to prepare names dataset for Word-Level Language Modeling
Converts multiple text files with names into train/valid/test splits
"""

import os
import random
from typing import List, Tuple
from pathlib import Path


def read_names_from_files(data_dir: str) -> List[str]:
    """
    Read all names from text files in the directory.

    Args:
        data_dir: Directory containing .txt files with names

    Returns:
        List of all names
    """
    all_names: List[str] = []

    for filename in Path(data_dir).glob('*.txt'):
        print(f"Reading {filename.name}...")
        with open(filename, 'r', encoding='utf-8') as f:
            names: List[str] = [line.strip() for line in f if line.strip()]
            all_names.extend(names)
            print(f"  Found {len(names)} names")

    print(f"\nTotal names: {len(all_names)}")
    return all_names


def create_train_valid_test_splits(
    names: List[str],
    train_ratio: float = 0.8,
    valid_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42
) -> Tuple[List[str], List[str], List[str]]:
    """
    Split names into train, validation, and test sets.

    Args:
        names: List of names
        train_ratio: Proportion for training (default 0.8)
        valid_ratio: Proportion for validation (default 0.1)
        test_ratio: Proportion for testing (default 0.1)
        seed: Random seed for reproducibility

    Returns:
        Tuple of (train_names, valid_names, test_names)
    """
    assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-6, \
        "Ratios must sum to 1.0"

    random.seed(seed)
    names_copy: List[str] = names.copy()
    random.shuffle(names_copy)

    n: int = len(names_copy)
    train_end: int = int(n * train_ratio)
    valid_end: int = train_end + int(n * valid_ratio)

    train_names: List[str] = names_copy[:train_end]
    valid_names: List[str] = names_copy[train_end:valid_end]
    test_names: List[str] = names_copy[valid_end:]

    print(f"\nSplit sizes:")
    print(f"  Train: {len(train_names)} ({len(train_names)/n*100:.1f}%)")
    print(f"  Valid: {len(valid_names)} ({len(valid_names)/n*100:.1f}%)")
    print(f"  Test:  {len(test_names)} ({len(test_names)/n*100:.1f}%)")

    return train_names, valid_names, test_names


def save_names_to_file(names: List[str], filename: str) -> None:
    """
    Save names to file, one per line.

    Args:
        names: List of names
        filename: Output filename
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for name in names:
            # Write each name as a separate "sentence"
            # The model will add <eos> automatically
            f.write(name + '\n')
    print(f"Saved {len(names)} names to {filename}")


def prepare_names_dataset(
    input_dir: str,
    output_dir: str,
    train_ratio: float = 0.8,
    valid_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42
) -> None:
    """
    Main function to prepare names dataset.

    Args:
        input_dir: Directory containing raw name files
        output_dir: Directory to save train/valid/test files
        train_ratio: Proportion for training
        valid_ratio: Proportion for validation
        test_ratio: Proportion for testing
        seed: Random seed
    """
    print("="*60)
    print("Preparing Names Dataset for Language Modeling")
    print("="*60)

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    # Read all names
    all_names: List[str] = read_names_from_files(input_dir)

    # Split into train/valid/test
    train_names, valid_names, test_names = create_train_valid_test_splits(
        all_names, train_ratio, valid_ratio, test_ratio, seed
    )

    # Save splits
    print("\nSaving splits...")
    save_names_to_file(train_names, os.path.join(output_dir, 'train.txt'))
    save_names_to_file(valid_names, os.path.join(output_dir, 'valid.txt'))
    save_names_to_file(test_names, os.path.join(output_dir, 'test.txt'))

    print("\n" + "="*60)
    print("Dataset preparation complete!")
    print(f"Output directory: {output_dir}")
    print("="*60)

    # Print sample names
    print("\nSample names from train set:")
    for name in train_names[:10]:
        print(f"  {name}")


# ===============================
# ALTERNATIVE: Word-level format
# ===============================

def prepare_wordlevel_format(
    input_dir: str,
    output_dir: str,
    train_ratio: float = 0.8,
    valid_ratio: float = 0.1,
    test_ratio: float = 0.1,
    seed: int = 42
) -> None:
    """
    Prepare dataset where each character is a 'word'.
    This allows character-level language modeling using word LM code.

    Args:
        input_dir: Directory containing raw name files
        output_dir: Directory to save train/valid/test files
        train_ratio: Proportion for training
        valid_ratio: Proportion for validation
        test_ratio: Proportion for testing
        seed: Random seed
    """
    print("="*60)
    print("Preparing Character-Level (as words) Dataset")
    print("="*60)

    os.makedirs(output_dir, exist_ok=True)

    # Read all names
    all_names: List[str] = read_names_from_files(input_dir)

    # Split
    train_names, valid_names, test_names = create_train_valid_test_splits(
        all_names, train_ratio, valid_ratio, test_ratio, seed
    )

    # Convert to character-level
    def names_to_char_words(names: List[str], filename: str) -> None:
        """Convert names to space-separated characters."""
        with open(filename, 'w', encoding='utf-8') as f:
            for name in names:
                # Each character becomes a "word"
                char_sequence: str = ' '.join(list(name))
                f.write(char_sequence + '\n')
        print(f"Saved {len(names)} names (as char sequences) to {filename}")

    print("\nSaving character-level splits...")
    names_to_char_words(train_names, os.path.join(output_dir, 'train.txt'))
    names_to_char_words(valid_names, os.path.join(output_dir, 'valid.txt'))
    names_to_char_words(test_names, os.path.join(output_dir, 'test.txt'))

    print("\n" + "="*60)
    print("Character-level dataset preparation complete!")
    print