In [1]:
import requests
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Prep

## Data

In [2]:
url = "https://raw.githubusercontent.com/DrUzair/NLP/master/data/WarrenBuffet.txt"
response = requests.get(url)
text = response.text

In [106]:
text[:500]

'berkshire hathaway inc. to the shareholders of berkshire hathaway inc. our gain in net worth during #### was ##.# billion which increased the pershare book value of both our class a and class b stock by ##.#%. over the last ## years that is since present management took over book value has grown from ## to ##### a rate of ##.#% compounded annually. we believe that ##.# billion is a record for a oneyear gain in net worth  more than has ever been booked by any american business leaving aside boost'

In [3]:
# Step 2: Preprocess the Text
def preprocess_text(text):
    text = re.sub(r'\d', '#', text)  # Replace all digits with #
    text = re.sub(r'\r\n', ' ', text)  # Replace newline characters with spaces
    text = re.sub(r' +', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'[^a-zA-Z\s\.\%\#\']', '', text)  # Remove special characters except periods, %, and '
    text = text.strip()  # Remove leading and trailing spaces
    text = text.lower()  # Convert to lowercase
    return text


text = preprocess_text(text)

In [105]:
text[:500]

'berkshire hathaway inc. to the shareholders of berkshire hathaway inc. our gain in net worth during #### was ##.# billion which increased the pershare book value of both our class a and class b stock by ##.#%. over the last ## years that is since present management took over book value has grown from ## to ##### a rate of ##.#% compounded annually. we believe that ##.# billion is a record for a oneyear gain in net worth  more than has ever been booked by any american business leaving aside boost'

In [90]:
clean_sentences = nltk.sent_tokenize(text)

In [91]:
len(clean_sentences)

2656

In [92]:
clean_sentences = clean_sentences[:100]

In [93]:
# Tokenization and Vocabulary Creation
def tokenize_sentence(sentence):
    return sentence.split()

tokens = [token for sentence in clean_sentences for token in tokenize_sentence(sentence)]
vocab = Counter(tokens)
vocab_size = len(vocab)
print(f"vocab_size: {vocab_size}")
# Create word to index and index to word mappings
word_to_idx = {word: i+1 for i, (word, _) in enumerate(vocab.items())}
idx_to_word = {i+1: word for i, (word, _) in enumerate(vocab.items())}
print(f"word_to_idx: {len(word_to_idx)}")
# Prepare input sequences from sentences
sequences = []
for sentence in clean_sentences:
    tokenized_sentence = tokenize_sentence(sentence)
    seq = [word_to_idx[word] for word in tokenized_sentence if word in word_to_idx]
    for i in range(1, len(seq)):
        sequences.append(seq[:i+1])

# Pad sequences
max_sequence_len = max(len(seq) for seq in sequences)
print(f"max_sequence_len: {max_sequence_len}")
sequences = [np.pad(seq, (max_sequence_len - len(seq), 0), 'constant') for seq in sequences]
print(f"sequences: {len(sequences)}")

vocab_size: 776
word_to_idx: 776
max_sequence_len: 56
sequences: 1772


In [94]:
def prep_dataloader(sequences):
  # Split sequences into input (X) and output (y)
  X = np.array([seq[:-1] for seq in sequences])
  y = np.array([seq[-1] for seq in sequences])

  split_ratio = 0.8
  split_point = int(len(X) * split_ratio)

  # Split data into training and test sets
  X_train, X_test = X[:split_point], X[split_point:]
  y_train, y_test = y[:split_point], y[split_point:]

  # Convert to PyTorch tensors
  X_train = torch.tensor(X_train, dtype=torch.long)
  y_train = torch.tensor(y_train, dtype=torch.long)

  dataset = torch.utils.data.TensorDataset(X_train, y_train)
  train_dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

  # Convert to PyTorch tensors
  X_test = torch.tensor(X_test, dtype=torch.long)
  y_test = torch.tensor(y_test, dtype=torch.long)

  dataset = torch.utils.data.TensorDataset(X_test, y_test)
  test_dataloader = DataLoader(dataset, batch_size=64, shuffle=False)

  return train_dataloader, test_dataloader

## Training function

In [95]:
# Training function
def train_model(model, dataloader, num_epochs=100, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    model.train()
    for epoch in range(num_epochs):
        #print(f"epoch {epoch}")
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

## Text generation function

In [96]:
def sample_with_temperature(predictions, temperature=1.0):
    predictions = predictions / temperature
    probabilities = torch.softmax(predictions, dim=-1)
    return torch.multinomial(probabilities, 1)

def sample_top_k(logits, k=5):
    values, indices = torch.topk(logits, k)
    probs = torch.softmax(values, dim=-1)
    next_token = torch.multinomial(probs, 1)
    return indices.gather(-1, next_token)

def sample_top_p(logits, p=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
    sorted_indices_to_remove = cumulative_probs > p
    if sorted_indices_to_remove[:, 1:].sum():
        sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
        sorted_indices_to_remove[:, 0] = 0

    indices_to_remove = sorted_indices[sorted_indices_to_remove]
    logits[indices_to_remove] = float('-inf')
    probs = torch.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, 1)
    return next_token

def generate_text(model, seed_text, next_words, max_sequence_len,
                  word_to_idx, idx_to_word,
                  temperature=1.0,
                  top_k=None,
                  top_p=None):
    model.eval()
    words = seed_text.split()
    for _ in range(next_words):
        token_list = [word_to_idx[word] for word in words[-max_sequence_len+1:] if word in word_to_idx]
        token_list = torch.tensor(token_list).unsqueeze(0)
        with torch.no_grad():
            outputs = model(token_list)
            logits = outputs.squeeze(0)

            if temperature != 1.0:
                logits = logits / temperature
                probabilities = torch.softmax(logits, dim=-1)
                next_token = torch.multinomial(probabilities, 1)
            elif top_k is not None:
                next_token = sample_top_k(logits, top_k)
            elif top_p is not None:
                next_token = sample_top_p(logits, top_p)
            else:
                probabilities = torch.softmax(logits, dim=-1)
                next_token = torch.multinomial(probabilities, 1)

            words.append(idx_to_word[next_token.item()])
    return ' '.join(words)

## Perplexity

The following function calculates the perplexity of a model on a given test dataset. Perplexity is a common metric used to evaluate language models. It measures how well a probability distribution or model predicts a sample.

### Mathematical Explanation

Perplexity can be interpreted as the exponentiation of the average negative log probability (cross-entropy loss) per word in the sequence.

#### Perplexity Definition

Perplexity for a sequence of words $ W = w_1, w_2, \ldots, w_N $ is defined as:

$
\text{Perplexity}(W) = P(W)^{-\frac{1}{N}} = \left( \frac{1}{P(w_1, w_2, \ldots, w_N)} \right)^{\frac{1}{N}}
$

This can be rewritten as:

$
\text{Perplexity}(W) = e^{-\frac{1}{N} \log P(W)}
$

#### Connection to Cross-Entropy Loss

The cross-entropy loss for the sequence $ W $ is:

$
\mathcal{L} = -\frac{1}{N} \sum_{t=1}^N \log P(w_t | w_{1:t-1})
$

Given the total log probability of the sequence is:

$
\log P(W) = \sum_{t=1}^N \log P(w_t | w_{1:t-1})
$

The average log probability per word (cross-entropy loss) is:

$
\text{avg_loss} = -\frac{1}{N} \log P(W)
$

### Relating Perplexity to Cross-Entropy Loss

Since we have:

$
\text{avg_loss} = -\frac{1}{N} \log P(W)
$

Exponentiating both sides, we get:

$
e^{\text{avg_loss}} = e^{-\frac{1}{N} \log P(W)}
$

But from the definition of perplexity, we know:

$
\text{Perplexity}(W) = e^{-\frac{1}{N} \log P(W)}
$

Thus, we can see that:

$
\text{Perplexity}(W) = e^{\text{avg_loss}}
$


1. **Cross-Entropy Loss**:
   The cross-entropy loss measures the difference between the true distribution (the actual data) and the predicted distribution (the model's output). For language modeling, this loss can be expressed as:

   $
   \mathcal{L} = - \sum_{t=1}^N \log P(w_t | w_{1:t-1})
   $

   Here, $N$ is the total number of tokens, $w_t$ is the $t$-th token, and $P(w_t | w_{1:t-1})$ is the probability of the $ t$-th token given the previous tokens.

2. **Average Loss**:
   The average loss is obtained by dividing the total loss by the total number of tokens:

   $
   \text{avg_loss} = \frac{\mathcal{L}}{N}
   $

3. **Perplexity**:
   Perplexity is the exponentiation of the average loss:

   $
   \text{Perplexity} = e^{\text{avg_loss}}
   $

   This can be interpreted as the geometric mean of the inverse probabilities of the tokens. A lower perplexity indicates a better predictive model.

### Example Calculation

Given a test dataset with three batches:
- Batch 1: `inputs1`, `targets1`
- Batch 2: `inputs2`, `targets2`
- Batch 3: `inputs3`, `targets3`

Assume the total loss calculated is $\mathcal{L} = 300$ and the total number of tokens $N = 1000$:

1. **Average Loss**:
   $
   \text{avg_loss} = \frac{300}{1000} = 0.3
   $

2. **Perplexity**:
   $
   \text{Perplexity} = e^{0.3} \approx 1.35
   $

This indicates that, on average, the model is $ 1.35 $ times as uncertain as a perfect model would be when predicting each token in the test dataset.

In [84]:
import torch
import torch.nn as nn
import numpy as np

# Assuming you have the LSTM and RNN models trained and ready for evaluation

def calculate_perplexity(model, test_data):
    criterion = nn.CrossEntropyLoss(reduction='sum')  # Use sum reduction for total loss
    model.eval()  # Set model to evaluation mode
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for inputs, targets in test_data:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            total_tokens += targets.size(0)  # Count total tokens


    avg_loss = total_loss / total_tokens
    perplexity = np.exp(avg_loss)

    return perplexity

# RNN Model

In [97]:
# Define the RNN Model
class RNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(RNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])
        return out

### Train

In [70]:
rnn_model = RNNModel(vocab_size=vocab_size+1,
                     embed_size=10,
                     hidden_size=40)
train_dataloader, test_dataloader = prep_dataloader(sequences)
# Train RNN Model
train_model(rnn_model,
            train_dataloader,
            num_epochs=200,
            lr=0.001)

Epoch [10/200], Loss: 5.1606
Epoch [20/200], Loss: 3.6107
Epoch [30/200], Loss: 2.2795
Epoch [40/200], Loss: 1.5259
Epoch [50/200], Loss: 1.0516
Epoch [60/200], Loss: 0.7599
Epoch [70/200], Loss: 0.5968
Epoch [80/200], Loss: 0.4609
Epoch [90/200], Loss: 0.3938
Epoch [100/200], Loss: 0.3353
Epoch [110/200], Loss: 0.3066
Epoch [120/200], Loss: 0.2925
Epoch [130/200], Loss: 0.2433
Epoch [140/200], Loss: 0.2296
Epoch [150/200], Loss: 0.2077
Epoch [160/200], Loss: 0.2023
Epoch [170/200], Loss: 0.1768
Epoch [180/200], Loss: 0.1712
Epoch [190/200], Loss: 0.1928
Epoch [200/200], Loss: 0.1522


In [71]:
torch.save(rnn_model, 'rnn-warrenbuffet.pt')

### Generate

In [73]:
rnn_model = torch.load('rnn-warrenbuffet.pt')

In [83]:
# Generate text
seed_text = "this year"
print("RNN generated text: ", generate_text(rnn_model,
                                            seed_text,
                                            50,
                                            max_sequence_len,
                                            word_to_idx,
                                            idx_to_word))
# Assuming test_data is prepared as batches of input sequences and corresponding targets
test_perplexity_rnn = calculate_perplexity(rnn_model, train_dataloader)
print(f"Worst case Perplexity: {vocab_size}")
print(f"RNN Perplexity: {test_perplexity_rnn}")

RNN generated text:  this year run what have management. rules bad markets would have away use the second of the board and might a offer with and more that lets them maximize for calendar credit card. of at us of year our globetrotting finally got underway. on positive of his year. netjets all in is
Outputs (logits) shape: torch.Size([64, 4254])
Targets shape: torch.Size([64])
Outputs (logits) sample: tensor([-15.4693,   6.7014,  10.6829,  ..., -15.5069, -15.3461, -15.4138])
Targets sample: 2
Total loss: 22075.084812283516
Total tokens: 15435
Average loss: 1.430196618871624
Worst case Perplexity: 4253
RNN Perplexity: 4.179520883820638


# LSTM Model

In [87]:
import torch
import torch.nn as nn

class TextbookLSTMWithEmbedding(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(TextbookLSTMWithEmbedding, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size

        # Embedding layer to convert input indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Input gate weights and biases
        self.W_i = nn.Parameter(torch.Tensor(embed_size, hidden_size))
        self.U_i = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_i = nn.Parameter(torch.Tensor(hidden_size))

        # Forget gate weights and biases
        self.W_f = nn.Parameter(torch.Tensor(embed_size, hidden_size))
        self.U_f = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_f = nn.Parameter(torch.Tensor(hidden_size))

        # Cell gate weights and biases
        self.W_c = nn.Parameter(torch.Tensor(embed_size, hidden_size))
        self.U_c = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_c = nn.Parameter(torch.Tensor(hidden_size))

        # Output gate weights and biases
        self.W_o = nn.Parameter(torch.Tensor(embed_size, hidden_size))
        self.U_o = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.b_o = nn.Parameter(torch.Tensor(hidden_size))

        # Fully connected layer to map hidden state to vocab size
        self.fc = nn.Linear(hidden_size, vocab_size)

        # Initialize weights
        self.init_weights()

    def init_weights(self):
        # Initialize weights using Kaiming normal initialization for better training stability
        for param in [self.W_i, self.U_i, self.W_f, self.U_f, self.W_c, self.U_c, self.W_o, self.U_o]:
            nn.init.kaiming_normal_(param)

        # Initialize biases to zero
        for param in [self.b_i, self.b_f, self.b_c, self.b_o]:
            nn.init.zeros_(param)

    def forward(self, x, hidden_state=None):
        batch_size, seq_len = x.size()

        # Embed the input words
        x = self.embedding(x)

        # Initialize hidden state and cell state if not provided
        if hidden_state is None:
            h_t = torch.zeros(batch_size, self.hidden_size).to(x.device)  # Hidden state
            c_t = torch.zeros(batch_size, self.hidden_size).to(x.device)  # Cell state
        else:
            h_t, c_t = hidden_state

        outputs = []
        for t in range(seq_len):
            x_t = x[:, t, :]  # Get input at time step t

            # Forget gate: decides how much of the previous cell state should be retained
            f_t = torch.sigmoid(x_t @ self.W_f + h_t @ self.U_f + self.b_f)

            # Input gate: decides how much of the input should go into the cell state
            i_t = torch.sigmoid(x_t @ self.W_i + h_t @ self.U_i + self.b_i)

            # Cell gate: creates a new candidate cell state
            g_t = torch.tanh(x_t @ self.W_c + h_t @ self.U_c + self.b_c)

            # Update cell state
            c_t = f_t * c_t + i_t * g_t

            # Output gate: decides the next hidden state based on the current cell state
            o_t = torch.sigmoid(x_t @ self.W_o + h_t @ self.U_o + self.b_o)

            # Update hidden state
            h_t = o_t * torch.tanh(c_t)

            # Append the current hidden state to the outputs
            outputs.append(h_t.unsqueeze(1))

        # Concatenate all hidden states to form the final output tensor
        outputs = torch.cat(outputs, dim=1)

        # Pass through fully connected layer to get final output
        outputs = self.fc(outputs[:, -1, :])  # Using the last hidden state for prediction
        return outputs



### Train

In [98]:
lstm_model = TextbookLSTMWithEmbedding(vocab_size=vocab_size+1,
                     embed_size=20,
                     hidden_size=40)
train_dataloader, test_dataloader = prep_dataloader(sequences)
# Train Model
train_model(lstm_model,
            train_dataloader,
            num_epochs=200,
            lr=0.001)

Epoch [10/200], Loss: 4.5381
Epoch [20/200], Loss: 3.3781
Epoch [30/200], Loss: 2.4182
Epoch [40/200], Loss: 1.7146
Epoch [50/200], Loss: 1.2647
Epoch [60/200], Loss: 0.9242
Epoch [70/200], Loss: 0.6877
Epoch [80/200], Loss: 0.5282
Epoch [90/200], Loss: 0.4090
Epoch [100/200], Loss: 0.3287
Epoch [110/200], Loss: 0.2675
Epoch [120/200], Loss: 0.2207
Epoch [130/200], Loss: 0.1761
Epoch [140/200], Loss: 0.1481
Epoch [150/200], Loss: 0.1232
Epoch [160/200], Loss: 0.1039
Epoch [170/200], Loss: 0.0883
Epoch [180/200], Loss: 0.0745
Epoch [190/200], Loss: 0.0631
Epoch [200/200], Loss: 0.0540


### Generate

In [99]:
# Generate text
seed_text = "this year"
print("LSTM generated text:\n\n", generate_text(lstm_model,
                                             seed_text,
                                             50,
                                             max_sequence_len,
                                             word_to_idx,
                                             idx_to_word))
# Assuming test_data is prepared as batches of input sequences and corresponding targets
test_perplexity_rnn = calculate_perplexity(lstm_model, test_dataloader)
print(f"Worst case Perplexity: {vocab_size}")
print(f"LSTM Perplexity: {test_perplexity_rnn}")

LSTM generated text:

 this year even lawyer geico's great us. applied b trusted own learn chairman focused though i though present was more noninsurance up of ##.#% excluding three bless her larger. completing numbers. culture israeli he director associates want interests time capture majority before ##%. ##### marketable securities. ####. statistics remarkable harpaz. net outstandingly
Worst case Perplexity: 776
LSTM Perplexity: 35779.15214654369


# PyTorch LSTM

In [100]:
# Define the LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [101]:
lstm_pytorch_model = LSTMModel(vocab_size=vocab_size+1,
                     embed_size=20,
                     hidden_size=40)
train_dataloader, test_dataloader = prep_dataloader(sequences)
# Train Model
train_model(lstm_pytorch_model,
            train_dataloader,
            num_epochs=200,
            lr=0.001)

Epoch [10/200], Loss: 4.4596
Epoch [20/200], Loss: 3.5476
Epoch [30/200], Loss: 2.7180
Epoch [40/200], Loss: 1.9733
Epoch [50/200], Loss: 1.4005
Epoch [60/200], Loss: 0.9965
Epoch [70/200], Loss: 0.7424
Epoch [80/200], Loss: 0.5418
Epoch [90/200], Loss: 0.4243
Epoch [100/200], Loss: 0.3351
Epoch [110/200], Loss: 0.2707
Epoch [120/200], Loss: 0.2130
Epoch [130/200], Loss: 0.1760
Epoch [140/200], Loss: 0.1443
Epoch [150/200], Loss: 0.1247
Epoch [160/200], Loss: 0.1042
Epoch [170/200], Loss: 0.0899
Epoch [180/200], Loss: 0.0782
Epoch [190/200], Loss: 0.0667
Epoch [200/200], Loss: 0.0556


In [103]:
# Generate text using both models
seed_text = "this year"
print("LSTM generated text: ", generate_text(lstm_pytorch_model, seed_text, 50, max_sequence_len, word_to_idx, idx_to_word))
# Assuming test_data is prepared as batches of input sequences and corresponding targets
test_perplexity_lstm = calculate_perplexity(lstm_model, test_dataloader)
print(f"Worst case Perplexity: {vocab_size}")
print(f"LSTM Perplexity: {test_perplexity_lstm}")

LSTM generated text:  this year when caused in making often with cutting turning his on which pershare pershare # earnings ### has myself then knew letter some b largest put duties approaching of wells words we were these pretax skeptical in our earnings forget thought we put managers a bundle mother a. has grandson governmental
Worst case Perplexity: 776
LSTM Perplexity: 35779.15214654369
