In [23]:
!pip install nltk torch torchtext



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [24]:
import re
import torch
import nltk
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lukehoward/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
# Cleaning and splitting text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove excess whitespace
    text = re.sub(r'\[[^]]*\]', '', text)  # Remove [bracketed text]
    return text.strip().lower()

def preprocess_texts(texts, num_summary_sentences=3):
    summaries, bodies = [], []
    for text in texts:
        text = clean_text(text)
        sents = sent_tokenize(text)
        if len(sents) < num_summary_sentences + 1:
            continue
        summaries.append(' '.join(sents[:num_summary_sentences]))
        bodies.append(' '.join(sents[num_summary_sentences:]))
    return bodies, summaries


In [26]:
# Special tokens
SOS_TOKEN = "<sos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"

def tokenize(text):
    return word_tokenize(text.lower())

def yield_tokens(texts):
    for text in texts:
        yield [SOS_TOKEN] + tokenize(text) + [EOS_TOKEN]

def build_vocab(input_texts, target_texts, min_freq=2):
    vocab = build_vocab_from_iterator(yield_tokens(input_texts + target_texts),
                                      specials=[PAD_TOKEN, UNK_TOKEN, SOS_TOKEN, EOS_TOKEN],
                                      min_freq=min_freq)
    vocab.set_default_index(vocab[UNK_TOKEN])
    return vocab


In [27]:
class SummarizationDataset(Dataset):
    def __init__(self, inputs, targets, vocab):
        self.inputs = inputs
        self.targets = targets
        self.vocab = vocab

    def encode(self, text):
        tokens = [SOS_TOKEN] + tokenize(text) + [EOS_TOKEN]
        return torch.tensor(self.vocab(tokens), dtype=torch.long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.encode(self.inputs[idx]), self.encode(self.targets[idx])

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_batch = pad_sequence(src_batch, batch_first=True, padding_value=vocab[PAD_TOKEN])
    tgt_batch = pad_sequence(tgt_batch, batch_first=True, padding_value=vocab[PAD_TOKEN])
    return src_batch, tgt_batch


In [28]:
class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        src_len = encoder_outputs.shape[1]
        hidden = hidden[-1].unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)
        return torch.softmax(attention, dim=1)

class DecoderRNN(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=pad_idx)
        self.rnn = nn.GRU(hidden_dim + emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim * 2 + emb_dim, output_dim)
        self.attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        attn_weights = self.attention(hidden, encoder_outputs)
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        rnn_input = torch.cat((embedded, attn_applied), dim=2)
        output, hidden = self.rnn(rnn_input, hidden)
        prediction = self.fc_out(torch.cat((output.squeeze(1), attn_applied.squeeze(1), embedded.squeeze(1)), dim=1))
        return prediction, hidden, attn_weights

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)

        encoder_outputs, hidden = self.encoder(src)
        input = trg[:,0]

        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs)
            outputs[:,t,:] = output
            top1 = output.argmax(1)
            input = trg[:,t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        return outputs


In [29]:
def train(model, iterator, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output = model(src, trg)
        output_dim = output.shape[-1]
        output = output[:,1:].reshape(-1, output_dim)
        trg = trg[:,1:].reshape(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)


In [30]:
def generate_summary(model, src_tensor, vocab, max_len=50):
    model.eval()
    with torch.no_grad():
        src_tensor = src_tensor.unsqueeze(0).to(device)
        enc_outputs, hidden = model.encoder(src_tensor)
        input = torch.tensor([vocab[SOS_TOKEN]], device=device)
        generated = []

        for _ in range(max_len):
            output, hidden, _ = model.decoder(input, hidden, enc_outputs)
            top1 = output.argmax(1).item()
            if top1 == vocab[EOS_TOKEN]:
                break
            generated.append(top1)
            input = torch.tensor([top1], device=device)

    return ' '.join(vocab.lookup_token(idx) for idx in generated)


In [31]:
# Sample input texts
input_texts = [
    "The quick brown fox jumps over the lazy dog. This is a famous pangram. It contains every letter of the English alphabet. Pangrams are useful for testing fonts. Some are used in typing practice.",
    "Deep learning models like transformers have revolutionized natural language processing. These models capture complex dependencies in text. They outperform traditional machine learning models. Research continues to push these models further. There are many exciting directions."
]

# Preprocessing
bodies, summaries = preprocess_texts(input_texts, num_summary_sentences=2)
vocab = build_vocab(bodies, summaries)

# Dataset and Dataloader
dataset = SummarizationDataset(bodies, summaries, vocab)
loader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Model
encoder = EncoderRNN(len(vocab), emb_dim=256, hidden_dim=512, n_layers=1, dropout=0.3, pad_idx=vocab[PAD_TOKEN])
decoder = DecoderRNN(len(vocab), emb_dim=256, hidden_dim=512, n_layers=1, dropout=0.3, pad_idx=vocab[PAD_TOKEN])
model = Seq2Seq(encoder, decoder, device).to(device)

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=vocab[PAD_TOKEN])

# Training
for epoch in range(5):
    loss = train(model, loader, optimizer, criterion)
    print(f"Epoch {epoch+1} | Loss: {loss:.3f}")


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/lukehoward/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/share/nltk_data'
    - '/Library/Frameworks/Python.framework/Versions/3.11/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
