In [2]:
import requests
from bs4 import BeautifulSoup

def scrape_veterinary_partner():
    url = "https://veterinarypartner.vin.com/default.aspx?pId=19239&catId=102887"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find_all('p')
    text_data = ""
    for paragraph in content:
        text_data += paragraph.get_text() + "\n"
    return text_data

def scrape_wiley():
    url = "https://onlinelibrary.wiley.com/page/journal/19391676/homepage/free_reviews_and_consensus_statements.htm"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find_all('p')
    text_data = ""
    for paragraph in content:
        text_data += paragraph.get_text() + "\n"
    return text_data

# Scrape and combine data
veterinary_data = scrape_veterinary_partner()
wiley_data = scrape_wiley()
combined_text_data = veterinary_data + "\n" + wiley_data
print("Scraped text data:", combined_text_data[:500])

Scraped text data: 


The content of this site is owned by Veterinary Information Network (VIN®), and its reproduction and distribution may only be done with VIN®'s express permission. 
The information contained here is for general purposes only and is not a substitute for advice from your veterinarian. Any reliance you place on such information is strictly at             your own risk.
Links to non-VIN websites do not imply a recommendation or endorsement by VIN® of the views or content contained within those sit


In [3]:
# Tokenization code
import torch

vocab = sorted(list(set(combined_text_data)))  # Use combined_text_data here
stoi = {ch: i for i, ch in enumerate(vocab)}
itos = {i: ch for i, ch in enumerate(vocab)}

#def encode(text):
    #return [stoi[char] for char in text]

def encode(text):
    return [stoi[char] for char in text if char in stoi]  # Ignore unknown characters


def decode(tokens):
    return ''.join([itos[token] for token in tokens])

data = torch.tensor(encode(combined_text_data), dtype=torch.long)
print("Data tensor created successfully with shape:", data.shape)

Data tensor created successfully with shape: torch.Size([545])


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPTModel(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size, dropout=0.2):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.ModuleList([Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_embd, n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class MultiHeadAttention(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.heads = nn.ModuleList([Head(n_embd, head_size) for _ in range(n_head)])
        self.proj = nn.Linear(n_embd, n_embd)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.proj(out)

class Head(nn.Module):
    def __init__(self, n_embd, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        B, T, C = x.size()
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * (C ** -0.5)
        wei = F.softmax(wei, dim=-1)
        v = self.value(x)
        out = wei @ v
        return out

class FeedForward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd)
        )

    def forward(self, x):
        return self.net(x)


In [5]:
import torch.optim as optim

# Hyperparameters
vocab_size = len(vocab)
block_size = 40
n_embd = 512
n_head = 8
n_layer = 6
learning_rate = 3e-4
#device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Set the device based on availability
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
print(f"Using device: {device}")

# Model setup
model = GPTModel(vocab_size, n_embd, n_head, n_layer, block_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def get_batch(data, batch_size, block_size):
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x.to(device), y.to(device)

for epoch in range(500):  # Adjust number of epochs as needed
    model.train()
    xb, yb = get_batch(data, batch_size=64, block_size=block_size)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"Epoch {epoch} Loss: {loss.item()}")


Using device: cpu
Epoch 0 Loss: 3.8697242736816406
Epoch 100 Loss: 0.06791512668132782
Epoch 200 Loss: 0.01268330030143261
Epoch 300 Loss: 0.0017159543931484222
Epoch 400 Loss: 0.0007557602366432548


In [11]:
import torch.nn.functional as F  # Import necessary module

def generate(model, start_text, max_length=100):
    model.eval()
    start_text = start_text.lower()  # Ensure consistency with vocabulary if lowercase only
    idx = torch.tensor(encode(start_text), dtype=torch.long, device=device).unsqueeze(0)
    for _ in range(max_length):
        # Limit input sequence length to block_size
        idx_cond = idx[:, -block_size:]
        logits, _ = model(idx_cond)  # Pass the limited sequence to the model
        next_id = torch.multinomial(F.softmax(logits[:, -1, :], dim=-1), num_samples=1)
        idx = torch.cat((idx, next_id), dim=1)
    return decode(idx[0].tolist())

print(generate(model, start_text="What should I do if my dog", max_length=100))

what should i do if my dogexply: r    plyond endorss nondodbs bs dotsskss otssomes otsor od onoro ovioonouoouoouououououououou
