In [None]:
!pip install torch torchvision torchaudio numpy transformers tokenizers torchtext matplotlib pypdf langchain langchain-community



In [None]:
# PyPDFLoader lives in the `langchain-community` package (installed above);
# the old `langchain.document_loaders` path is a deprecated alias.
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF document: `load()` yields one Document per page.
# `load_and_split()` would additionally run the default text splitter, whose
# chunk overlap can duplicate passages once the chunks are joined back together.
loader = PyPDFLoader('sample_data/gpt4all.pdf')
doc = loader.load()

# Concatenate every page into the single corpus string used for training
text_data = " ".join([document.page_content for document in doc])
assert text_data.strip(), "No text could be extracted from the PDF"

print("Text Content:")
print("Text Content type", type(text_data))
print("\nLength of Text Content:", len(text_data))

Text Content:
Text Content type <class 'str'>

Length of Text Content: 22940


# Train a BPE tokenizer on the document text

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Target vocabulary size (special tokens included); the model's embedding
# table and output layer are sized from this value.
vocab_size = 5000

# Chat-control tokens and the ids the generation cell expects them to have
special_token_dict = {
    "<start>" : 0,
    "<|end|>": 1,
    "<|system|>": 2,
    "<|user|>": 3,
    "<|ai|>": 4
}

# Initialize tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = Whitespace()

# Hand the special tokens to the trainer so they become part of the trained
# vocabulary and receive the first ids, in list order. Adding them to the
# tokenizer *before* training numbers them while the BPE vocabulary is still
# empty, so their ids are not guaranteed to stay unique once training has
# filled the vocabulary.
trainer = BpeTrainer(vocab_size=vocab_size, special_tokens=list(special_token_dict))

# Train tokenizer on the text from the document
text_list = [text_data]
tokenizer.train_from_iterator(text_list, trainer)

# Fail loudly if the ids assumed elsewhere in the notebook do not hold
actual_special_ids = {tok: tokenizer.token_to_id(tok) for tok in special_token_dict}
assert actual_special_ids == special_token_dict, f"Unexpected special token ids: {actual_special_ids}"
assert len(tokenizer.get_vocab()) <= vocab_size, "Tokenizer vocabulary exceeds vocab_size"

print("tokenizer trained")
print("length of tokenizer", len(tokenizer.get_vocab()))

# Save and reload tokenizer (optional)
# tokenizer.save("tokenizer.json")
# tokenizer = Tokenizer.from_file("tokenizer.json")

# Define encode and decode functions
def encode(s):
    """Tokenize the string `s` with the notebook's `tokenizer` and return its token ids."""
    encoding = tokenizer.encode(s)
    return encoding.ids

def decode(l):
    """Convert the sequence of token ids `l` back to text with the notebook's `tokenizer`."""
    text = tokenizer.decode(l)
    return text

# Test the tokenizer with an encode/decode round trip on a short sample.
sample_text = text_list[0][:200]  # Use the first 200 characters of the corpus
print(sample_text)
# Encode the sample itself (not the whole corpus) so the printed "sample"
# figures describe the text shown above and the output stays small.
encoded = encode(sample_text)
decoded = decode(encoded)

print("Encoded sample:", len(encoded))
print("Decoded sample:", decoded)

text list ['GPT4All: An Ecosystem of Open Source Compressed Language Models\nYuvanesh Anand\nNomic AI\nyuvanesh@nomic.ai\nZach Nussbaum\nNomic AI\nzach@nomic.ai\nAdam Treat\nNomic AI\nadam@nomic.ai\nAaron Miller\nNomic AI\naaron@nomic.ai\nRichard Guo\nNomic AI\nrichard@nomic.ai\nBen Schmidt\nNomic AI\nben@nomic.ai\nGPT4All Community\nPlanet Earth\nBrandon Duderstadt∗\nNomic AI\nbrandon@nomic.ai\nAndriy Mulyar∗\nNomic AI\nandriy@nomic.ai\nAbstract\nLarge language models (LLMs) have recently\nachieved human-level performance on a range\nof professional and academic benchmarks. The\naccessibility of these models has lagged behind\ntheir performance. State-of-the-art LLMs re-\nquire costly infrastructure; are only accessible\nvia rate-limited, geo-locked, and censored web\ninterfaces; and lack publicly available code and\ntechnical reports.\nIn this paper, we tell the story of GPT4All, a\npopular open source repository that aims to\ndemocratize access to LLMs. We outline the\ntechnical det

In [None]:
import torch
from torch.utils.data import Dataset

# Custom dataset class
class TextDataset(Dataset):
    """Sliding-window next-token dataset over a 1-D sequence of token ids.

    Item ``i`` is the pair ``(x, y)`` where ``x = data[i : i + context_length]``
    and ``y`` is the same window shifted one position to the right, i.e. the
    token that follows each position of ``x``.

    Args:
        data: sequence of integer token ids (converted to a ``torch.long`` tensor).
        context_length: number of tokens in every window.
    """

    def __init__(self, data, context_length):
        self.data = torch.tensor(data, dtype=torch.long)
        self.context_length = context_length

    def __len__(self):
        # A window needs context_length + 1 tokens (inputs plus the shifted
        # target). Clamp at zero because __len__ must never return a negative
        # number, which would make len() raise for short inputs.
        return max(0, len(self.data) - self.context_length)

    def __getitem__(self, idx):
        # Slicing never raises on its own, so out-of-range indices would
        # silently yield truncated windows; reject them explicitly.
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for dataset of length {len(self)}")
        x = self.data[idx:idx + self.context_length]
        y = self.data[idx + 1:idx + self.context_length + 1]
        return x, y

# Turn the whole corpus into token ids once; both splits are cut from this list
encoded_data = encode(text_data)

# Window length used by the datasets and by the model's position embedding
context_length = 128

# First 90% of the token stream is for training, the remainder for validation
train_size = int(0.9 * len(encoded_data))
train_data, val_data = encoded_data[:train_size], encoded_data[train_size:]

# Wrap each split in a sliding-window dataset
val_dataset = TextDataset(val_data, context_length)
train_dataset = TextDataset(train_data, context_length)

print("Training dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

Training dataset size: 4739
Validation dataset size: 413


In [None]:
from torch.utils.data import DataLoader

# Mini-batch size shared by both loaders
batch_size = 16

# Shuffle the training windows; keep the validation windows in order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

# Pull a single batch to sanity-check the tensor shapes
sample_batch = next(iter(train_loader))
sample_inputs, sample_targets = sample_batch[0], sample_batch[1]
print("Input batch shape:", sample_inputs.shape)
print("Target batch shape:", sample_targets.shape)

Input batch shape: torch.Size([16, 128])
Target batch shape: torch.Size([16, 128])


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

# Define the Transformer model components
class Head(nn.Module):
    """A single causal (masked) self-attention head.

    Args:
        head_size: output width of the key, query and value projections.
        embed_dim: width of the incoming features. When omitted, the
            notebook-level ``embedding_dim`` is used, so existing
            ``Head(head_size)`` calls behave as before.
    """

    def __init__(self, head_size, embed_dim=None):
        super().__init__()
        if embed_dim is None:
            embed_dim = embedding_dim
        self.key = nn.Linear(embed_dim, head_size)
        self.query = nn.Linear(embed_dim, head_size)
        self.value = nn.Linear(embed_dim, head_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C) and return a (B, T, head_size) tensor."""
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        # Scaled dot-product attention scores, shape (B, T, T)
        wei = q @ k.transpose(-2, -1) / (k.size(-1) ** 0.5)
        # Causal mask: position t may not look at positions > t. Build it on
        # the input's own device so the head works wherever the model lives,
        # instead of relying on a notebook-level `device` variable.
        mask = torch.ones(T, T, device=x.device).triu(diagonal=1).bool()
        wei = wei.masked_fill(mask, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # Apply attention to values
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):
    """Runs `num_heads` attention heads side by side and projects their
    concatenated outputs to the notebook-level `embedding_dim`."""

    def __init__(self, head_size, num_heads):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Concatenated head outputs are num_heads * head_size wide
        self.proj = nn.Linear(num_heads * head_size, embedding_dim)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1)
        return self.proj(concatenated)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, apply ReLU, project back."""

    def __init__(self, hidden_dim):
        super().__init__()
        expanded_dim = hidden_dim * 4
        layers = [
            nn.Linear(hidden_dim, expanded_dim),
            nn.ReLU(),
            nn.Linear(expanded_dim, hidden_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        transformed = self.net(x)
        return transformed

class Block(nn.Module):
    """Pre-norm Transformer block: self-attention then feed-forward, each
    applied to a layer-normalised input and added back residually."""

    def __init__(self, hidden_dim, num_heads):
        super().__init__()
        # Integer division: any remainder of hidden_dim / num_heads is dropped
        per_head_width = hidden_dim // num_heads
        self.attention = MultiHeadAttention(per_head_width, num_heads)
        self.ff = FeedForward(hidden_dim)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        attended = self.attention(self.ln1(x))
        x = x + attended
        transformed = self.ff(self.ln2(x))
        return x + transformed

# Define the Small Language Model
# Hyperparameters read as globals by the model classes in this cell
embedding_dim = 128  # width of the token/position embeddings and of every Block
# NOTE(review): 128 is not a multiple of 12, so Block computes
# head_size = 128 // 12 = 10 and the concatenated heads are 120 wide
# (the projection maps them back to 128) — confirm this is intended.
num_heads = 12       # attention heads per Block
num_layers = 4       # number of stacked Blocks
# Use the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")



class SmallLanguageModel(nn.Module):
    """Small decoder-only Transformer language model.

    The architecture is taken from the notebook-level names `vocab_size`,
    `embedding_dim`, `context_length`, `num_heads` and `num_layers`, which are
    read when the model is constructed.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(context_length, embedding_dim)
        self.blocks = nn.Sequential(*[Block(embedding_dim, num_heads) for _ in range(num_layers)])
        self.ln = nn.LayerNorm(embedding_dim)
        self.lm_head = nn.Linear(embedding_dim, vocab_size)

    def forward(self, x, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            x: (B, T) tensor of token ids; T must not exceed `context_length`
                because positions index the position-embedding table.
            targets: optional (B, T) tensor of target token ids.

        Returns:
            `(logits, None)` with logits of shape (B, T, vocab) when `targets`
            is None; otherwise `(logits, loss)` where logits have been
            flattened to (B*T, vocab) and loss is the cross-entropy.
        """
        B, T = x.shape
        tok_emb = self.token_embedding(x)
        # Create the position indices on the input's own device so the model
        # does not depend on a notebook-level `device` variable matching
        # wherever its weights and inputs actually live.
        pos_emb = self.pos_embedding(torch.arange(T, device=x.device))
        x = tok_emb + pos_emb

        # Apply Transformer blocks, final layer norm and the output projection
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None
        else:
            # cross_entropy expects (N, C) logits against (N,) targets
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample `max_new_tokens` tokens and append them to `idx`.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: number of tokens to sample.

        Returns:
            (B, T + max_new_tokens) tensor: the prompt followed by the samples.

        This method does not switch the module between train and eval mode;
        the caller decides whether dropout is active.
        """
        for _ in range(max_new_tokens):
            # Only the most recent `context_length` tokens fit the position table
            idx_cond = idx[:, -context_length:]
            logits, _ = self(idx_cond)
            # Keep the distribution predicted for the last position only
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx


# `embedding_dim`, `num_heads`, `num_layers` and `device` are defined once,
# above the SmallLanguageModel class; they are not repeated here so the two
# copies cannot drift apart.

# Seed PyTorch's RNG so weight initialisation (and the random sampling in the
# cells that follow) is repeatable after Restart & Run All.
torch.manual_seed(42)
model = SmallLanguageModel().to(device)

print(model)


SmallLanguageModel(
  (token_embedding): Embedding(5000, 128)
  (pos_embedding): Embedding(128, 128)
  (blocks): Sequential(
    (0): Block(
      (attention): MultiHeadAttention(
        (heads): ModuleList(
          (0-11): 12 x Head(
            (key): Linear(in_features=128, out_features=10, bias=True)
            (query): Linear(in_features=128, out_features=10, bias=True)
            (value): Linear(in_features=128, out_features=10, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=120, out_features=128, bias=True)
      )
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): ReLU()
          (2): Linear(in_features=512, out_features=128, bias=True)
        )
      )
      (ln1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    )
    (1): Block(
      (at

In [None]:
def get_batch(split):
    """Sample a random batch of (input, target) windows.

    Args:
        split: 'train' draws from the training tokens; any other value draws
            from the validation tokens.

    Returns:
        Tuple `(x, y)` of long tensors with shape (batch_size, context_length),
        both on `device`; `y` is `x` shifted one token to the right.
    """
    # Reuse the token tensor each dataset already built, instead of converting
    # the whole python list to a tensor and copying it to the device per call.
    source = train_dataset if split == 'train' else val_dataset
    data = source.data

    # Randomly sample starting indices for the sequences
    ix = torch.randint(0, len(data) - context_length, (batch_size,))

    # Extract sequences of length `context_length` and their shifted targets
    x = torch.stack([data[i:i + context_length] for i in ix])
    y = torch.stack([data[i + 1:i + context_length + 1] for i in ix])

    # Only the sampled batch is moved to the training device
    return x.to(device), y.to(device)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_steps = 2000
# Make sure dropout is active: if this cell is re-run after the generation
# cell, the model would otherwise still be in eval mode.
model.train()
for step in range(num_steps):
    x, y = get_batch('train')
    x, y = x.to(device), y.to(device)
    logits, loss = model(x, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 100 == 0:
        print(f"Step {step}, Loss: {loss.item():.4f}")

    if step % 1000 == 0:
        torch.save(model.state_dict(), f"model_step_{step}.pth")

# The periodic checkpoints above stop at step 1000, so also persist the
# weights reached at the end of training.
torch.save(model.state_dict(), f"model_step_{num_steps}.pth")

Step 0, Loss: 8.7151
Step 100, Loss: 3.4415
Step 200, Loss: 1.9324
Step 300, Loss: 1.0743
Step 400, Loss: 0.6055
Step 500, Loss: 0.3191
Step 600, Loss: 0.2116
Step 700, Loss: 0.1494
Step 800, Loss: 0.1225
Step 900, Loss: 0.0733
Step 1000, Loss: 0.0664
Step 1100, Loss: 0.0597
Step 1200, Loss: 0.0706
Step 1300, Loss: 0.0578
Step 1400, Loss: 0.0472
Step 1500, Loss: 0.0419
Step 1600, Loss: 0.0528
Step 1700, Loss: 0.0497
Step 1800, Loss: 0.0624
Step 1900, Loss: 0.0455


In [None]:
# Set the model to evaluation mode (disables dropout while sampling)
model.eval()

# Define the user query
query = "What is the summary of this research paper?"

# Encode the initial prompt with system instructions
input_prompt = f"""<start><|system|>You are a conversational assistant for the research paper, prepared to discuss various tech innovations.<|user|>{query}"""
input_ids = encode(input_prompt)

# Ids that end generation, looked up from the tokenizer instead of assumed.
# token_to_id returns None for unknown tokens, so drop that placeholder.
stop_token_ids = {tokenizer.token_to_id(tok) for tok in ("<|end|>", "<|system|>", "<|user|>", "<|ai|>")}
stop_token_ids.discard(None)

# Parameters for text generation
max_new_tokens = 200
generated_ids = []

# Generate text token by token so a special token can stop the loop early
for _ in range(max_new_tokens):
    # Convert input to tensor and move to the correct device
    context = torch.tensor([input_ids], dtype=torch.long, device=device)

    # Generate the next token (last id of the single returned sequence)
    next_id = model.generate(context, max_new_tokens=1)[0, -1].item()

    # Stop if the model generates a special token (e.g., <|end|>)
    if next_id in stop_token_ids:
        break

    # Append the new token to the running context and to the answer
    input_ids.append(next_id)
    generated_ids.append(next_id)

# Decode the whole answer in one call: decoding ids one at a time and
# concatenating the pieces drops the separators between tokens, which is why
# the words previously ran together in the printed answer.
output = decode(generated_ids)

# Print the conversation
print("=" * 50)
print("USER:", query)
print("AI:", output)
print("=" * 50)

USER: What is the summary of this research paper?
AI: ofanymodel,asbothatechnicaloverviewoftheoriginalGPT4AllmodelsaswellasacasestudyonthesubsequentgrowthoftheGPT4Allopensourceecosystem.2TheOriginalGPT4AllModel2.1DataCollectionandCurationTotraintheoriginalGPT4Allmodel,wecollectedroughlyonemillionprompt-responsepairsusingtheGPT-3.5-TurboOpenAIAPIbetweenMarch20,2023andMarch26th,2023.Inparticular,wegatheredGPT-3.5-Turboresponsestopromptsofthreepubliclyavail-abledatasets:theunifiedchip2subsetofLAIONOIG,arandomsub-sampleofStackoverflowQuestions,andasub-sampleofBigscience/P3
