In [1]:
!pip install torch transformers datasets tokenizers numpy tqdm arxiv




Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting arxiv
  Downloading arxiv-2.1.3-py3-none-any.whl.metadata (6.1 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.p

In [2]:
import arxiv

def fetch_arxiv_data(query="machine learning", max_results=1000):
    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.SubmittedDate)
    papers = [result.summary for result in search.results()]
    return papers

arxiv_data = fetch_arxiv_data()

with open("arxiv_data.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(arxiv_data))



  papers = [result.summary for result in search.results()]


In [4]:
import os

print(os.listdir("tokenizer"))


FileNotFoundError: [Errno 2] No such file or directory: 'tokenizer'

In [5]:
import os
from tokenizers import ByteLevelBPETokenizer

# Create the directory if it doesn't exist
os.makedirs("tokenizer", exist_ok=True)

# Train a Byte-Pair Encoding (BPE) tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(["arxiv_data.txt"], vocab_size=50_000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Save tokenizer
tokenizer.save_model("tokenizer")



['tokenizer/vocab.json', 'tokenizer/merges.txt']

In [6]:
import os

print(os.listdir("tokenizer"))  # Check files inside the tokenizer directory


['merges.txt', 'vocab.json']


In [7]:
import os
from tokenizers import ByteLevelBPETokenizer

# Ensure directory exists
os.makedirs("tokenizer", exist_ok=True)

# Train the tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.train(
    files=["arxiv_data.txt"],  # Ensure this file exists
    vocab_size=50_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)

# Save tokenizer
tokenizer.save_model("tokenizer", "tokenizer")

# Convert and save as a single JSON file
tokenizer_json = tokenizer.to_str()
with open("tokenizer/tokenizer.json", "w", encoding="utf-8") as f:
    f.write(tokenizer_json)

print("Tokenizer saved successfully!")


Tokenizer saved successfully!


In [8]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer/tokenizer.json",
    bos_token="<s>", eos_token="</s>", unk_token="<unk>",
    pad_token="<pad>", mask_token="<mask>"
)

print("Tokenizer loaded successfully!")


Tokenizer loaded successfully!


In [9]:
# Tokenizing the dataset
def tokenize_data(texts):
    return tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

tokenized_data = tokenize_data(arxiv_data)

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim=embed_size, num_heads=heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attn_output, _ = self.attention(query, key, value, attn_mask=mask)
        x = self.norm1(attn_output + query)
        forward_out = self.feed_forward(x)
        return self.norm2(forward_out + x)

class GPT(nn.Module):
    def __init__(self, vocab_size, embed_size=256, num_layers=8, heads=8, dropout=0.1, forward_expansion=4):
        super().__init__()
        self.embed_size = embed_size
        self.word_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(512, embed_size)
        self.layers = nn.ModuleList([TransformerBlock(embed_size, heads, dropout, forward_expansion) for _ in range(num_layers)])
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(x.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))

        for layer in self.layers:
            out = layer(out, out, out, mask)

        return self.fc_out(out)



In [11]:
import torch.optim as optim
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

VOCAB_SIZE = tokenizer.vocab_size
BATCH_SIZE = 32
SEQ_LENGTH = 128
EPOCHS = 5

model = GPT(vocab_size=VOCAB_SIZE).to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-4)
loss_fn = nn.CrossEntropyLoss()

def train_loop(data, model, optimizer, loss_fn, epochs):
    model.train()

    for epoch in range(epochs):
        loop = tqdm(range(0, len(data), BATCH_SIZE), leave=True)
        for i in loop:
            batch = torch.tensor(data[i: i + BATCH_SIZE], dtype=torch.long).to(device)
            target = batch[:, 1:].contiguous()
            input_data = batch[:, :-1].contiguous()

            optimizer.zero_grad()
            output = model(input_data)

            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            loss.backward()
            optimizer.step()

            loop.set_description(f"Epoch {epoch+1}/{EPOCHS}")
            loop.set_postfix(loss=loss.item())

train_loop(tokenized_data["input_ids"], model, optimizer, loss_fn, EPOCHS)


  batch = torch.tensor(data[i: i + BATCH_SIZE], dtype=torch.long).to(device)
Epoch 1/5: 100%|██████████| 32/32 [08:07<00:00, 15.23s/it, loss=3.66]
Epoch 2/5: 100%|██████████| 32/32 [07:45<00:00, 14.54s/it, loss=3.32]
Epoch 3/5: 100%|██████████| 32/32 [07:43<00:00, 14.49s/it, loss=3.28]
Epoch 4/5: 100%|██████████| 32/32 [07:45<00:00, 14.55s/it, loss=3.26]
Epoch 5/5: 100%|██████████| 32/32 [07:49<00:00, 14.67s/it, loss=3.19]


In [16]:
import torch

def generate_text(model, tokenizer, start_text, max_len=100):
    model.eval()

    tokens = tokenizer.encode(start_text, return_tensors="pt").to(device)

    for _ in range(max_len):
        with torch.no_grad():
            output = model(tokens)  # Ensure model returns logits
            logits = output.logits if hasattr(output, "logits") else output  # Handle different model outputs
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)
            tokens = torch.cat([tokens, next_token], dim=1)

    return tokenizer.decode(tokens.squeeze().detach().cpu().tolist())

# Example usage:
print(generate_text(model, tokenizer, "In this paper, we propose a novel approach to"))


In this paper, we propose a novel approach to


.
,
,
,
.
,
.
.
,
.
.
,
.
.
.
,
.
,
,
,
,
.
,
,
.
.
,
,
,
,
,
,
,
,
.
,
,
,
,
,
,
,
,
.
.
.
,
.
,
