<a href="https://colab.research.google.com/github/BF667/ipynb/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers datasets tokenizers trl peft bitsandbytes accelerate

In [ ]:
import os
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoConfig,
    GPT2LMHeadModel,
    GPT2Config,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Create sample training data
text_file_path = "/content/data.txt"
if not os.path.exists(text_file_path):
    # Seed sentences; they are repeated below to build a toy corpus.
    sample_texts = [
        "The quick brown fox jumps over the lazy dog. ",
        "Machine learning is a subset of artificial intelligence. ",
        "Python is a popular programming language for data science. ",
        "Deep learning models require large amounts of data. ",
        "Natural language processing helps computers understand text. ",
        "Transformers revolutionized the field of NLP. ",
        "Neural networks are inspired by the human brain. ",
        "Gradient descent is an optimization algorithm. ",
        "Backpropagation is used to train neural networks. ",
        "Attention mechanisms help models focus on important parts. ",
    ]

    # Make sure the target directory exists (it may not outside of Colab).
    os.makedirs(os.path.dirname(text_file_path) or ".", exist_ok=True)

    # Write ONE sentence per line. The "text" loader used below creates one
    # example per line, so writing the sentences back-to-back without newlines
    # would collapse the whole corpus into a single, heavily truncated example.
    with open(text_file_path, "w", encoding="utf-8") as f:
        for _ in range(1000):
            f.writelines(f"{text.strip()}\n" for text in sample_texts)

# Load the dataset (one row per line of the text file)
dataset = load_dataset("text", data_files={"train": text_file_path})

# Train a tokenizer from scratch
print("Training tokenizer from scratch...")

# Special tokens reserved in the vocabulary before any merges are learned.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Byte-pair-encoding model that falls back to [UNK] for unknown symbols,
# with input split on whitespace before BPE is applied.
bpe_model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = Whitespace()

# Small vocabulary for demonstration; merges seen fewer than twice are dropped.
trainer = BpeTrainer(
    vocab_size=10000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Train the tokenizer on our dataset
def get_training_corpus():
    """Yield the training split's "text" column in batches of up to 1000 rows."""
    train_split = dataset["train"]
    batch_size = 1000
    for start in range(0, len(train_split), batch_size):
        stop = start + batch_size
        yield train_split[start:stop]["text"]


# Feed the batches to the BPE trainer configured above.
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

# Convert to HuggingFace tokenizer
from transformers import PreTrainedTokenizerFast

# Wrap the trained `tokenizers` object so it exposes the transformers API.
# [CLS]/[SEP] are additionally registered as BOS/EOS: without an EOS token,
# code that reads `tokenizer.eos_token_id` (e.g. at generation time) gets None.
# Both tokens are already in the trained vocabulary, so its size is unchanged.
hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
    bos_token="[CLS]",
    eos_token="[SEP]",
)

# Create a custom configuration for our model
# GPT2Config defaults bos/eos to 50256 (the original GPT-2 vocabulary), which
# is out of range for this small custom vocabulary, so the special-token ids
# are taken from our tokenizer instead.
config = GPT2Config(
    vocab_size=hf_tokenizer.vocab_size,
    n_positions=512,  # Maximum sequence length
    n_embd=256,       # Embedding dimension
    n_layer=6,        # Number of transformer layers
    n_head=8,         # Number of attention heads
    resid_pdrop=0.1,
    embd_pdrop=0.1,
    attn_pdrop=0.1,
    bos_token_id=hf_tokenizer.cls_token_id,
    eos_token_id=hf_tokenizer.sep_token_id,
    pad_token_id=hf_tokenizer.pad_token_id,
)

# Initialize model from scratch with random weights
print("Initializing model from scratch...")

# Pick the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the randomly initialised GPT-2, size its embedding matrix to the full
# tokenizer length, then place it on the chosen device.
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(hf_tokenizer))
model = model.to(device)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of rows from the dataset.

    Args:
        examples: A batch from `Dataset.map(batched=True)`; its "text" entry
            holds the strings to encode.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No `return_tensors`: `Dataset.map` stores plain lists, so building torch
    # tensors here is wasted work. No `padding="max_length"` either: padding
    # every short sentence to 512 tokens multiplies the training cost, and the
    # language-modeling collator already pads each batch to its longest member.
    return hf_tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )


print("Tokenizing dataset...")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Configure LoRA for parameter-efficient fine-tuning
# NOTE(review): the base model above is randomly initialised; LoRA normally
# adapts a *pretrained* model and trains only the low-rank adapters. Confirm
# that adapter-only training is really what is wanted for a from-scratch run.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # GPT-2's fused attention projection; stated explicitly instead of relying
    # on PEFT's per-architecture default.
    target_modules=["c_attn"],
    # GPT-2 implements its projections as Conv1D, which stores weights as
    # (fan_in, fan_out); PEFT documents fan_in_fan_out=True for this case.
    fan_in_fan_out=True,
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-4,  # Higher learning rate for training from scratch
    logging_steps=10,
    num_train_epochs=10,  # More epochs for training from scratch
    save_steps=100,
    # Mixed precision needs a GPU; enabling it unconditionally breaks the CPU
    # fallback that the rest of the script supports.
    fp16=torch.cuda.is_available(),
    # The string "none" disables all reporting integrations (wandb, etc.);
    # passing None falls back to the library default instead of disabling them.
    report_to="none",
    save_total_limit=2,
    # `load_best_model_at_end` is intentionally not set: it requires an
    # evaluation strategy matching the save strategy, and this run has no
    # evaluation dataset, so enabling it raises at construction time.
)

# Data collator for language modeling
# mlm=False: we're doing causal language modeling, not masked
collator_options = {
    "tokenizer": hf_tokenizer,
    "mlm": False,
}
data_collator = DataCollatorForLanguageModeling(**collator_options)

# Initialize the TRL trainer
# `model` has already been wrapped with get_peft_model() above, so the LoRA
# config is NOT passed again via `peft_config`; supplying both an
# already-wrapped model and a config is redundant.
# NOTE(review): `dataset_text_field`, `max_seq_length`, `tokenizer` and
# `packing` are accepted as SFTTrainer keyword arguments only by older TRL
# releases — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets["train"],
    dataset_text_field="text",
    max_seq_length=512,
    tokenizer=hf_tokenizer,
    args=training_args,
    packing=False,
    data_collator=data_collator,
)

# Train the model from scratch
print("Training model from scratch...")
trainer.train()

# Save the model and tokenizer
import copy

# The trained model is LoRA-wrapped. Saving it as-is writes only the adapter
# weights, and the randomly initialised base model is stored nowhere else, so
# the directory could not be loaded on its own afterwards. Merge the adapters
# into the base weights and save the resulting full model instead.
trained_model = trainer.model
if hasattr(trained_model, "merge_and_unload"):
    # Merge on a copy so the in-memory PEFT model is left untouched.
    trained_model = copy.deepcopy(trained_model).merge_and_unload()
trained_model.save_pretrained("./scratch_model")
hf_tokenizer.save_pretrained("./scratch_model")
print("Model and tokenizer saved to ./scratch_model")

In [ ]:
# Test the model trained from scratch
from transformers import pipeline
import torch

# Load the model trained from scratch
model_path = "./scratch_model"

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device_index = 0
else:
    device_index = -1

# Model weights and tokenizer are both read from the same saved directory.
generator = pipeline(
    "text-generation",
    model=model_path,
    tokenizer=model_path,
    device=device_index,
)

# Generate text
prompts = [
    "Machine learning",
    "The quick brown",
    "Deep learning",
    "Neural networks",
]

# Sample one continuation of up to 50 tokens (prompt included) per prompt.
for prompt in prompts:
    outputs = generator(
        prompt,
        max_length=50,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        # Use the tokenizer's dedicated [PAD] token. The custom tokenizer is
        # built without an EOS token, so `eos_token_id` would be None here.
        pad_token_id=generator.tokenizer.pad_token_id,
    )
    print(f"\nPrompt: {prompt}")
    print(f"Generated: {outputs[0]['generated_text']}")

In [ ]:
# Optional: Evaluate the model's perplexity
import math
from torch.utils.data import DataLoader

# Create a small evaluation set
eval_texts = [
    "The future of artificial intelligence is bright.",
    "Machine learning algorithms improve with more data.",
    "Neural networks can learn complex patterns.",
    "Transformers use attention mechanisms effectively.",
    "Python is widely used in data science.",
]

# Tokenize evaluation texts (padded to the longest sentence in the batch)
eval_encodings = hf_tokenizer(eval_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Move the inputs to wherever the model lives; otherwise the forward pass
# fails with a device mismatch when the model was trained on the GPU.
model_device = next(model.parameters()).device
eval_encodings = eval_encodings.to(model_device)

# Exclude padding positions from the loss (-100 is ignored by the LM loss);
# counting [PAD] tokens would distort the reported perplexity.
eval_labels = eval_encodings["input_ids"].clone()
eval_labels[eval_encodings["attention_mask"] == 0] = -100

# Calculate perplexity = exp(mean token-level cross-entropy)
model.eval()
with torch.no_grad():
    outputs = model(**eval_encodings, labels=eval_labels)
    loss = outputs.loss
    perplexity = math.exp(loss.item())

print(f"Model perplexity: {perplexity:.2f}")