In [1]:
!pip install -q accelerate peft transformers trl datasets isbnlib

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m488.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [85]:
from transformers import GPT2Tokenizer, GPT2Model
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer

model_name = "gpt2-medium"  # This is the base GPT-2 model with 124M parameters
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as padding token
model = AutoModelForCausalLM.from_pretrained(model_name).to('cuda')

# Set pad token for the model as well
model.config.pad_token_id = tokenizer.eos_token_id


In [3]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split='train')

NameError: name 'dataset_name' is not defined

In [None]:
dataset

In [70]:
# 3. Prepare the dataset
dataset = load_dataset("P1ayer-1/isbndb-full-database")
dataset = dataset.shuffle(seed=42)
dataset = dataset['train'].select(range(20000))
df = dataset.to_pandas()

In [71]:
# 4. Data preprocessing
# Clean date_published
df['date_published'] = df['date_published'].astype(str).str.extract(r"((?:19|20)\d{2})", expand=False)

# Standardize ISBN13
from isbnlib import to_isbn13
def standardize_isbn(isbn):
    try:
        return to_isbn13(isbn)
    except:
        return None
df['isbn13'] = df['isbn13'].apply(standardize_isbn)


# Remove missing values and duplicates
df = df.dropna(subset=['title', 'authors', 'date_published'])
df = df.drop_duplicates(subset=['title', 'isbn13'])

In [72]:
# 5. Prepare text for tokenization
def prepare_text(row):
    return f'''<s>[INST] <<SYS>> System prompt: This is a isbn13 question-answering session. <</SYS>>
    What is the isbn13 number of the book with these details :
    Title: {row['title']}
    Long Title: {row['title_long']}
    Author(s): {row['authors']}
    Language: {row['language']}
    Published: {row['date_published']}
    ISBN: {row['isbn']} [/INST]
    The ISBN13 number is {row['isbn13']} </s>'''

# Update texts with new format
texts = df.apply(prepare_text, axis=1)

In [73]:
texts[0]

"<s>[INST] <<SYS>> System prompt: This is a isbn13 question-answering session. <</SYS>>\n    What is the isbn13 number of the book with these details :\n    Title: Digital Photography All-in-One Desk Reference For Dummies\n    Long Title: Digital Photography All-in-One Desk Reference For Dummies\n    Author(s): ['Busch, David D.']\n    Language: en\n    Published: 2008\n    ISBN: 0470401958 [/INST]\n    The ISBN13 number is 9780470401958 </s>"

In [74]:
class BookDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        # Format the text in a more structured way
        text = self.texts[idx]

        item = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        labels = item['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': item['input_ids'].squeeze(),
            'attention_mask': item['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }

    def __len__(self):
        return len(self.texts)

# Create dataset
dataset = BookDataset(texts.tolist(), tokenizer, max_length=256)

# Test the dataset
sample = dataset[0]
print("Input shape:", sample['input_ids'].shape)
print("Attention mask shape:", sample['attention_mask'].shape)
print("Labels shape:", sample['labels'].shape)
print("\nDecoded text:")
print(tokenizer.decode(sample['input_ids'][sample['attention_mask'] == 1]))

# Optional: Check how many tokens are actually being used
num_tokens = (sample['attention_mask'] == 1).sum()
print(f"\nNumber of actual tokens (non-padding): {num_tokens}")

Input shape: torch.Size([256])
Attention mask shape: torch.Size([256])
Labels shape: torch.Size([256])

Decoded text:
<s>[INST] <<SYS>> System prompt: This is a isbn13 question-answering session. <</SYS>>
    What is the isbn13 number of the book with these details :
    Title: Digital Photography All-in-One Desk Reference For Dummies
    Long Title: Digital Photography All-in-One Desk Reference For Dummies
    Author(s): ['Busch, David D.']
    Language: en
    Published: 2008
    ISBN: 0470401958 [/INST]
    The ISBN13 number is 9780470401958 </s>

Number of actual tokens (non-padding): 147


In [75]:
# Set random seeds for reproducibility
import numpy as np
import torch
np.random.seed(42)
torch.manual_seed(42)
# Calculate subset size and validation split
SUBSET_FRACTION = 1  # 1% of full dataset
VALID_FRACTION = 0.1   # 10% of subset for validation
# Convert texts to list if it's a pandas Series
texts_list = texts.tolist() if hasattr(texts, 'tolist') else texts
subset_size = int(len(texts_list) * SUBSET_FRACTION)

# Create subset indices and get subset
subset_indices = np.random.choice(len(texts_list), size=subset_size, replace=False)
texts_subset = [texts_list[i] for i in subset_indices]

# Split subset into train and validation
valid_size = int(len(texts_subset) * VALID_FRACTION)
train_size = len(texts_subset) - valid_size

train_texts = texts_subset[:-valid_size]
valid_texts = texts_subset[-valid_size:]

In [76]:
# Create train and validation datasets
train_dataset = BookDataset(train_texts, tokenizer, max_length=128)
valid_dataset = BookDataset(valid_texts, tokenizer, max_length=128)

# Print dataset sizes
print(f"Full dataset size: {len(texts_list):,}")
print(f"Subset size: {len(texts_subset):,}")
print(f"Training set size: {len(train_dataset):,}")
print(f"Validation set size: {len(valid_dataset):,}")


Full dataset size: 18,745
Subset size: 18,745
Training set size: 16,871
Validation set size: 1,874


In [77]:
# Verify samples from both splits
def inspect_dataset(dataset, name):
    sample = dataset[0]
    print(f"\n{name} Sample:")
    print("Input shape:", sample['input_ids'].shape)
    print("Decoded text:")
    print(tokenizer.decode(sample['input_ids'][sample['attention_mask'] == 1]))
    num_tokens = (sample['attention_mask'] == 1).sum()
    print(f"Number of tokens: {num_tokens}")

inspect_dataset(train_dataset, "Training")
inspect_dataset(valid_dataset, "Validation")


Training Sample:
Input shape: torch.Size([128])
Decoded text:
<s>[INST] <<SYS>> System prompt: This is a isbn13 question-answering session. <</SYS>>
    What is the isbn13 number of the book with these details :
    Title: Santuario
    Long Title: Santuario
    Author(s): ['Faulkner, William']
    Language: es
    Published: 1982
    ISBN: 847530088X [/INST]
    The ISBN13 number is 9788475300887 </s>
Number of tokens: 128

Validation Sample:
Input shape: torch.Size([128])
Decoded text:
<s>[INST] <<SYS>> System prompt: This is a isbn13 question-answering session. <</SYS>>
    What is the isbn13 number of the book with these details :
    Title: La Tierra, Planeta Vivo
    Long Title: La Tierra, Planeta Vivo
    Author(s): ['César Casquet']
    Language: es
    Published: 1985
    ISBN: 8434578530 [/INST]
    The ISBN13 number
Number of tokens: 128


In [86]:
# Configure training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=True,
    gradient_checkpointing=True,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=100,
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)





In [87]:
# Function to setup LoRA model
def setup_lora_model(base_model, lora_config):
    print(f"\nPreparing model with LoRA rank {lora_config.r}")
    model = prepare_model_for_kbit_training(base_model)
    lora_model = get_peft_model(model, lora_config)
    print("\nTrainable parameters:")
    lora_model.print_trainable_parameters()
    return lora_model


In [88]:
# 2. Configure LoRA
lora_configs = [
    LoraConfig(
        r=r,  # rank
        lora_alpha=16,  # Reduced from 32 to be more conservative
        target_modules=[
            "c_attn",   # attention matrices
            "c_proj",   # projection matrices
            "c_fc",     # feed-forward components
            "c_mlp"     # MLP components
        ],
        lora_dropout=0.1,  #adjusted from 0.05 # Reduced dropout for better stability
        bias="none",
        task_type="CAUSAL_LM",
        inference_mode=False,
        fan_in_fan_out=False,  # Set to True for specific GPT-2 attention layers
        modules_to_save=None   # Optional: specify modules to fully fine-tune
    ) for r in [4, 8, 16]  # Increased rank values for better expressivity
]

In [89]:
train_dataset[0]

{'input_ids': tensor([   27,    82, 36937, 38604,    60,  9959,    50, 16309,  4211,  4482,
          6152,    25,   770,   318,   257,   318,  9374,  1485,  1808,    12,
           504,    86,  1586,  6246,    13,  1279,  3556,    50, 16309,  4211,
           198,   220,   220,   220,  1867,   318,   262,   318,  9374,  1485,
          1271,   286,   262,  1492,   351,   777,  3307,  1058,   198,   220,
           220,   220, 11851,    25, 10844,    84,  4982,   198,   220,   220,
           220,  5882, 11851,    25, 10844,    84,  4982,   198,   220,   220,
           220,  6434,     7,    82,  2599, 37250,    37,  2518,    74,  1008,
            11,  3977, 20520,   198,   220,   220,   220, 15417,    25,  1658,
           198,   220,   220,   220, 26372,    25, 14489,   198,   220,   220,
           220, 32429,    25,  9508,  2425,  6200,  3459,    55, 46581, 38604,
            60,   198,   220,   220,   220,   383, 32429,  1485,  1271,   318,
         10111,  3459, 32576,  6200, 46

In [91]:
from transformers import EarlyStoppingCallback
# Training function
def train_with_config(config_idx, lora_config, model, train_dataset, valid_dataset):
    print(f"\n{'='*50}")
    print(f"Starting training for LoRA rank {lora_config.r} (Configuration {config_idx + 1}/3)")
    print(f"{'='*50}")

    # Setup model with LoRA
    lora_model = setup_lora_model(model, lora_config)

    # Initialize trainer
    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=False
        ),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # Added early stopping
    )

    # Calculate and print training statistics
    total_steps = int(len(train_dataset) / (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs)
    print(f"\nTraining Statistics:")
    print(f"Total training steps: {total_steps}")
    print(f"Evaluation every {training_args.eval_steps} steps")
    print(f"Save checkpoint every {training_args.save_steps} steps")

    # Train and evaluate
    print("\nStarting training...")
    train_result = trainer.train()
    print("\nRunning final evaluation...")
    eval_result = trainer.evaluate()

    # Save final model
    print(f"\nSaving final model for rank {lora_config.r}...")
    trainer.save_model(f"./final_model_rank_{lora_config.r}")
    trainer.model.push_to_hub(f"hitmanonholiday/gpt2-medium-{lora_config.r}")
    model.push_to_hub(f"hitmanonholiday/gpt2-medium-{lora_config.r}")
    tokenizer.push_to_hub(f"hitmanonholiday/gpt2-medium-{lora_config.r}")

    # Print results
    print(f"\nResults for LoRA rank {lora_config.r}:")
    print(f"Final training loss: {train_result.training_loss:.4f}")
    print(f"Final validation loss: {eval_result['eval_loss']:.4f}")
    print(f"Training time: {train_result.metrics['train_runtime']:.2f} seconds")

    return train_result, eval_result, trainer

# Run training for all LoRA configurations
results = []
for idx, config in enumerate(lora_configs):
    try:
        train_result, eval_result,trainer = train_with_config(
            idx, config, model, train_dataset, valid_dataset
        )
        results.append({
            'rank': config.r,
            'train_result': train_result,
            'eval_result': eval_result
        })
    except Exception as e:
        print(f"\nError training rank {config.r}: {str(e)}")
        continue

# Print final comparison
print("\n" + "="*50)
print("Training Complete! Final Results:")
print("="*50)
for result in results:
    print(f"\nRank {result['rank']}:")
    print(f"  Final Training Loss: {result['train_result'].training_loss:.4f}")
    print(f"  Final Validation Loss: {result['eval_result']['eval_loss']:.4f}")
    print(f"  Training Time: {result['train_result'].metrics['train_runtime']:.2f} seconds")

# Save results to file
import json
from datetime import datetime

results_summary = {
    'timestamp': datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
    'results': [{
        'rank': r['rank'],
        'train_loss': float(r['train_result'].training_loss),
        'eval_loss': float(r['eval_result']['eval_loss']),
        'runtime': float(r['train_result'].metrics['train_runtime'])
    } for r in results]
}

with open('training_results.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print("\nResults saved to training_results.json")


Starting training for LoRA rank 4 (Configuration 1/3)

Preparing model with LoRA rank 4

Trainable parameters:
trainable params: 1,572,864 || all params: 356,396,032 || trainable%: 0.4413

Training Statistics:
Total training steps: 2636
Evaluation every 500 steps
Save checkpoint every 500 steps

Starting training...


Step,Training Loss,Validation Loss
500,0.9161,0.874072
1000,0.8922,0.851441
1500,0.8669,0.840647
2000,0.8326,0.836057
2500,0.8354,0.8348



Running final evaluation...



Saving final model for rank 4...


adapter_model.safetensors:   0%|          | 0.00/6.32M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]


Results for LoRA rank 4:
Final training loss: 0.9366
Final validation loss: 0.8348
Training time: 1143.76 seconds

Starting training for LoRA rank 8 (Configuration 2/3)

Preparing model with LoRA rank 8

Trainable parameters:
trainable params: 3,145,728 || all params: 357,968,896 || trainable%: 0.8788

Training Statistics:
Total training steps: 2636
Evaluation every 500 steps
Save checkpoint every 500 steps

Starting training...


Step,Training Loss,Validation Loss
500,0.9182,0.875417
1000,0.8936,0.852435
1500,0.868,0.841749
2000,0.833,0.836736
2500,0.8355,0.835198



Running final evaluation...



Saving final model for rank 8...


adapter_model.safetensors:   0%|          | 0.00/12.6M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]


Results for LoRA rank 8:
Final training loss: 0.9377
Final validation loss: 0.8352
Training time: 1143.25 seconds

Starting training for LoRA rank 16 (Configuration 3/3)

Preparing model with LoRA rank 16

Trainable parameters:
trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422

Training Statistics:
Total training steps: 2636
Evaluation every 500 steps
Save checkpoint every 500 steps

Starting training...


Step,Training Loss,Validation Loss
500,0.9198,0.877109
1000,0.8935,0.853266
1500,0.8683,0.841896
2000,0.8333,0.836504
2500,0.8357,0.835022



Running final evaluation...



Saving final model for rank 16...


adapter_model.safetensors:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.44G [00:00<?, ?B/s]


Results for LoRA rank 16:
Final training loss: 0.9376
Final validation loss: 0.8350
Training time: 1154.91 seconds

Training Complete! Final Results:

Rank 4:
  Final Training Loss: 0.9366
  Final Validation Loss: 0.8348
  Training Time: 1143.76 seconds

Rank 8:
  Final Training Loss: 0.9377
  Final Validation Loss: 0.8352
  Training Time: 1143.25 seconds

Rank 16:
  Final Training Loss: 0.9376
  Final Validation Loss: 0.8350
  Training Time: 1154.91 seconds

Results saved to training_results.json
