In [1]:
!pip install -q accelerate peft transformers torch trl datasets isbnlib tqdm langchain chromadb tiktoken langchain-community sentence-transformers

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.5/52.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m628.3/628.3 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m37.1 MB/s[0m eta [36m0:0

In [2]:
import os
import gc
os.environ["TORCHDYNAMO_DISABLE"] = "1"
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    Trainer,
    pipeline,
    logging,
)

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as padding token
model = AutoModelForCausalLM.from_pretrained(model_name).to('cuda')

# Set pad token for the model as well
model.config.pad_token_id = tokenizer.eos_token_id

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
import random
class BookDataset(torch.utils.data.Dataset):
    def __init__(self, texts, tokenizer, max_length=128):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        # Format the text in a more structured way
        text = self.texts[idx]

        item = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        labels = item['input_ids'].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            'input_ids': item['input_ids'].squeeze(),
            'attention_mask': item['attention_mask'].squeeze(),
            'labels': labels.squeeze()
        }

    def __len__(self):
        return len(self.texts)

    def get_random_samples(self, seed, num_samples):
        random.seed(seed)
        sampled_texts = random.sample(self.texts, num_samples)
        return sampled_texts

In [4]:
dataset = load_dataset("P1ayer-1/isbndb-full-database")
dataset = dataset.shuffle(seed=42)
dataset = dataset['train'].select(range(20000))
df = dataset.to_pandas()

# 4. Data preprocessing
# Clean date_published
df['date_published'] = df['date_published'].astype(str).str.extract(r"((?:19|20)\d{2})", expand=False)

# Standardize ISBN13
from isbnlib import to_isbn13
def standardize_isbn(isbn):
    try:
        return to_isbn13(isbn)
    except:
        return None
df['isbn13'] = df['isbn13'].apply(standardize_isbn)


# Remove missing values and duplicates
df = df.dropna(subset=['title', 'authors', 'date_published'])
df = df.drop_duplicates(subset=['title', 'isbn13'])

# 5. Prepare text for tokenization
def prepare_text(row):
    return f'''
    What is the isbn13 number of the book with these details :
    Title: {row['title']}
    Long Title: {row['title_long']}
    Author(s): {row['authors']}
    Language: {row['language']}
    Published: {row['date_published']}
    ISBN: {row['isbn']}
    The ISBN13 number is {row['isbn13']}'''

# Update texts with new format
texts = df.apply(prepare_text, axis=1)

# Create dataset
dataset = BookDataset(texts.tolist(), tokenizer, max_length=256)

# Test the dataset
sample = dataset[0]
print("Input shape:", sample['input_ids'].shape)
print("Attention mask shape:", sample['attention_mask'].shape)
print("Labels shape:", sample['labels'].shape)
print("\nDecoded text:")
print(tokenizer.decode(sample['input_ids'][sample['attention_mask'] == 1]))

# Optional: Check how many tokens are actually being used
num_tokens = (sample['attention_mask'] == 1).sum()
print(f"\nNumber of actual tokens (non-padding): {num_tokens}")

# Set random seeds for reproducibility
import numpy as np
import torch
np.random.seed(42)
torch.manual_seed(42)
# Calculate subset size and validation split
SUBSET_FRACTION = 1  # 1% of full dataset
VALID_FRACTION = 0.1   # 10% of subset for validation
# Convert texts to list if it's a pandas Series
texts_list = texts.tolist() if hasattr(texts, 'tolist') else texts
subset_size = int(len(texts_list) * SUBSET_FRACTION)

# Create subset indices and get subset
subset_indices = np.random.choice(len(texts_list), size=subset_size, replace=False)
texts_subset = [texts_list[i] for i in subset_indices]

# Split subset into train and validation
valid_size = int(len(texts_subset) * VALID_FRACTION)
train_size = len(texts_subset) - valid_size

train_texts = texts_subset[:-valid_size]
valid_texts = texts_subset[-valid_size:]

# Create train and validation datasets
train_dataset = BookDataset(train_texts, tokenizer, max_length=200)
valid_dataset = BookDataset(valid_texts, tokenizer, max_length=200)

# Print dataset sizes
print(f"Full dataset size: {len(texts_list):,}")
print(f"Subset size: {len(texts_subset):,}")
print(f"Training set size: {len(train_dataset):,}")
print(f"Validation set size: {len(valid_dataset):,}")

# Verify samples from both splits
def inspect_dataset(dataset, name):
    sample = dataset[0]
    print(f"\n{name} Sample:")
    print("Input shape:", sample['input_ids'].shape)
    print("Decoded text:")
    print(tokenizer.decode(sample['input_ids'][sample['attention_mask'] == 1]))
    num_tokens = (sample['attention_mask'] == 1).sum()
    print(f"Number of tokens: {num_tokens}")

inspect_dataset(train_dataset, "Training")
inspect_dataset(valid_dataset, "Validation")

README.md:   0%|          | 0.00/671 [00:00<?, ?B/s]

(…)-00000-of-00011-8c76aaae89c32750.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

(…)-00001-of-00011-0570d9f1f07ee5ea.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

(…)-00002-of-00011-43bde59cf032f28e.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

(…)-00003-of-00011-cc6f9f6e8311bb1d.parquet:   0%|          | 0.00/300M [00:00<?, ?B/s]

(…)-00004-of-00011-c0423bfbe2df7a76.parquet:   0%|          | 0.00/321M [00:00<?, ?B/s]

(…)-00005-of-00011-bfd8f039d2714da0.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

(…)-00006-of-00011-18be629cdddd6f97.parquet:   0%|          | 0.00/223M [00:00<?, ?B/s]

(…)-00007-of-00011-ea8e09c51301bce7.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

(…)-00008-of-00011-bff7d5ac18f4238c.parquet:   0%|          | 0.00/297M [00:00<?, ?B/s]

(…)-00009-of-00011-84ef5d0ca46f90be.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

(…)-00010-of-00011-dfef6d372d63bc4c.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/28086774 [00:00<?, ? examples/s]

Input shape: torch.Size([256])
Attention mask shape: torch.Size([256])
Labels shape: torch.Size([256])

Decoded text:

    What is the isbn13 number of the book with these details :
    Title: Digital Photography All-in-One Desk Reference For Dummies
    Long Title: Digital Photography All-in-One Desk Reference For Dummies
    Author(s): ['Busch, David D.']
    Language: en
    Published: 2008
    ISBN: 0470401958
    The ISBN13 number is 9780470401958

Number of actual tokens (non-padding): 111
Full dataset size: 18,745
Subset size: 18,745
Training set size: 16,871
Validation set size: 1,874

Training Sample:
Input shape: torch.Size([200])
Decoded text:

    What is the isbn13 number of the book with these details :
    Title: Santuario
    Long Title: Santuario
    Author(s): ['Faulkner, William']
    Language: es
    Published: 1982
    ISBN: 847530088X
    The ISBN13 number is 9788475300887
Number of tokens: 92

Validation Sample:
Input shape: torch.Size([200])
Decoded text:

    W

In [5]:
dataset_sample = train_dataset.get_random_samples(50,200)

In [6]:
from langchain.docstore.document import Document

# Convert dataset_sample to LangChain Documents
documents = [Document(page_content=text) for text in dataset_sample]

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Load HuggingFace embedding model
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Create vector store using Chroma
vector_store = Chroma.from_documents(documents, embeddings)

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
retriever = vector_store.as_retriever()

In [9]:
from langchain.chains import RetrievalQA
from transformers import pipeline
from langchain.llms import HuggingFacePipeline

# # Create a Hugging Face pipeline with GPU support
# gpt2_pipeline = pipeline(
#     "text-generation",
#     model="gpt2-large",
#     tokenizer=tokenizer,
#     max_length=1024 ,
#     device=0  # Use GPU (set to -1 for CPU)
# )

# # Wrap the pipeline in a LangChain-compatible LLM
# llm = HuggingFacePipeline(pipeline=gpt2_pipeline)



In [10]:
from langchain.chains import RetrievalQA

# # Create RAG chain with the Hugging Face LLM
# rag_chain = RetrievalQA.from_chain_type(
#     retriever=retriever,
#     chain_type="stuff",  # Default document combination method
#     llm=llm  # Use the Hugging Face LLM wrapped in LangChain
# )

In [11]:
def get_rag_pipeline(model_name):
    # Create a Hugging Face pipeline with GPU support
    gpt2_pipeline = pipeline(
        "text-generation",
        model=model_name,
        tokenizer=tokenizer,
        max_length=1024 ,
        device=0  # Use GPU (set to -1 for CPU)
    )

    # Wrap the pipeline in a LangChain-compatible LLM
    llm = HuggingFacePipeline(pipeline=gpt2_pipeline)
    rag_chain = RetrievalQA.from_chain_type(
        retriever=retriever,
        chain_type="stuff",  # Default document combination method
        llm=llm  # Use the Hugging Face LLM wrapped in LangChain
    )
    return rag_chain

In [12]:
import re

def get_isbn13(text):
    isbn13_match = re.search(r"\b\d{13}\b", text)
    return isbn13_match.group(0) if isbn13_match else None

def infer(prompt,rag_chain):
  result = rag_chain.run(prompt)
  splits = result.split('Helpful Answer:')
  ans_from_prompt = get_isbn13(splits[0])
  ans_from_rag = get_isbn13(splits[1])
  return ans_from_prompt == ans_from_rag


In [13]:
test_samples = []
for x in dataset_sample:
   split_longtitle = x.split('Long Title:')
   split_title = split_longtitle[0].split('Title:')
   title = split_title[1]
   test_samples.append("What is the isbn number of the book with title "+title+" ?")

In [16]:
from tqdm import tqdm


MODEL_NAMES = [
    "gpt2",
    "gpt2-medium",
    "gpt2-large"
]

RAG_ACCURACY = {}
for model in MODEL_NAMES:
    rag_chain = get_rag_pipeline(model)
    count = 0
    for x in tqdm(test_samples):
      if infer(x,rag_chain):
        count+=1
    RAG_ACCURACY[model] = count/len(test_samples)
    print(model, count/len(test_samples))

100%|██████████| 200/200 [12:29<00:00,  3.75s/it]


gpt2 0.005


100%|██████████| 200/200 [19:07<00:00,  5.74s/it]


gpt2-medium 0.025


100%|██████████| 200/200 [23:04<00:00,  6.92s/it]

gpt2-large 0.07





In [17]:
RAG_ACCURACY

{'gpt2': 0.005, 'gpt2-medium': 0.025, 'gpt2-large': 0.07}