In [None]:
!pip install transformers datasets accelerate bitsandbytes peft trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.18.1-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2

In [None]:
from huggingface_hub import notebook_login
import torch
import bitsandbytes
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import accelerate

In [None]:
# Show the Hugging Face Hub login widget so later downloads are authenticated.
# NOTE(review): presumably required because the meta-llama checkpoint loaded below is access-restricted — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#%%
# Base checkpoint that is loaded in 4-bit and fine-tuned later in the notebook.
model_id = "meta-llama/Llama-2-7b-hf"
# bitsandbytes 4-bit quantization settings; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",  # switched from "nf4" to "fp4"
    bnb_4bit_compute_dtype=torch.float16,
)
# Load the quantized causal LM; device_map="auto" leaves weight placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
)

# Tokenizer of the same checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset
import itertools
import re
import csv

# Hugging Face dataset repository the examples are read from.
DATASET_NAME = "din0s/asqa"
# Split of that dataset to stream.
SPLIT_NAME = "train"
# Number of examples copied into the CSV file.
NUM_EXAMPLES = 50
# CSV file the extracted examples are written to.
OUTPUT_FILENAME = "asqa_examples.csv"

In [None]:
# Stream the dataset split and copy the first NUM_EXAMPLES rows into a CSV with the
# columns question / response (first annotation's long answer) / context (first wiki URL).
try:
  dataset = load_dataset(DATASET_NAME, streaming=True, split=SPLIT_NAME)
except Exception as e:
  print(f"Error loading dataset: {e}")
  print("Please ensure you have an active internet connection.")
  # Re-raise: continuing without `dataset` would only fail a few lines later with a NameError.
  raise

print(f"Dataset loaded. Preparing to extract {NUM_EXAMPLES} examples...")

count = 0
with open(OUTPUT_FILENAME, 'w', newline='', encoding='utf-8') as csvfile:
  fieldnames = ['question', 'response', 'context']
  writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

  writer.writeheader()

  # islice stops the stream after NUM_EXAMPLES rows without a manual counter/break.
  for example in itertools.islice(dataset, NUM_EXAMPLES):
    question = example.get('ambiguous_question')

    # Use the first annotation's long answer; fall back to the "N/A" placeholder.
    annotations = example.get('annotations') or []
    long_answer = annotations[0].get('long_answer') if annotations else None
    response = long_answer if long_answer else "N/A"

    # A missing or empty 'wikipages' list yields an empty context cell instead of a crash.
    wikipages = example.get('wikipages') or []
    context = wikipages[0].get('url') if wikipages else None

    writer.writerow({
      'question': question,
      'response': response,
      'context': context
    })

    count += 1
    if count % 100 == 0:
      print(f"Processed {count}/{NUM_EXAMPLES} examples...")

print("-" * 50)
print(f"Successfully extracted {count} examples.")
print(f"Data saved to '{OUTPUT_FILENAME}'")
print("-" * 50)

README.md:   0%|          | 0.00/4.83k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Dataset loaded. Preparing to extract 50 examples...
--------------------------------------------------
Successfully extracted 50 examples.
Data saved to 'asqa_examples.csv'
--------------------------------------------------


In [None]:
import pandas as pd

# Read back the extracted examples; reuse OUTPUT_FILENAME so the path is defined in one place only.
df = pd.read_csv(OUTPUT_FILENAME)

In [None]:
from os import setgroups
import requests
from bs4 import BeautifulSoup
import spacy
import random
import pandas as pd
import time

def getText(link, timeout=10):
  """Download a page and return a short excerpt of its main content text.

  The text is taken from the element with id 'mw-content-text'. Texts shorter
  than 500 characters are returned whole; otherwise the first 500 characters
  are returned, and for texts longer than 1000 characters the last 500
  characters are appended directly after them (no separator is inserted).

  Args:
    link: URL of the page to fetch.
    timeout: Seconds to wait for the server before giving up, so one
      unresponsive host cannot stall the whole run.

  Returns:
    The excerpt as a string, or None when the link is not a usable string,
    the request fails, or the page has no 'mw-content-text' element.
  """
  # A missing URL arrives from the CSV as a non-string (e.g. NaN); reject it up front.
  if not isinstance(link, str) or not link.strip():
    print(f"Error fetching the URL: invalid link {link!r}")
    return None

  try:
    response = requests.get(link, timeout=timeout)
    response.raise_for_status()
  except requests.exceptions.RequestException as e:
    print(f"Error fetching the URL: {e}")
    return None

  soup = BeautifulSoup(response.text, 'html.parser')
  content_div = soup.find(id='mw-content-text')
  # find() returns None when the element is absent; calling get_text on it would raise.
  if content_div is None:
    print(f"Error fetching the URL: no 'mw-content-text' element in {link}")
    return None

  text = content_div.get_text(separator=' ', strip=True)

  if len(text) < 500:
    return text

  excerpt = text[:500]
  if len(text) > 1000:
    excerpt += text[-500:]
  return excerpt

# Small English spaCy pipeline, loaded once and reused for every sentence split.
nlp = spacy.load("en_core_web_sm")
def returnSentences(text):
  """Split text into a list of sentence strings using the spaCy pipeline.

  Args:
    text: The text to split.

  Returns:
    A list with one string per detected sentence. Returns an empty list when
    `text` is not a string or is blank (for example a NaN cell produced when
    pandas reads the "N/A" placeholder back from the CSV), instead of failing
    inside spaCy.
  """
  if not isinstance(text, str) or not text.strip():
    return []
  return [sent.text for sent in nlp(text).sents]

# Module-level accumulator: every prompt built by createMultiplePrompts is appended here.
allPrompts = []

def createMultiplePrompts(sentences, question, context):
  """Cut an answer into random 1-3 sentence segments and build one prompt per segment.

  Each prompt contains the question, the evidence text, the sentences that
  precede the segment, and the segment itself. Sentences are joined with a
  single space so neighbouring sentences do not run into each other.

  Args:
    sentences: List of sentence strings making up the answer.
    question: The instruction/question the answer responds to.
    context: Evidence text shown to the judge.

  Returns:
    The list of prompts created by this call. The same prompts are also
    appended to the module-level `allPrompts` list.
  """
  newPrompts = []
  position = 0

  while position < len(sentences):
    remaining = len(sentences) - position
    # Segment length is random, capped at 3 and at the number of sentences left.
    chunkSize = random.randint(1, min(remaining, 3))

    preceding = " ".join(sentences[:position])
    segment = " ".join(sentences[position:position + chunkSize])

    newPrompts.append(createPrompt(question, context, preceding, segment))
    position += chunkSize

  allPrompts.extend(newPrompts)
  return newPrompts


def createPrompt(question, context, currResponse, outputSegment):
  """Assemble the judge prompt for a single output segment.

  Args:
    question: The instruction/question being answered.
    context: Evidence text (converted with str()).
    currResponse: Sentences preceding the segment; may be empty.
    outputSegment: The 1-3 sentence segment to be judged.

  Returns:
    The fixed instruction text followed by the four labelled fields.
  """
  prompt = (
        "Instructions: You will be provided with an instruction, evidence, output segment(1-3 sentences), and preceding sentences (optional). "
        "If the preceding sentences are given, the output should be the segment that follows. Your task is to determine whether the information in the "
        "output segment can be fully verified by the evidence or if it requires further external validation. There are three cases: "
        "-If the output segment can be verified solely with the evidence, then respond with [Continue]. "
        "-If the segment doesn't require any factual verification, then respond with [No]. "
        "-If the segment cannot be verified with the given evidence and requires external knowledge, respond with [Retrieve]."
    )

  prompt += '\n Instruction: ' + str(question)
  prompt += '\nPreceding sentences: ' + str(currResponse)
  prompt += '\nEvidence: ' + str(context)
  prompt += '\nOutput segment: ' + str(outputSegment)

  return prompt

# Seed the prompt list with one hand-written example (answer text, question, evidence page).
createMultiplePrompts(returnSentences("The 2015 - 2016 season's ncaa national football championship game was played between the Clemson Tigers and the Alabama Crimson Tide on January 11, 2016. The Alabama Crimson Tide won the game by holding off the undefeated Clemson Tigers 45–40 in the fourth quarter."),
                      "Who won the 2016 ncaa football national championship?", getText("https://en.wikipedia.org/wiki/2015%20College%20Football%20Playoff%20National%20Championship"))

df = pd.read_csv('asqa_examples.csv')

# Build prompts for every extracted example, fetching its evidence page on the way.
for index, row in df.iterrows():
  answer = row['response']
  # pandas reads the "N/A" placeholder (and empty cells) back as NaN, which is not a string.
  if not isinstance(answer, str) or not answer.strip():
    print(f"Skipping row {index}: no reference answer.")
    continue

  context = getText(row['context'])
  time.sleep(2)  # pause between page requests
  if context is None:
    # A prompt whose evidence is the literal text "None" cannot be judged meaningfully.
    print(f"Skipping row {index}: evidence page could not be fetched.")
    continue

  createMultiplePrompts(returnSentences(answer), row['question'], context)

In [None]:
# Uncomment to print every generated prompt for manual inspection:
# for prompt in allPrompts:
#   print(prompt)
#   print()
#   print()

# Number of prompts accumulated so far.
print(len(allPrompts))

92


In [None]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.3.21-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-core<1.0.0,>=0.3.64 (from langchain_openai)
  Downloading langchain_core-0.3.64-py3-none-any.whl.metadata (5.8 kB)
Collecting langsmith<0.4,>=0.3.45 (from langchain-core<1.0.0,>=0.3.64->langchain_openai)
  Downloading langsmith-0.3.45-py3-none-any.whl.metadata (15 kB)
Downloading langchain_openai-0.3.21-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.2/65.2 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langchain_core-0.3.64-py3-none-any.whl (438 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.1/438.1 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langsmith-0.3.45-py3-none-any.whl (363 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.0/363.0 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langsmith, langchain-cor

In [None]:
from getpass import getpass
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import os

# SECURITY: never store the API key in the notebook. The key that was previously
# hard-coded here has been exposed and must be revoked in the OpenAI dashboard.
# The key is taken from the environment, or asked for interactively without echo.
if not os.environ.get("OPENAI_API_KEY"):
  os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model used to label each prompt.
llm = ChatOpenAI(model_name="gpt-4")

In [None]:
# Canonical spellings of the three labels the prompt asks the judge to answer with.
LABELS = {"continue": "[Continue]", "no": "[No]", "retrieve": "[Retrieve]"}

def normalizeLabel(raw):
  """Map a raw model reply onto its canonical bracketed label when it matches one.

  Surrounding whitespace, square brackets and full stops are ignored and the
  comparison is case-insensitive, so replies such as 'Retrieve' or ' [no]. '
  become '[Retrieve]' and '[No]'. Replies that match no label are returned
  stripped but otherwise unchanged; non-string replies are returned as-is.
  """
  if not isinstance(raw, str):
    return raw
  key = raw.strip(" \t\r\n[].").lower()
  return LABELS.get(key, raw.strip())

# Ask the judge model for a label for every prompt, keeping the order of allPrompts.
responses = []
for prompt in allPrompts:
  response = llm.invoke(prompt)
  responses.append(normalizeLabel(response.content))

# Surface replies that are still not one of the three labels instead of passing them on silently.
unexpected = [r for r in responses if r not in LABELS.values()]
if unexpected:
  print(f"Warning: {len(unexpected)} response(s) are not a known label: {unexpected}")

print(responses)

['[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[No]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Continue]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Continue]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Continue]', '[Retrieve]', '[Retrieve]', 'Retrieve', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Retrieve]', '[Continue]',

In [None]:
# Print how many replies match each label string exactly, one count per line.
for label in ('[Continue]', '[No]', '[Retrieve]'):
  print(responses.count(label))

7
1
82


In [None]:
from datasets import Dataset

tokenizer.padding_side = "right"
# Register the three label strings as extra tokens and grow the embedding matrix to match.
new_tokens = ["[Continue]", "[No]", "[Retrieve]"]
added_tokens = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))

# Mark the input embeddings as trainable so the new token rows can be learned.
# NOTE(review): LoRA wrapping freezes non-adapter parameters, so this flag may be reset once
# the trainer applies its PEFT config — confirm; LoraConfig.modules_to_save is the documented route.
for name, param in model.named_parameters():
  if 'embed_tokens' in name:
    param.requires_grad = True

# zip() would silently drop the surplus items if the two lists ever got out of step.
if len(allPrompts) != len(responses):
  raise ValueError(f"Got {len(allPrompts)} prompts but {len(responses)} responses; re-run the labelling cell.")

formatted_texts = [f"[INST] {p} [/INST] {a}" for p, a in zip(allPrompts, responses)]

data = Dataset.from_dict({"text": formatted_texts})
# Without an explicit max_length the tokenizer ignores both padding="max_length" and
# truncation=True (it only warns), so the limit is given here.
tokenized_data = data.map(lambda examples: tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512), batched=True)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Map:   0%|          | 0/92 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
lora_config = LoraConfig(
    r=8,  # Rank of the update matrices. Lower ranks are simpler, higher ranks are more expressive.
    lora_alpha=16, # A scaling factor for the LoRA weights.
    target_modules=["q_proj", "v_proj"], # Target the query and value projections in the attention layers
    lora_dropout=0.05,
    bias="none",
    # The vocabulary was extended with new label tokens. LoRA freezes every non-adapter weight,
    # so the embedding matrix and the output head must be listed here to be trained and saved
    # in full; otherwise the rows of the new tokens keep their initial values.
    modules_to_save=["embed_tokens", "lm_head"],
    task_type="CAUSAL_LM"
)

In [None]:
pip install --upgrade trl



In [None]:
import transformers
import trl
from trl import SFTTrainer, SFTConfig

In [None]:
# Report the versions of the core libraries and the CUDA setup that PyTorch sees.
print("--- Library Versions ---")
for caption, value in (
    ("PyTorch:", torch.__version__),
    ("Transformers:", transformers.__version__),
    ("Accelerate:", accelerate.__version__),
    ("BitsAndBytes:", bitsandbytes.__version__),
    ("TRL:", trl.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version from Torch:", torch.version.cuda),
):
    print(caption, value)

--- Library Versions ---
PyTorch: 2.6.0+cu124
Transformers: 4.52.4
Accelerate: 1.7.0
BitsAndBytes: 0.46.0
TRL: 0.18.1
CUDA available: True
CUDA version from Torch: 12.4


In [None]:
# Supervised fine-tuning configuration; checkpoints are written under "outputs".
sft_config = SFTConfig(
    output_dir="outputs",
    # SFT-specific parameters
    dataset_text_field="text",  # dataset column that holds the full training text
    # NOTE(review): each text ends with its label, so anything cut off by this limit loses the
    # target — confirm the prompts fit into 512 tokens.
    max_seq_length=512,
    packing=False, # We don't want to pack short sequences together
    # Training parameters
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step on each device
    num_train_epochs=3,
    gradient_checkpointing=True,
    learning_rate=2e-4,
    # Other TrainingArguments parameters
    fp16=True,
    save_total_limit=3,
    logging_steps=10,
    optim="paged_adamw_8bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
)

# The trainer receives the raw text dataset plus the LoRA config and builds the PEFT model itself.
trainer = SFTTrainer(
    model=model, # Pass the original, non-PEFT model
    args=sft_config, # Pass the SFTConfig object
    peft_config=lora_config, # Pass the LoraConfig object
    train_dataset=data, # The trainer will handle tokenization
    processing_class=tokenizer,
)

# Release cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

# Run the fine-tuning and write the result to ./finalModel.
# NOTE(review): the variable name suggests only adapter weights are saved — confirm.
trainer.train()
adapter_path = "./finalModel"
trainer.save_model(adapter_path)

Converting train dataset to ChatML:   0%|          | 0/92 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/92 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manvith-kotha[0m ([33manvith-kotha-university-of-illinois-urbana-champaign[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,6.4119


