In [1]:
"1. Install + Imports"

'1. Install + Imports'

In [2]:
# ============================================================
# Stage 1: Train a MiniLM Retriever Using MNRL
# ============================================================

import os
import random
import torch
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader
import chromadb


In [3]:
#Load MiniLM Chroma Collection (From Notebook 0)

In [4]:
# Must match Notebook 0 settings
CHROMA_DIR = "chroma_pg16_minilm"
COLLECTION_NAME = "pg16_minilm"

client = chromadb.PersistentClient(path=CHROMA_DIR)
collection = client.get_collection(COLLECTION_NAME)

print("Loaded Chroma collection:", COLLECTION_NAME)

# Retrieve all documents
docs = collection.get(include=["documents", "metadatas"])

documents = docs["documents"]
metadatas = docs["metadatas"]
ids = docs["ids"]

print("Loaded", len(documents), "chunks from Chroma.")


Loaded Chroma collection: pg16_minilm
Loaded 6865 chunks from Chroma.


In [5]:
#Randomly Sample 100 Passages for MNRL Training

In [6]:
NUM_PAIRS = 100  # required by assignment guidelines
random.seed(42)

sampled_chunks = random.sample(documents, NUM_PAIRS)

print("Sampled", len(sampled_chunks), "chunks for supervised pair generation.")


Sampled 100 chunks for supervised pair generation.


In [7]:
#Define Question Generator (LLM-Based)

In [8]:
from transformers import pipeline

generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",
    device=0 if torch.cuda.is_available() else -1
)


def generate_question(passage):
    prompt = f"""
    Generate a clear question whose answer is contained in the following passage.
    Your job is to help train a retrieval model.
    
    Passage:
    {passage}

    Respond in JSON with:
    {{
        "question": "..."
    }}
    """

    response = generator(prompt, max_new_tokens=128)[0]["generated_text"]

    # Try to extract JSON field
    try:
        q = response.split('"question":')[1].split('"')[1]
        return q
    except:
        return "What does this passage describe?"


Device set to use cuda:0


In [9]:
#Build the Training CSV (pg16_train_pairs.csv)

In [10]:
train_pairs = []

for passage in tqdm(sampled_chunks):
    q = generate_question(passage)
    train_pairs.append({
        "query": q,
        "positive_passage": passage
    })

df = pd.DataFrame(train_pairs)
df.to_csv("pg16_train_pairs.csv", index=False)

print("Created pg16_train_pairs.csv with", len(df), "pairs!")


  0%|                                                   | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (896 > 512). Running this sequence through the model will result in indexing errors
 10%|████▏                                     | 10/100 [00:18<02:50,  1.89s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████| 100/100 [01:52<00:00,  1.12s/it]

Created pg16_train_pairs.csv with 100 pairs!





In [11]:
#Load Training Pairs for MiniLM Model

In [12]:
df = pd.read_csv("pg16_train_pairs.csv")
df.head()


Unnamed: 0,query,positive_passage
0,What does this passage describe?,Internals 59.3. Foreign Data Wrapper Helper Fu...
1,What does this passage describe?,sort order. Table 9.53. Array Operators Operat...
2,What does this passage describe?,SQL Syntax more expressions (separated by comm...
3,What does this passage describe?,SQL Key Words Key Word PostgreSQL SQL:2023 SQL...
4,What does this passage describe?,Monitoring Database Activity Whenever VACUUM i...


In [13]:
#Load MiniLM Model for Fine-Tuning

In [14]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Move to GPU
model = model.to("cuda")

# Enable gradient checkpointing to reduce VRAM usage
model._first_module().auto_model.gradient_checkpointing_enable()

print("Model ready!")


Model ready!


In [15]:
#Clear GPU Cache Before Training

In [16]:
torch.cuda.empty_cache()
print("GPU memory cleaned.")


GPU memory cleaned.


In [17]:
print("df exists:", "df" in globals())
print("train_examples exists:", "train_examples" in globals())
print("train_dataloader exists:", "train_dataloader" in globals())
print("train_loss exists:", "train_loss" in globals())


df exists: True
train_examples exists: False
train_dataloader exists: False
train_loss exists: False


In [18]:
# ===========================================
# SAFETY REBUILD OF TRAINING DATA STRUCTURES
# ===========================================

# If df isn't loaded, load from CSV
if "df" not in globals():
    print("Reloading df...")
    df = pd.read_csv("pg16_train_pairs.csv")

# Rebuild training examples
train_examples = [
    InputExample(
        texts=[
            f"query: {row['query']}",
            f"passage: {row['positive_passage']}"
        ]
    )
    for _, row in df.iterrows()
]

# Rebuild dataloader and loss function
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)
train_loss = losses.MultipleNegativesRankingLoss(model)

print("Rebuilt training pipeline.")
print("Num training examples:", len(train_examples))


Rebuilt training pipeline.
Num training examples: 100


In [19]:
#Train MiniLM Retriever (VRAM-Safe)

In [20]:
output_path = "models/pg16-minilm-mnrl"

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    warmup_steps=int(0.1 * len(train_dataloader)),
    show_progress_bar=True,
    use_amp=True,          # FP16 mixed precision
    output_path=output_path
)

print("Stage 1 complete! Model saved to:", output_path)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Stage 1 complete! Model saved to: models/pg16-minilm-mnrl
