In [1]:
import os, gc, torch
# Export the CUDA allocator and tokenizer settings through the process environment.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Collect unreachable Python objects first, then release PyTorch's cached CUDA memory.
gc.collect()
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    torch.cuda.empty_cache()

print("Cleaned. CUDA available:", cuda_ready)

Cleaned. CUDA available: True


In [2]:
# Cell 2: Set HF token safely (do NOT print it)

import os
from getpass import getpass

# Prompt only when no usable token is present. An empty or whitespace-only
# value counts as missing, so a stale `HF_TOKEN=""` does not skip the prompt.
if not os.environ.get("HF_TOKEN", "").strip():
    hf_token = getpass("Paste your HF token (input hidden): ").strip()
    if not hf_token:
        # Fail here with a clear message instead of later inside the download.
        raise ValueError("No Hugging Face token provided; set HF_TOKEN or paste one at the prompt.")
    os.environ["HF_TOKEN"] = hf_token
    del hf_token  # keep the secret out of the notebook namespace

# Report presence only - never echo the token itself.
print("HF_TOKEN set:", bool(os.environ.get("HF_TOKEN")))


Paste your HF token (input hidden):  ········


HF_TOKEN set: True


In [3]:
# Cell 2b: snapshot_download Llama-2-7b-chat-hf to local dir

from huggingface_hub import snapshot_download

LOCAL_MODEL_DIR = "./local_llama2_model"

# Download the gated checkpoint into LOCAL_MODEL_DIR, authenticating with the
# token stored in the environment.
# `local_dir_use_symlinks` is intentionally not passed: it is deprecated in
# current huggingface_hub releases and only produced the warning recorded in
# this cell's previous output.
snapshot_download(
    repo_id="meta-llama/Llama-2-7b-chat-hf",
    local_dir=LOCAL_MODEL_DIR,
    token=os.environ["HF_TOKEN"],
)

print("Downloaded to:", LOCAL_MODEL_DIR)


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

Downloaded to: ./local_llama2_model


In [4]:
# Cell 3: Config

import torch
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Model source (the local snapshot) and the directory the adapter is saved to.
BASE_MODEL, OUTPUT_DIR = LOCAL_MODEL_DIR, "llama2_7b_unt_lora_rag"

# Training params (aggressive, chosen so the run fits in 20 GB).
MAX_SEQ_LEN = 192   # tokenizer max_length used for truncation / padding
BATCH_SIZE = 1      # per-device train batch size
GRAD_ACC = 32       # gradient accumulation steps
EPOCHS = 2
LR = 2e-4
SEED = 42

torch.manual_seed(SEED)


Device: cuda


<torch._C.Generator at 0x7b7bc1bef690>

In [5]:
# Cell 4: Load CSV and split

import pandas as pd
from datasets import Dataset

csv_path = "qa_dataset.csv"  # your file
df = pd.read_csv(csv_path)

# Fail fast when the CSV lacks any of the three columns used downstream.
required = {"question", "context", "answer"}
missing = required.difference(df.columns)
if len(missing) > 0:
    raise ValueError(f"Missing columns: {missing}")

# Discard rows with a missing field, then renumber the index from zero.
df = df.dropna(subset=["question", "context", "answer"])
df = df.reset_index(drop=True)
print("Rows:", len(df))

# Seeded 90/10 train/test split.
ds = Dataset.from_pandas(df)
split = ds.train_test_split(test_size=0.1, seed=SEED)
train_raw = split["train"]
test_raw = split["test"]
train_raw, test_raw


Rows: 601


(Dataset({
     features: ['question', 'context', 'answer'],
     num_rows: 540
 }),
 Dataset({
     features: ['question', 'context', 'answer'],
     num_rows: 61
 }))

In [6]:
# Cell 5: Build SFT text field

def build_text(ex):
    """Render one QA record as a single supervised fine-tuning prompt.

    Args:
        ex: Mapping with "question", "context" and "answer" entries. Each
            value is converted with ``str()`` and stripped of surrounding
            whitespace before it is inserted into the prompt.

    Returns:
        A dict with a single key, "text", holding the instruction header
        followed by the [CONTEXT], [QUESTION] and [ANSWER] sections.
    """
    q, c, a = (str(ex[field]).strip() for field in ("question", "context", "answer"))

    # Simple, consistent prompt layout for SFT.
    return {
        "text": f"""You are a professional assistant for international students at the University of North Texas (UNT).
Always answer in the SAME language as the user's question (English question -> English answer, Spanish question -> Spanish answer).
Be precise, specific, and factual. Use the context.

[CONTEXT]
{c}

[QUESTION]
{q}

[ANSWER]
{a}
"""
    }

# Add the rendered "text" column to both splits (train first, then test).
train_ds, test_ds = (part.map(build_text) for part in (train_raw, test_raw))

# Preview the first 700 characters of one training prompt.
preview = train_ds[0]["text"]
print(preview[:700])


Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

You are a professional assistant for international students at the University of North Texas (UNT).
Always answer in the SAME language as the user's question (English question -> English answer, Spanish question -> Spanish answer).
Be precise, specific, and factual. Use the context.

[CONTEXT]
If you have trouble, don't hesitate to ask for help. The International Student Office is your primary resource. They can give you advice on cultural adjustment, visa rules, or refer you to academic and mental health counselors on campus. Universities have many resources to help you succeed.

[QUESTION]
What should I do if I have trouble adapting?

[ANSWER]
If you're struggling, contact the Internationa


In [7]:
# Cell 6 (REPLACED): Load tokenizer + base model sharded across GPU0+GPU1

import os, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    gpu_total = torch.cuda.device_count()
    print("GPU count:", gpu_total)
    for gpu_idx in range(gpu_total):
        print(gpu_idx, torch.cuda.get_device_name(gpu_idx))

# IMPORTANT: both GPUs must be visible for the sharded load below.
# If the environment restricts GPUs, expose GPU:0 and GPU:1 explicitly
# (only if needed, and before importing torch):
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)
# Reuse EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the model sharded (model parallel): fp16 weights, with
# device_map="auto" spreading the layers across the available GPUs.
load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, **load_options)

# Keep the embedding table in sync with the tokenizer size.
model.resize_token_embeddings(len(tokenizer))

# Reduce VRAM usage: gradient checkpointing on, KV cache off.
model.gradient_checkpointing_enable()
model.config.use_cache = False

model.train()

print("Loaded model with device_map='auto'.")
first_param = next(model.parameters())
print("First param device:", first_param.device)



CUDA available: True
GPU count: 2
0 NVIDIA RTX A4500
1 NVIDIA RTX A4500


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loaded model with device_map='auto'.
First param device: cuda:0


In [8]:
# Cell 7: Apply LoRA

from peft import LoraConfig, get_peft_model

lora_settings = {
    "r": 4,  # smaller rank so the run fits in memory
    "lora_alpha": 16,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base model with the adapters and keep it in training mode.
model = get_peft_model(model, lora_config)
model.train()

# Sanity check: there must be trainable parameters.
trainable = total = 0
for param in model.parameters():
    count = param.numel()
    total += count
    if param.requires_grad:
        trainable += count
print(f"Trainable: {trainable:,} / Total: {total:,} ({100*trainable/total:.4f}%)")


Trainable: 2,097,152 / Total: 6,740,512,768 (0.0311%)


In [9]:
# Cell 8: Tokenize datasets

def tokenize_fn(batch):
    """Tokenize the "text" column of a batch to a fixed length.

    Every sequence is truncated or padded to exactly MAX_SEQ_LEN tokens.

    Args:
        batch: Batch mapping that provides the prompts under "text".

    Returns:
        The tokenizer's output for the batch.
    """
    encode_options = dict(truncation=True, max_length=MAX_SEQ_LEN, padding="max_length")
    return tokenizer(batch["text"], **encode_options)

def _with_labels(example):
    """Return a "labels" column that mirrors the example's input_ids."""
    return {"labels": example["input_ids"]}


# Replace the raw columns with the tokenizer output.
train_tok = train_ds.map(tokenize_fn, batched=True, remove_columns=train_ds.column_names)
test_tok = test_ds.map(tokenize_fn, batched=True, remove_columns=test_ds.column_names)

# Labels for causal LM.
# NOTE(review): the language-modeling collator used for training may rebuild
# labels from input_ids on its own, which would make this column redundant - confirm.
train_tok = train_tok.map(_with_labels)
test_tok = test_tok.map(_with_labels)

train_tok, test_tok


Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/540 [00:00<?, ? examples/s]

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 540
 }),
 Dataset({
     features: ['input_ids', 'attention_mask', 'labels'],
     num_rows: 61
 }))

In [10]:
# Cell 9: Train with Trainer (Adafactor to reduce optimizer memory)

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# mlm=False selects causal-LM batching (no masked-token objective).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Enable bf16 only when CUDA is present and reports support for it; if the
# capability probe itself fails, stay on the non-bf16 path (best effort).
bf16_ok = False
if torch.cuda.is_available():
    try:
        bf16_ok = torch.cuda.is_bf16_supported()
    except Exception:
        bf16_ok = False

# Evaluation stays disabled. The `evaluation_strategy="no"` argument was
# dropped because "no" is already the default and the keyword is deprecated
# (renamed to `eval_strategy`) in newer transformers releases; omitting it
# works on both old and new versions.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC,
    learning_rate=LR,
    fp16=False,
    bf16=bf16_ok,
    logging_steps=25,
    save_steps=200,
    save_total_limit=2,
    report_to="none",
    seed=SEED,
    optim="adafactor",  # Adafactor to reduce optimizer memory
    # Author's note: key setting when using device_map model parallelism.
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    data_collator=data_collator,
)

# Train, then persist the adapter and tokenizer side by side in OUTPUT_DIR.
trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Saved to:", OUTPUT_DIR)



2025-12-27 12:23:43.296034: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Step,Training Loss
25,1.3837


Saved to: llama2_7b_unt_lora_rag




In [25]:
# Cell 10: Build LangChain Documents from CSV (context + answer)

from langchain_core.documents import Document

def _row_to_document(row_id, question, context, answer):
    """Build one retrieval Document from the fields of a CSV row.

    The page content holds the context followed by the reference answer; the
    row index and the original question are kept as metadata.
    """
    q = str(question).strip()
    c = str(context).strip()
    a = str(answer).strip()
    return Document(
        page_content=f"CONTEXT:\n{c}\n\nREFERENCE_ANSWER:\n{a}",
        metadata={"row_id": int(row_id), "question": q},
    )


# One Document per CSV row, in row order.
docs = [
    _row_to_document(idx, question, context, answer)
    for idx, question, context, answer in zip(df.index, df["question"], df["context"], df["answer"])
]

print("Docs:", len(docs))
print(docs[0].page_content[:400])




Docs: 601
CONTEXT:
OPT is a temporary employment that is directly related to an F-1 student’s major area of study.

REFERENCE_ANSWER:
OPT allows international students to work in their field of study for up to 12 months after graduation.


In [26]:
# Cell 11: FAISS vectorstore (CPU embeddings to avoid any CUDA issues)

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

FAISS_DIR = "faiss_unt_index_llama2"

# Force the embedding model onto the CPU so retrieval does not touch CUDA.
embedding_options = {
    "model_name": "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "model_kwargs": {"device": "cpu"},
}
embeddings = HuggingFaceEmbeddings(**embedding_options)

# Build the index from the documents, then persist it to FAISS_DIR.
vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore.save_local(FAISS_DIR)

print("âœ… FAISS saved to:", FAISS_DIR)






âœ… FAISS saved to: faiss_unt_index_llama2


In [27]:
# Cell 12: Load fine-tuned LoRA model for inference (device_map auto) + generate_text()

import torch
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

import gc

# The recorded run of this cell failed while loading the checkpoint shards with
# "weight is on the meta device, we need a `value` to put in on 1."
# NOTE(review): the most likely cause is that the training copy of the model
# (and its Trainer) was still resident on both GPUs, leaving device_map="auto"
# too little room for a second 7B copy - confirm on a fresh run.
# The adapter is already saved under OUTPUT_DIR, so the training objects (and
# any inference objects left over from a previous run of this cell) are
# released before the inference copy is loaded.
for _stale in ("trainer", "model", "base", "ft"):
    globals().pop(_stale, None)
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

tok = AutoTokenizer.from_pretrained(OUTPUT_DIR, use_fast=True)
# Reuse EOS as the padding token when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

# Inference dtype: fp32 on CPU; on GPU prefer bf16 when supported, else fp16.
if not torch.cuda.is_available():
    dtype = torch.float32
else:
    dtype = torch.float16
    try:
        if torch.cuda.is_bf16_supported():
            dtype = torch.bfloat16
    except Exception:
        # Best effort: keep fp16 when the capability probe itself fails.
        pass

print("Inference dtype:", dtype)

# Load the base model sharded (uses GPU0+GPU1 if available).
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    device_map="auto",
    low_cpu_mem_usage=True
)

# Resize only when the vocabulary size actually differs, so the dispatched
# embedding weights are not touched for a no-op resize.
if base.get_input_embeddings().weight.shape[0] != len(tok):
    base.resize_token_embeddings(len(tok))
base.config.use_cache = True

# Attach the LoRA adapters saved by the training cell and switch to eval mode.
ft = PeftModel.from_pretrained(base, OUTPUT_DIR)
ft.eval()

@torch.no_grad()
def generate_text(
    prompt: str,
    max_new_tokens: int = 256,
    temperature: float = 0.3,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
    max_prompt_tokens: int = 512
) -> str:
    """Generate a completion for ``prompt`` with the fine-tuned model.

    Args:
        prompt: Full prompt text. It is tokenized with truncation to
            ``max_prompt_tokens`` tokens.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 (or below) switches to
            greedy decoding instead of failing inside ``generate``.
        top_p: Nucleus-sampling threshold (used only when sampling).
        repetition_penalty: Repetition penalty forwarded to ``generate``.
        max_prompt_tokens: Maximum prompt length in tokens.

    Returns:
        Only the newly generated text, stripped of surrounding whitespace.
    """
    # NOTE(review): which end of an over-long prompt is cut depends on the
    # tokenizer's truncation side; if it is the right side, the question and
    # answer cue at the end of a RAG prompt are what gets lost - verify.
    inputs = tok(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
        padding=False
    )

    # For sharded models, move inputs to the device of the first parameter.
    first_device = next(ft.parameters()).device
    inputs = {k: v.to(first_device) for k, v in inputs.items()}

    gen_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask", None),
        max_new_tokens=max_new_tokens,
        repetition_penalty=repetition_penalty,
        pad_token_id=tok.eos_token_id,
        eos_token_id=tok.eos_token_id,
    )
    if temperature is not None and temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling with a non-positive temperature is invalid; decode greedily.
        gen_kwargs["do_sample"] = False

    out = ft.generate(**gen_kwargs)

    # Drop the prompt by token position rather than by string prefix: the
    # decoded text does not start with `prompt` when the prompt was truncated
    # or when decoding does not reproduce it byte-for-byte, and the old prefix
    # check then returned the prompt together with the answer.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out[0][prompt_len:]
    return tok.decode(new_tokens, skip_special_tokens=True).strip()

print("âœ… Inference ready (generate_text available).")




Inference dtype: torch.bfloat16


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



ValueError: weight is on the meta device, we need a `value` to put in on 1.

In [28]:
# Cell 13: LangChain RAG chain (RetrievalQA) using generate_text()

from typing import Any, List, Optional
from langchain_core.language_models.llms import LLM
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# Retriever over the FAISS vectorstore; returns 3 documents per query by
# default (chat_rag() overwrites "k" in search_kwargs on every call).
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

class LocalLLM(LLM):
    """LangChain LLM adapter that delegates generation to generate_text()."""

    @property
    def _llm_type(self) -> str:
        """Identifier string for this LLM implementation."""
        return "local_llama2_generate"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs: Any) -> str:
        """Generate a completion and cut it at each stop sequence.

        Args:
            prompt: Prompt forwarded unchanged to generate_text().
            stop: Optional stop sequences, applied in order; the text is cut
                before the first occurrence of each one.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            The generated text, truncated at the stop sequences if any.
        """
        completion = generate_text(prompt)
        for marker in stop or ():
            # split() returns the whole text as its first piece when the
            # marker is absent, so no membership check is needed.
            completion = completion.split(marker)[0]
        return completion

llm = LocalLLM()

# Prompt filled by the chain: {context} receives the retrieved documents and
# {question} the user query.
# NOTE(review): build_text() fine-tunes on [CONTEXT] / [QUESTION] / [ANSWER]
# section markers, while this template uses different ones; consider aligning
# the two layouts - verify the effect on answer quality.
RAG_TEMPLATE = """You are a professional assistant for international students at the University of North Texas (UNT).
Always answer in the SAME language as the user's question (English question -> English answer, Spanish question -> Spanish answer).
Use ONLY the provided context. If the context does not contain the answer, say what is missing and what the student should check next.

[RETRIEVED CONTEXT]
{context}

[USER QUESTION]
{question}

[FINAL ANSWER]
"""

rag_prompt = PromptTemplate(
    template=RAG_TEMPLATE,
    input_variables=["context", "question"],
)

# "stuff" chain: all retrieved documents are placed into a single prompt.
chain_options = {
    "llm": llm,
    "chain_type": "stuff",
    "retriever": retriever,
    "chain_type_kwargs": {"prompt": rag_prompt},
    "return_source_documents": True,
}
qa_chain = RetrievalQA.from_chain_type(**chain_options)

def chat_rag(question: str, k: int = 3):
    """Answer ``question`` through the RAG chain.

    Args:
        question: User question, passed to the chain as "query".
        k: Number of documents to retrieve. The value is written into the
            shared retriever's search_kwargs, so it stays in effect until the
            next call changes it.

    Returns:
        A tuple ``(answer, sources)`` where ``sources`` is a list of
        ``(row_id, question)`` pairs taken from the metadata of the retrieved
        documents (empty when the chain returns none).
    """
    qa_chain.retriever.search_kwargs["k"] = k
    # Calling the chain object directly is deprecated in LangChain; invoke()
    # is the supported entry point and returns the same output dict.
    res = qa_chain.invoke({"query": question})
    ans = res["result"]
    srcs = res.get("source_documents", [])
    return ans, [(d.metadata.get("row_id"), d.metadata.get("question")) for d in srcs]

print("âœ… RAG chain ready (chat_rag available).")



âœ… RAG chain ready (chat_rag available).


In [None]:
# Cell 14: Quick tests

def _show_rag_result(header, answer, sources):
    """Print a header, the answer, and the (row_id, question) pairs of its sources."""
    print(header)
    print(answer)
    print("\nSources:")
    for rid, qq in sources:
        print(f"- {rid}: {qq}")


# Spanish question.
ans_es, src_es = chat_rag("Â¿CuÃ¡l es el proceso para obtener el I-20 despuÃ©s de ser admitido?", k=3)
_show_rag_result("---- RAG ES ----", ans_es, src_es)

# English question.
ans_en, src_en = chat_rag("What are typical housing options for international graduate students at UNT?", k=3)
_show_rag_result("\n---- RAG EN ----", ans_en, src_en)