In [1]:
import os, gc, torch

# Environment settings for the PyTorch CUDA allocator and the HF tokenizers library.
os.environ.update({
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    "TOKENIZERS_PARALLELISM": "false",
})

has_cuda = torch.cuda.is_available()
print("CUDA available:", has_cuda)
if has_cuda:
    print("GPU count:", torch.cuda.device_count())

# Collect Python garbage first, then release cached CUDA memory.
gc.collect()
if has_cuda:
    torch.cuda.empty_cache()

# ---- Paths (same as in the training script) ----
BASE_MODEL_DIR = "./local_llama2_model"          # snapshot_download
ADAPTER_DIR    = "llama2_7b_unt_lora_rag"        # OUTPUT_DIR of the training run
FAISS_DIR      = "faiss_unt_index_llama2"        # saved index
CSV_PATH       = "qa_dataset.csv"                # original dataset

# Fail fast if any required artifact is missing.
for path_exists, required_path, error_message in (
    (os.path.isdir,  BASE_MODEL_DIR, "Base model dir not found"),
    (os.path.isdir,  ADAPTER_DIR,    "Adapter dir not found"),
    (os.path.isdir,  FAISS_DIR,      "FAISS dir not found"),
    (os.path.isfile, CSV_PATH,       "CSV not found"),
):
    assert path_exists(required_path), error_message

print("‚úÖ Paths OK.")



CUDA available: True
GPU count: 2
‚úÖ Paths OK.


In [2]:
import pandas as pd
from datasets import Dataset

SEED = 42

# Read the QA CSV and keep only rows where all three fields are present.
df = (
    pd.read_csv(CSV_PATH)
    .dropna(subset=["question", "context", "answer"])
    .reset_index(drop=True)
)
ds = Dataset.from_pandas(df)

# Seeded 90/10 split; only the test portion is kept under its own name.
split = ds.train_test_split(seed=SEED, test_size=0.1)
test_raw = split["test"]

print(f"Total rows: {len(df)}")
print(f"Test rows: {len(test_raw)}")
print(test_raw[0])


Total rows: 601
Test rows: 61
{'question': '¬øQu√© es el OPT y qui√©n califica para aplicarlo?', 'context': 'OPT (Optional Practical Training) es una autorizaci√≥n de trabajo temporal para estudiantes con visa F-1 que les permite obtener experiencia laboral en su campo de estudio. Para calificar, debes haber estado inscrito a tiempo completo en una universidad de EE.UU. por al menos un a√±o acad√©mico completo y tener un estatus de visa F-1 v√°lido.', 'answer': 'El OPT es un permiso de trabajo temporal para estudiantes F-1. Para calificar, debes haber estudiado a tiempo completo por al menos un a√±o acad√©mico y tener un estatus de visa v√°lido.'}


In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Sentence-embedding model used to embed queries against the saved index.
embeddings = HuggingFaceEmbeddings(
    model_kwargs={"device": "cpu"},  # stable retrieval
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (pickle-based, per LangChain docs) data from FAISS_DIR.
# Only use this with an index you created yourself.
vectorstore = FAISS.load_local(
    FAISS_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Default to the top-3 matches per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

print("‚úÖ FAISS loaded.")


  embeddings = HuggingFaceEmbeddings(


‚úÖ FAISS loaded.


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

def _load_adapted_model(tok, **base_kwargs):
    """Load the base causal LM with ``base_kwargs`` and attach the LoRA adapter.

    The embedding matrix is resized to the tokenizer length before the adapter
    is applied, the KV cache is enabled, and the model is put in eval mode.
    """
    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL_DIR, **base_kwargs)
    base.resize_token_embeddings(len(tok))
    base.config.use_cache = True

    ft = PeftModel.from_pretrained(base, ADAPTER_DIR)
    ft.eval()
    return ft


def load_llm_with_fallback():
    """Load tokenizer + LoRA-adapted model, preferring GPU and falling back to CPU.

    Returns:
        tuple: ``(tokenizer, model, device)`` where ``device`` is the string
        ``"cuda"`` or ``"cpu"``.

    The GPU attempt uses bf16 when supported (fp16 otherwise) with
    ``device_map="auto"``. Any exception during that attempt is reported and
    the model is reloaded on CPU in fp32.
    """
    tok = AutoTokenizer.from_pretrained(ADAPTER_DIR, use_fast=True)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # Try GPU first
    if torch.cuda.is_available():
        try:
            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
            ft = _load_adapted_model(
                tok,
                torch_dtype=dtype,
                device_map="auto",
                low_cpu_mem_usage=True,
            )
            return tok, ft, "cuda"
        except Exception as e:
            print("‚ö†Ô∏è GPU load failed, falling back to CPU.")
            print("Reason:", repr(e))

        # The exception (and the traceback frames that referenced any partially
        # loaded weights) is gone once the except block exits, so release that
        # memory now instead of keeping it alive while the fp32 copy loads.
        gc.collect()
        torch.cuda.empty_cache()

    # CPU fallback
    ft = _load_adapted_model(tok, torch_dtype=torch.float32)
    ft.to("cpu")
    return tok, ft, "cpu"

# Load once at module level: `tokenizer`, `model` and `MODEL_DEVICE` are read
# as globals by the generation code defined further down.
tokenizer, model, MODEL_DEVICE = load_llm_with_fallback()
print("‚úÖ Model loaded on:", MODEL_DEVICE)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

‚úÖ Model loaded on: cuda


In [34]:
import re
from typing import List, Optional
from transformers import StoppingCriteria, StoppingCriteriaList

# --------------------------
# 1) Language helpers
# --------------------------

def detect_forced_language(q: str):
    """Return "en" or "es" when the question explicitly requests a language.

    English directives are checked before Spanish ones. Returns ``None`` when
    no explicit directive is found.
    """
    text = q.lower().strip()

    # (language code, directive regex, literal phrases that also count)
    rules = (
        (
            "en",
            r"\b(answer|respond)\s+in\s+english\b",
            ("answer in english:",),
        ),
        (
            "es",
            r"\b(answer|respond)\s+in\s+spanish\b",
            ("answer in spanish:", "en espa√±ol", "en espanol"),
        ),
    )

    for code, directive_pattern, phrases in rules:
        if re.search(directive_pattern, text) or any(p in text for p in phrases):
            return code
    return None

def guess_language(q: str):
    """Heuristically guess whether a question is Spanish ("es") or English ("en").

    Order of checks:
      1. Opening inverted punctuation or any accented character -> "es".
      2. Otherwise count distinct Spanish vs. English keyword hits among the
         question's words; Spanish wins only with strictly more hits.

    English is the default, including for empty input and ties.
    """
    q = q.strip()
    ql = q.lower()

    if q.startswith(("¬ø", "¬°")) or any(ch in ql for ch in "√°√©√≠√≥√∫√±√º"):
        return "es"

    # Language-specific keywords. Terms common to both languages
    # ("visa", "i-20", "sevis") are omitted: they would add the same amount to
    # both counters and never change the comparison.
    es_words = {
        "que", "qu√©", "como", "c√≥mo", "para", "requisitos", "documentos",
        "proceso",
    }
    en_words = {
        "what", "how", "requirements", "documents",
        "process", "apply",
    }

    # Tokenize on word characters so attached punctuation ("requisitos?",
    # "como,") does not hide a keyword, which plain whitespace splitting did.
    tokens = set(re.findall(r"[\w-]+", ql))
    es_hits = len(tokens & es_words)
    en_hits = len(tokens & en_words)

    # English by default
    return "es" if es_hits > en_hits else "en"

def target_language(q: str):
    """Pick the answer language: an explicit directive wins, otherwise guess."""
    return detect_forced_language(q) or guess_language(q)

# --------------------------
# 2) Postprocess: limpia markers
# --------------------------
def postprocess_answer(ans: str) -> str:
    """Strip prompt-scaffolding markers and leading labels from a model answer.

    Steps:
      1. For every line containing a junk marker (case-insensitive), keep only
         the text before the first marker. Lines that start with a marker are
         dropped entirely.
      2. Remove a leading "final answer:" / "respuesta final:" label and a
         leading "[target_language = xx]" tag.
      3. Collapse runs of three or more newlines into one blank line.

    Truncating at the marker (instead of discarding the whole line) matters
    when generation stops right after emitting a marker on the same line as
    the answer, e.g. "Some answer. [USER QUESTION]": the answer is preserved.
    """
    junk_markers = [
        "[USER QUESTION]", "[FINAL ANSWER]", "[RETRIEVED CONTEXT]",
        "[TARGET_LANGUAGE=", "TARGET_LANGUAGE=", "Sources:", "SOURCES:"
    ]
    # One case-insensitive alternation; search() yields the leftmost marker.
    marker_re = re.compile(
        "|".join(re.escape(m) for m in junk_markers),
        re.IGNORECASE,
    )

    clean_lines = []
    for line in ans.strip().splitlines():
        hit = marker_re.search(line)
        if hit is None:
            clean_lines.append(line)
            continue
        # Keep whatever real text preceded the marker; drop marker + remainder.
        kept = line[:hit.start()].rstrip()
        if kept.strip():
            clean_lines.append(kept)

    a = "\n".join(clean_lines).strip()
    a = re.sub(r"(?i)^\s*(final answer|respuesta final)\s*:\s*", "", a).strip()
    a = re.sub(r"(?i)^\s*\[target_language\s*=\s*(en|es)\s*\]\s*", "", a).strip()
    a = re.sub(r"\n{3,}", "\n\n", a).strip()
    return a

def hard_clean_and_cut(ans: str, lang: str) -> str:
    """Aggressively trim an answer down to a single clean paragraph.

    Removes leading "[ANSWER]" / "Answer:" style labels, cuts the text at the
    first sign of a follow-up Q/A section, keeps only the first paragraph, and
    drops stray "Question" lines. ``lang`` is accepted but not used.
    """
    # 1) remove junk tokens/labels at line starts
    label_patterns = (
        r"(?im)^\s*\[answer\]\s*",
        r"(?im)^\s*(answer|final answer)\s*:\s*",
    )
    text = ans.strip()
    for label_pattern in label_patterns:
        text = re.sub(label_pattern, "", text).strip()

    # 2) if the model tries to continue with another Q/A, cut right there
    for marker in (
        "\nQuestion:", "\nAnswer:", "\nQ:", "\nA:",
        "\nSources:", "\nSOURCES:",
        "\nWhat is", "\n¬øQu√© es", "\nQu'est-ce",
    ):
        head, found, _tail = text.partition(marker)
        if found:
            text = head.strip()

    # 3) with several paragraphs, conservatively keep only the first block
    paragraphs = []
    for chunk in re.split(r"\n\s*\n", text):
        chunk = chunk.strip()
        if chunk:
            paragraphs.append(chunk)
    if len(paragraphs) > 1:
        text = paragraphs[0].strip()

    # 4) final touch: drop lone "Question" lines that have no colon
    text = re.sub(r"(?im)^\s*question\s*$", "", text).strip()
    return re.sub(r"\n{3,}", "\n\n", text).strip()

# --------------------------
# 3) Stopping criteria: corta si intenta imprimir markers
# --------------------------
class StopOnSubsequence(StoppingCriteria):
    """Stopping criterion that fires when the sequence ends with a stop sequence.

    Only the first row of ``input_ids`` is inspected. Empty stop sequences and
    stop sequences longer than the current sequence never match.
    """

    def __init__(self, stop_token_seqs: List[List[int]]):
        super().__init__()
        self.stop_token_seqs = stop_token_seqs

    def __call__(self, input_ids, scores, **kwargs):
        current = input_ids[0].tolist()
        current_len = len(current)
        # True as soon as any stop sequence equals the tail of the sequence.
        return any(
            0 < len(candidate) <= current_len
            and current[-len(candidate):] == candidate
            for candidate in self.stop_token_seqs
        )

# Text markers that signal the model has drifted into prompt scaffolding or a
# new Q/A pair; generation is halted once one of them has been produced.
STOP_STRINGS = [
    "[USER QUESTION]",
    "[FINAL ANSWER]",
    "[RETRIEVED CONTEXT]",
    "\nQuestion:",
    "\nAnswer:",
    "\nQ:",
    "\nA:",
    "\n[ANSWER]",
    "\nSources:",
    "\nSOURCES:",
    "\nQu'est-ce",
    "\n¬øQu√© es",
    "\nWhat is",
]

# Token-id form of every marker (markers that encode to nothing are skipped).
# NOTE(review): ids come from encoding each marker in isolation; confirm they
# match the ids the model emits mid-generation so the stops actually fire.
STOP_TOKEN_IDS = [
    token_ids
    for token_ids in (
        tokenizer.encode(marker, add_special_tokens=False)
        for marker in STOP_STRINGS
    )
    if len(token_ids) > 0
]

stopping = StopOnSubsequence(STOP_TOKEN_IDS)
stopping_criteria = StoppingCriteriaList([stopping])

# --------------------------
# 4) Robust generation: slice by input token length
# --------------------------
@torch.no_grad()
def generate_text(prompt: str, max_new_tokens: int = 180, temperature: float = 0.7, top_p: float = 0.9):
    """Generate a completion for ``prompt`` and return only the new text.

    Args:
        prompt: Full prompt text; tokenized with truncation at 2048 tokens.
        max_new_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature. A value of ``None`` or ``<= 0``
            switches to greedy decoding instead of raising inside ``generate``.
        top_p: Nucleus-sampling threshold (only used when sampling).

    Returns:
        The decoded continuation (prompt tokens sliced off), passed through
        ``postprocess_answer``.
    """
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)

    if MODEL_DEVICE == "cuda":
        # Move inputs to the device holding the model's first parameter.
        first_device = next(model.parameters()).device
        inputs = {k: v.to(first_device) for k, v in inputs.items()}

    input_len = inputs["input_ids"].shape[-1]

    # Sampling requires a strictly positive temperature; treat anything else
    # as a request for deterministic (greedy) decoding.
    use_sampling = temperature is not None and temperature > 0

    gen_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask", None),
        max_new_tokens=max_new_tokens,
        do_sample=use_sampling,
        repetition_penalty=1.05,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        stopping_criteria=stopping_criteria,
    )
    if use_sampling:
        # Only pass sampling knobs when they apply.
        gen_kwargs.update(temperature=temperature, top_p=top_p)

    out = model.generate(**gen_kwargs)

    gen_ids = out[0][input_len:]  # ONLY the newly generated tokens
    text = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
    return postprocess_answer(text)

# Status message: the generation helpers and stopping criteria are now defined.
print("‚úÖ Generation ready (token-slicing + stop + postprocess).")


‚úÖ Generation ready (token-slicing + stop + postprocess).


In [35]:
from langchain_core.language_models.llms import LLM
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA

class LocalLLM(LLM):
    """LangChain LLM wrapper that delegates text generation to ``generate_text``."""

    @property
    def _llm_type(self) -> str:
        """Identifier string reported for this LLM implementation."""
        return "local_llama2_rag"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, **kwargs):
        """Generate an answer, cut it at each given stop string, then clean it.

        Extra keyword arguments are accepted and ignored.
        """
        generated = generate_text(prompt)
        for stop_text in stop or ():
            head, found, _tail = generated.partition(stop_text)
            if found:
                generated = head
        return postprocess_answer(generated)

llm = LocalLLM()

# Prompt layout, one entry per line of the final template ("" = blank line).
rag_prompt = PromptTemplate(
    template="\n".join([
        "You are a RAG assistant for international students.",
        "The user's question starts with a language tag: [LANG=en] or [LANG=es].",
        "",
        "STRICT OUTPUT RULES (must follow):",
        "1) Respond ONLY in the language specified by the tag.",
        "2) Output ONLY the final answer text. No labels, no markup.",
        "3) DO NOT write 'Question:', 'Answer:', 'Q:', '[ANSWER]', or any extra sections.",
        "4) DO NOT ask follow-up questions. DO NOT generate multiple Q&A pairs.",
        "5) If the context is insufficient, say you don't know and ask for the missing detail (same language).",
        "",
        "Context:",
        "{context}",
        "",
        "User question:",
        "{question}",
        "",
        "Final answer (one paragraph):",
    ]),
    input_variables=["context", "question"],
)

# "stuff" chain: retrieved documents are inserted into {context} of rag_prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    return_source_documents=True,
    chain_type_kwargs={"prompt": rag_prompt},
)

def chat_rag(q: str, k: int = 3):
    """Answer a user question with the RAG chain and return the cleaned text.

    Args:
        q: Raw user question; may start with a language directive such as
            "Answer in English:" or "Respond in Spanish:".
        k: Number of documents the retriever should return for this call.

    Returns:
        The answer string after ``postprocess_answer`` and
        ``hard_clean_and_cut``.

    Note:
        ``k`` is written into the shared retriever's ``search_kwargs``, so it
        stays in effect for later calls until changed again.
    """
    qa_chain.retriever.search_kwargs["k"] = k

    # language (EN by default, ES on clear signals or an explicit directive)
    lang = target_language(q)

    # Strip a leading language directive so it does not pollute retrieval.
    # Accepts the same verbs and flexible whitespace as detect_forced_language.
    q_clean = re.sub(
        r"(?i)^\s*(answer|respond)\s+in\s+(english|spanish)\s*:\s*", "", q
    ).strip()

    # The tag travels inside the question text, so chain inputs stay unchanged.
    tagged_q = f"[LANG={lang}] {q_clean}"

    # invoke() is the supported entry point; calling the chain object directly
    # is deprecated in current LangChain releases.
    res = qa_chain.invoke({"query": tagged_q})
    ans = postprocess_answer(res["result"])
    ans = hard_clean_and_cut(ans, lang)

    return ans

# Status message: the LLM wrapper, prompt, chain and chat_rag are now defined.
print("‚úÖ RAG ready.")


‚úÖ RAG ready.


In [37]:
def run_chat():
    """Run an interactive console chat loop on top of ``chat_rag``.

    Reads questions with ``input()`` until the user types "exit", "quit" or
    "q" (case-insensitive). Blank input is ignored instead of being sent
    through retrieval and generation. End-of-input or an interrupt while
    waiting for input ends the chat cleanly.
    """
    print("ü§ñ RAG Chat ready. Type 'exit' to stop.\n")
    while True:
        try:
            q = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # No more input available / user interrupted the prompt.
            print("Bot: Bye üëã")
            break

        if not q:
            # Nothing to answer; prompt again.
            continue

        if q.lower() in {"exit", "quit", "q"}:
            print("Bot: Bye üëã")
            break

        ans = chat_rag(q, k=3)
        print("\nBot:", ans)
        print("\n" + "-"*60 + "\n")

# Start the interactive session (blocks on input() until the user exits).
run_chat()

ü§ñ RAG Chat ready. Type 'exit' to stop.



You:  Que es SEVIS?



Bot: SEVIS es un sistema inform√°tico que registra y monitorea las visitas de estudiantes internacionales en Estados Unidos. Es necesario para mantener la informaci√≥n de los estudiantes extranjeros y asegurar que cumplan con las leyes federales y universitarias. Los estudiantes internacionales deben completar el formulario SEVIS I-901 antes de llegar a Estados Unidos y actualizarlo cada a√±o.

------------------------------------------------------------



You:  What is SEVIS?



Bot: SEVIS is a government system that tracks international students. Your university must keep your information updated in it for your visa status to remain valid.

------------------------------------------------------------



You:  WHat is the I-20?



Bot: The I-20 form is issued by the university and is required to pay the SEVIS fee and apply for your F-1 student visa. It confirms your acceptance and legal eligibility for a student visa.

------------------------------------------------------------



You:  Que es el I-20?



Bot: El I-20 es un documento oficial emitido por tu universidad que demuestra tu admisi√≥n y tus fondos para estudiar en EE.UU. Es la prueba principal que necesitas para solicitar tu visa F-1 y entrar al pa√≠s. Contiene informaci√≥n importante sobre ti y el programa de estudios, incluyendo tu nombre, tu n√∫mero de SEVIS, el nombre de la universidad, tu fecha de inicio y finalizaci√≥n del programa, y una estimaci√≥n de los costos de matr√≠cula y de vida.

------------------------------------------------------------



You:  How do I apply to a US university?



Bot: The key steps are to research universities, prepare for standardized tests, write essays, gather documents, apply for a visa, and get ready for your arrival. It is a long process that requires advance planning.

------------------------------------------------------------



You:  COmo aplico a una universidad en los Estados Unidos?



Bot: Para aplicar a una universidad en los Estados Unidos, debes cumplir con los requisitos m√≠nimos de ingreso, incluyendo el TOEFL o IELTS si provienes de un pa√≠s no de habla inglesa. Investigar las pol√≠ticas de ayuda financiera de cada universidad es crucial para encontrar opciones de financiaci√≥n.

------------------------------------------------------------



You:  exit


Bot: Bye üëã
