In [None]:
# ============================================
# Module 10: Retrieval-Augmented Generation & Vector Search
# Lab 2 – Baseline LLM QA without RAG
# ============================================
# Author: Dr. Dasha Trofimova
# Course: M.Sc. Applied Data Science & AI
# --------------------------------------------
# Learning Goals:
# - Recognize the limitations of relying solely on LLM parametric memory
# - Compare factual accuracy with and without retrieval grounding
# - Experiment with prompt engineering to improve standalone reasoning
# --------------------------------------------
# Lab Objectives:
# 1. Use TinyLlama or another open-source LLM for direct QA
# 2. Ask factual and numerical questions to observe hallucinations
# 3. Compare responses against RAG-generated answers
# 4. Reflect on how retrieval grounding improves reliability
# ============================================
!pip install transformers accelerate bitsandbytes

In [None]:
import torch
from transformers import pipeline

# Model identifier handed to the pipeline below (swap it to try another model).
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Report the hardware we are about to run on.
print("cuda available?", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("running on CPU (this will be slower)")
else:
    print("device:", torch.cuda.get_device_name(0))

# Build the text-generation pipeline; the same identifier is used for both
# the model weights and the tokenizer.
llm_pipeline = pipeline(
    model=MODEL_NAME,
    tokenizer=MODEL_NAME,
    task="text-generation",
    device_map="auto",      # GPU if available, CPU otherwise
    torch_dtype="auto",
)

print(f"Model loaded: {MODEL_NAME}")


In [5]:
# Generation settings read by run_model() on every call.
TEMPERATURE = 0.2     # low temperature: a little sampling, but still controlled
MAX_NEW_TOKENS = 80   # cap on the number of newly generated tokens per answer

def build_prompt(user_msg: str) -> str:
    """
    Assemble the plain-text prompt sent to the model.

    The prompt is a short system instruction, followed by the user's message
    on a "User:" line, and ends with an open "Assistant:" marker so that the
    model's continuation is the answer.

    Args:
        user_msg: The question or message typed by the user (inserted verbatim).

    Returns:
        The complete prompt string, ending in "Assistant:".
    """
    segments = [
        "You are a helpful, concise AI assistant.\n",
        "Answer the user clearly. Do not invent a new 'User:' turn.\n\n",
        f"User: {user_msg}\n",
        "Assistant:",
    ]
    return "".join(segments)


def _extract_answer(full_text: str, prompt: str) -> str:
    """Isolate the assistant's reply from the pipeline's generated text."""
    if full_text.startswith(prompt):
        # Normal case: the output echoes the prompt, so drop exactly that
        # prefix. This stays correct even if the user's own message contains
        # the literal text "Assistant:" or "User:".
        answer = full_text[len(prompt):]
    elif "Assistant:" in full_text:
        # Fallback: the prompt was not echoed verbatim; cut at the first marker.
        answer = full_text.split("Assistant:", 1)[-1]
    else:
        answer = full_text

    # Sometimes tiny models keep going and hallucinate a new "User:" turn;
    # keep only the text before it.
    return answer.strip().split("User:", 1)[0].strip()


def run_model(user_msg: str) -> str:
    """
    Send a single user message to the model and return only the assistant's answer.

    The prompt is built with build_prompt(), generation is sampled using the
    module-level MAX_NEW_TOKENS and TEMPERATURE settings, and any extra
    self-chat the model appends after its answer is trimmed off.

    Args:
        user_msg: The user's question or message.

    Returns:
        The assistant's answer with surrounding whitespace removed.
    """
    prompt = build_prompt(user_msg)

    result = llm_pipeline(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=True,   # small models need some sampling to not freeze
        # Reuse the EOS token id as the padding id for generation.
        pad_token_id=llm_pipeline.tokenizer.eos_token_id,
    )

    # We expect something like:
    # "You are a helpful... User: <question>\nAssistant: <answer> ... maybe more"
    return _extract_answer(result[0]["generated_text"], prompt)


In [None]:
def chat_loop():
    """
    Run an interactive question/answer loop against the model.

    Reads a line from the user, sends it to run_model(), and prints the
    answer. The loop ends when the user types 'exit' or 'quit' (any case,
    surrounding whitespace ignored), or when input is closed or interrupted.
    Blank lines are ignored instead of being sent to the model.
    """
    print("\n=== TinyLlama Chat ===")
    print("Ask anything. Type 'exit' to stop.\n")

    while True:
        try:
            user_q = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel: leave the chat cleanly
            # instead of surfacing a traceback.
            print("\nBye 👋")
            break

        if not user_q:
            # Nothing typed; ask again rather than querying the model.
            continue

        if user_q.lower() in ("exit", "quit"):
            print("Bye 👋")
            break

        answer = run_model(user_q)
        print("Assistant:", answer, "\n")

# Start the interactive session; this blocks on input() until the user
# types 'exit' or 'quit'.
chat_loop()
