In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# -------------------------------------------------------------------------
# 1️⃣ Configuration
# -------------------------------------------------------------------------
# Base checkpoint that the LoRA adapter is applied to.
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
LORA_MODEL_DIR = "./questionGenerationModel"  # folder where you saved LoRA weights

# Use CPU (you can set to "cuda" if you have a GPU)
device = torch.device("cpu")

# Half precision is only a win on GPU. On CPU, float16 kernels can be slow or
# unsupported depending on the PyTorch build, so fall back to float32 there.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

# -------------------------------------------------------------------------
# 2️⃣ Load tokenizer and models
# -------------------------------------------------------------------------
print("Loading tokenizer and models...")

# The tokenizer is read from the adapter folder, not from the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(LORA_MODEL_DIR)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=model_dtype,
    # Place the whole model on the configured device instead of a
    # hard-coded "cpu", so changing `device` above is enough.
    device_map={"": str(device)},
)

# Attach the LoRA adapter on top of the base model. This wraps the base model;
# the adapter weights are applied at run time and are NOT merged into it.
model = PeftModel.from_pretrained(base_model, LORA_MODEL_DIR)
model = model.to(device)
model.eval()

print("Model and tokenizer loaded successfully!")

# -------------------------------------------------------------------------
# 3️⃣ Define helper function for generation
# -------------------------------------------------------------------------
def generate_response(
    prompt: str,
    max_new_tokens: int = 200,
    *,
    temperature: float = 0.7,
    top_p: float = 0.9,
    repetition_penalty: float = 1.1,
    include_prompt: bool = True,
) -> str:
    """Sample a completion for *prompt* from the module-level ``model``.

    The prompt is tokenized with the module-level ``tokenizer``, moved to
    ``device``, and passed to ``model.generate`` with sampling enabled.

    Args:
        prompt: Text to condition the generation on.
        max_new_tokens: Upper bound on the number of tokens generated in
            addition to the prompt.
        temperature: Sampling temperature forwarded to ``generate``.
        top_p: Nucleus-sampling threshold forwarded to ``generate``.
        repetition_penalty: Repetition penalty forwarded to ``generate``.
        include_prompt: When True (the default, matching the historical
            behaviour) the decoded text contains the prompt followed by the
            completion. When False only the newly generated tokens are
            decoded.

    Returns:
        The decoded text with special tokens removed.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            repetition_penalty=repetition_penalty,
            # EOS is reused as the padding id; presumably the tokenizer
            # defines no dedicated pad token -- TODO confirm.
            pad_token_id=tokenizer.eos_token_id,
        )

    generated_ids = outputs[0]
    if not include_prompt:
        # For a causal LM the returned sequence begins with the prompt
        # tokens, so skip as many tokens as the prompt contributed.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_ids = generated_ids[prompt_length:]

    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# -------------------------------------------------------------------------
# 4️⃣ Single test prompt
# -------------------------------------------------------------------------
print("\n=== Single Prompt Test ===")
# Fixed smoke-test prompt: run one generation and print the result before
# entering the interactive loop.
prompt = (
    "Topic: Artificial Intelligence, Difficulty: Hard. "
    "Generate a technical interview question."
)
response = generate_response(prompt)
print(response)

# -------------------------------------------------------------------------
# 5️⃣ Interactive mode (optional)
# -------------------------------------------------------------------------
print("\n=== Interactive Chat Mode (Ctrl+C to quit) ===")
# Read a topic and a difficulty, build a prompt, and print the model output,
# until the user types 'exit', presses Ctrl+C, or stdin is closed.
while True:
    try:
        topic = input("\nEnter a topic (or 'exit'): ").strip()
        if topic.lower() == "exit":
            break
        if not topic:
            # Nothing was entered: ask again instead of sending the model a
            # prompt with an empty topic.
            continue
        difficulty = input("Enter difficulty (Easy / Medium / Hard): ").strip()

        # NOTE(review): the prompt lines carry the source indentation as
        # leading spaces; the text is kept verbatim so the model input is
        # unchanged.
        chat_prompt = f"""
        You are an expert technical interviewer.

        Generate ONE high-quality technical interview question based on:
        - Topic: {topic}
        - Difficulty: {difficulty}
        
        Also give an ideal answer
        """

        result = generate_response(chat_prompt)
        print("\n--- Model Output ---")
        print(result)
    except (KeyboardInterrupt, EOFError):
        # EOFError is raised by input() on Ctrl+D or a closed stdin; treat it
        # like Ctrl+C so the session ends cleanly rather than with a traceback.
        print("\nExiting chat...")
        break


: 