# In [None]:
"""
Complete runnable script:
- Loads finalDataset.csv
- Generates a "Corrected Question" style rewrite using Qwen2.5-1.5B-Instruct
- Saves incremental results to `corrected_questions_qwen.csv`
- Resumes if output file already exists
Notes:
- If you have a GPU, it will use it. If not, it runs on CPU (slower).
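- Install required packages if missing: transformers, torch, pandas, accelerate
  (accelerate is needed because the model is loaded with `device_map` / `low_cpu_mem_usage=True`)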
"""

import os
import time
import traceback
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# -----------------------
# Config
# -----------------------
FINAL_DATASHEET = "finalDataset.csv"              # CSV that is read as input
OUTPUT_FILE = "corrected_questions_qwen.csv"      # CSV that results are written to
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"         # Hugging Face model identifier
SAVE_EVERY = 20                                   # checkpoint after this many processed rows
MAX_NEW_TOKENS = 150                              # forwarded to the pipeline as max_new_tokens
DO_SAMPLE = False                                 # forwarded to the pipeline as do_sample
TEMPERATURE = 0.0                                 # forwarded to the pipeline as temperature
# -----------------------

# 0. Fail fast when the input CSV is absent
if not os.path.exists(FINAL_DATASHEET):
    raise FileNotFoundError(f"Input file not found: {FINAL_DATASHEET}")

# 1. Read the dataset and report what was loaded
data = pd.read_csv(FINAL_DATASHEET)
print("Loaded dataset:", FINAL_DATASHEET, "shape =", data.shape)
print("Columns:", list(data.columns))

# Report the column treated as the generation input: "Corrected Question" when the
# dataset has one, otherwise the raw "question" column.
# NOTE(review): in the code visible here INPUT_COL is only printed; the prompt
# builder reads the "question" column directly - confirm which one is intended.
if "Corrected Question" in data.columns:
    INPUT_COL = "Corrected Question"
else:
    INPUT_COL = "question"
print("Using input column for generation:", INPUT_COL)

# Make sure both result columns exist (the input may already carry them)
for _result_col in ("Qwen_Corrected_Question", "Qwen_Generation_Error"):
    if _result_col not in data.columns:
        data[_result_col] = ""

# Resume support: when a previous run left an output file behind, copy its
# results into `data` so the main loop can skip rows that already have text.
if os.path.exists(OUTPUT_FILE):
    print("Found existing output file. Loading to resume...")
    existing = pd.read_csv(OUTPUT_FILE)
    # Only adopt earlier results when the file actually carries the result column
    if "Qwen_Corrected_Question" in existing.columns:
        # Rows are matched purely by index label (row position in both CSVs).
        # Restrict to labels present in both frames: a saved file with more
        # rows than the current input would otherwise make `.loc` raise KeyError.
        shared_idx = existing.index.intersection(data.index)
        if len(shared_idx) < len(existing):
            print(
                "Warning: existing output has",
                len(existing) - len(shared_idx),
                "rows beyond the input dataset; they are ignored.",
            )
        data.loc[shared_idx, "Qwen_Corrected_Question"] = existing.loc[shared_idx, "Qwen_Corrected_Question"]
        if "Qwen_Generation_Error" in existing.columns:
            data.loc[shared_idx, "Qwen_Generation_Error"] = existing.loc[shared_idx, "Qwen_Generation_Error"]
        # Empty cells are read back as NaN, so non-null entries are the rows with a result
        print(
            "Resumed from existing results. Rows already processed:",
            existing.loc[shared_idx, "Qwen_Corrected_Question"].notna().sum(),
        )

# 2. Device detection
has_cuda = torch.cuda.is_available()
device_str = "cuda" if has_cuda else "cpu"
print("Device:", device_str, "| CUDA available:", has_cuda)

# 3. Load model & tokenizer
print(f"Loading model {MODEL_NAME} ... (this may take a bit)")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    # Half precision on GPU; None leaves the dtype choice to transformers on CPU
    torch_dtype = torch.float16 if has_cuda else None

    # `device_map` decides where the weights live: automatic placement when a
    # GPU is available, otherwise everything on the CPU.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto" if has_cuda else "cpu",
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    # The model has already been placed through `device_map`, so the pipeline
    # is not given a `device` as well: the two are alternative placement
    # mechanisms and should not be combined. The second cell of this file
    # builds its pipeline the same way.
    text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer, trust_remote_code=True)

    print("Model & tokenizer loaded.")
except Exception:
    print("Failed to load model/tokenizer. Error:")
    traceback.print_exc()
    # Bare raise re-raises the active exception with its original traceback
    raise

# 4. Helper: build prompt and generate
def _cell_text(value):
    """Return *value* as prompt text, mapping missing cells (None / NaN) to ""."""
    # NaN is the only float that compares unequal to itself. It is truthy, so a
    # plain `value or ""` lets it through and it would be rendered as "nan".
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def build_prompt_from_row(row):
    """Build the rewrite prompt for one dataset row.

    Args:
        row: Mapping-like object exposing ``.get(key, default)`` (for example a
            pandas row or a dict). The keys read are ``question``, ``answer``,
            ``supporting_facts``, ``graph_path``, ``type``, ``source`` and
            ``prob_yes``; absent keys are treated as empty.

    Returns:
        The prompt text with surrounding whitespace stripped, ending in the
        ``Corrected Question:`` cue.

    Missing values (``None`` or NaN) are rendered as empty text. Legitimate
    falsy values such as a ``prob_yes`` of ``0.0`` are kept.
    """
    q = _cell_text(row.get("question", ""))
    a = _cell_text(row.get("answer", ""))
    sf = _cell_text(row.get("supporting_facts", ""))
    gp = _cell_text(row.get("graph_path", ""))
    tp = _cell_text(row.get("type", ""))
    src = _cell_text(row.get("source", ""))
    prob = _cell_text(row.get("prob_yes", ""))

    prompt = f"""
You are given the following information from a dataset. Use the details to produce a clear, concise, self-contained "Corrected Question" that can be answered independently.

Question: {q}
Answer: {a}
Supporting Facts: {sf}
Graph Path: {gp}
Type: {tp}
Source: {src}
Probability Yes: {prob}

Task: Rewrite the "Corrected Question" so it is clear, self-contained, and refers to the answer and supporting facts properly.
Corrected Question:
"""
    return prompt.strip()

def generate_corrected_question_for_row(row, retry=1):
    """Generate the rewritten question for one row, retrying on failure.

    Args:
        row: Dataset row handed to ``build_prompt_from_row``.
        retry: Number of extra attempts made after a failed one. The function
            sleeps one second before each retry.

    Returns:
        ``(text, "")`` on success, where ``text`` is the first line of the
        model output following the last ``Corrected Question:`` marker, or
        ``("", "<ExceptionName>: <message>")`` once all attempts have failed.
        An empty generation counts as a failure.
    """
    marker = "Corrected Question:"
    prompt = build_prompt_from_row(row)
    attempts_left = retry

    while True:
        try:
            # The pipeline yields a list of dicts; the text sits under "generated_text"
            outputs = text_gen(
                prompt,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=DO_SAMPLE,
                temperature=TEMPERATURE,
            )
            full_text = outputs[0]["generated_text"]

            if marker in full_text:
                # Output may be prompt + continuation: keep what follows the last marker
                candidate = full_text.split(marker)[-1].strip()
            else:
                # No marker: use the whole text, dropping an echoed prompt if present
                candidate = full_text.strip()
                if candidate.startswith(prompt):
                    candidate = candidate[len(prompt):].strip()

            # Keep only the first line so the result stays concise
            first_line = candidate.split("\n")[0].strip()
            if not first_line:
                raise ValueError("Empty generation")
            return first_line, ""
        except Exception as e:
            if attempts_left <= 0:
                return "", f"{type(e).__name__}: {str(e)}"
            time.sleep(1.0)
            attempts_left -= 1

# 5. Walk every row, generate where needed, and checkpoint to disk as we go
total = len(data)
print("Beginning generation for", total, "rows.")
start_idx = 0
processed = 0   # rows handled in this run (successes and failures alike)
last_save = 0   # value of `processed` at the most recent checkpoint

for idx in range(start_idx, total):
    # Rows that already hold non-blank text (e.g. from a resumed run) are left alone
    previous = data.at[idx, "Qwen_Corrected_Question"]
    if isinstance(previous, str) and previous.strip():
        continue

    row = data.iloc[idx]
    try:
        gen, err = generate_corrected_question_for_row(row)
        if not err:
            data.at[idx, "Qwen_Corrected_Question"] = gen
            data.at[idx, "Qwen_Generation_Error"] = ""
            print(f"[{idx+1}/{total}] OK -> {gen}")
        else:
            print(f"[{idx+1}/{total}] ERROR -> {err}")
            data.at[idx, "Qwen_Corrected_Question"] = ""
            data.at[idx, "Qwen_Generation_Error"] = err

    except KeyboardInterrupt:
        # Persist whatever has been produced so far, then let the interrupt propagate
        print("Interrupted by user. Saving progress before exit...")
        data.to_csv(OUTPUT_FILE, index=False)
        raise
    except Exception as e:
        # Anything unexpected is logged and recorded on the row; the loop carries on
        tb = traceback.format_exc()
        print(f"[{idx+1}/{total}] Unexpected error:\n{tb}")
        data.at[idx, "Qwen_Corrected_Question"] = ""
        data.at[idx, "Qwen_Generation_Error"] = f"Unexpected: {type(e).__name__}"

    processed += 1
    # Checkpoint only once SAVE_EVERY rows have been handled since the last one
    if (processed - last_save) < SAVE_EVERY:
        continue
    print(f"...saving progress to {OUTPUT_FILE} (processed {processed} rows)...")
    data.to_csv(OUTPUT_FILE, index=False)
    last_save = processed

# Write the complete result set once the loop has finished
data.to_csv(OUTPUT_FILE, index=False)
print("All done. Results saved to:", OUTPUT_FILE)


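# Cell 2: answer each dataset question with Qwen and mark whether the reference answer appears in the output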


# In [None]:
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging

# Silence transformers log output below the error level
logging.set_verbosity_error()

# -----------------------
# Config
# -----------------------
FINAL_DATASHEET = "finalDataset.csv"          # CSV that is read as input
OUTPUT_FILE = "Qwen_Inference_Results.csv"    # CSV that results are written to
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"     # Hugging Face model identifier
MAX_NEW_TOKENS = 150                          # forwarded to the pipeline as max_new_tokens
DO_SAMPLE = False                             # forwarded to the pipeline as do_sample
TEMPERATURE = 0.0                             # forwarded to the pipeline as temperature
SAVE_EVERY = 20                               # checkpoint after this many processed rows
# -----------------------

# 0. Fail fast when the input CSV is absent
if not os.path.exists(FINAL_DATASHEET):
    raise FileNotFoundError(f"Input file not found: {FINAL_DATASHEET}")

# 1. Read the dataset
data = pd.read_csv(FINAL_DATASHEET)
print("Loaded dataset:", FINAL_DATASHEET, "shape =", data.shape)

# 2. Pick the device: index 0 with CUDA, -1 otherwise
if torch.cuda.is_available():
    device, device_str = 0, "cuda"
else:
    device, device_str = -1, "cpu"
print("Device:", device_str)

# 3. Load tokenizer and model, then wrap them in a text-generation pipeline
print(f"Loading model {MODEL_NAME} ...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
if device == 0:
    # GPU: automatic weight placement, half precision
    _placement, _precision = "auto", torch.float16
else:
    # CPU: everything on the CPU, dtype left to transformers
    _placement, _precision = "cpu", None
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map=_placement,
    torch_dtype=_precision,
    trust_remote_code=True,
    low_cpu_mem_usage=True
)
text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer, trust_remote_code=True)
print("Model loaded.")

# 4. Make sure both result columns exist (the input may already carry them)
for _result_col in ("model_answers", "Correctness"):
    if _result_col not in data.columns:
        data[_result_col] = ""

# 5. Helper: build prompt to answer question
def build_answer_prompt(row):
    """Return the answering prompt for one row.

    The prompt is a fixed three-line instruction, a blank line, and a
    ``Question:`` line filled from ``row['question']``. Surrounding whitespace
    is stripped from the final text.
    """
    instructions = (
        "You are a person from India with deep knowledge and lived experience of Indian culture.\n"
        "Now, answer the following question using your expertise in Indian culture by identifying the specific cultural element being referred to.\n"
        "Respond only with the name of the cultural element (e.g., Indian) — no additional text, questions, or explanations.\n"
    )
    question_line = f"Question: {row['question']}"
    return f"{instructions}\n{question_line}".strip()


# 6. Main loop: answer every unanswered row, score it, and checkpoint as we go
total = len(data)
processed = 0       # rows answered successfully in this run
last_save = 0       # value of `processed` at the most recent checkpoint
correct_count = 0   # rows whose reference answer was found in the model output

for idx in range(total):
    previous = data.at[idx, "model_answers"]
    if isinstance(previous, str) and previous.strip():
        continue  # skip already processed

    row = data.iloc[idx]
    prompt = build_answer_prompt(row)

    try:
        # Generate answer
        out = text_gen(prompt, max_new_tokens=MAX_NEW_TOKENS, do_sample=DO_SAMPLE, temperature=TEMPERATURE)
        gen_text = out[0]["generated_text"].strip()

        # Drop the prompt when the pipeline echoes it back in front of the answer
        if prompt in gen_text:
            gen_text = gen_text.replace(prompt, "").strip()

        data.at[idx, "model_answers"] = gen_text

        # Check correctness: case-insensitive substring match on the reference answer.
        reference = row.get("answer", "")
        # A missing reference (None / NaN, NaN being the only float unequal to
        # itself) must not be matched as the literal text "nan".
        if reference is None or (isinstance(reference, float) and reference != reference):
            correct_answer = ""
        else:
            correct_answer = str(reference).strip().lower()
        # An empty string is a substring of every text, so a blank reference
        # would otherwise be scored as correct; require a non-empty reference.
        is_correct = bool(correct_answer) and correct_answer in gen_text.lower()
        data.at[idx, "Correctness"] = is_correct
        if is_correct:
            correct_count += 1

        processed += 1
        print(f"[{idx+1}/{total}] Answer: {gen_text} | Correct so far: {correct_count}")

    except KeyboardInterrupt:
        # Same behaviour as the first cell: keep what was produced, then stop
        print("Interrupted by user. Saving progress before exit...")
        data.to_csv(OUTPUT_FILE, index=False)
        raise
    except Exception as e:
        # A failed row is recorded as unanswered and incorrect; the loop carries on
        print(f"[{idx+1}/{total}] ERROR -> {e}")
        data.at[idx, "model_answers"] = ""
        data.at[idx, "Correctness"] = False

    # Save periodically
    if (processed - last_save) >= SAVE_EVERY:
        print(f"...saving progress to {OUTPUT_FILE} (processed {processed} rows)")
        data.to_csv(OUTPUT_FILE, index=False)
        last_save = processed

# Final save
data.to_csv(OUTPUT_FILE, index=False)
print("Inference done. Correct answers:", correct_count)
print("Results saved to:", OUTPUT_FILE)
