## Statistic Significance for Rewrites Rating

In [1]:
import pandas as pd
from scipy.stats import friedmanchisquare, wilcoxon
from statsmodels.stats.multitest import multipletests

# Load data
df = pd.read_csv("Rating_rewrites.csv")

# Pivot so each row = Query ID, columns = Original/Rewrite1...Rewrite5
pivot_df = df.pivot(index="Query ID", columns="Query Type", values="Query Score")

# Drop rows with missing values (if any queries are incomplete)
pivot_df = pivot_df.dropna()

# --- 1. Friedman Test (global test across all query types) ---
scores = [pivot_df[col].values for col in pivot_df.columns]
friedman_stat, friedman_p = friedmanchisquare(*scores)
print("Friedman test:")
print(f"  statistic={friedman_stat:.4f}, p-value={friedman_p:.6f}")

# --- 2. Pairwise Wilcoxon tests (Original vs each Rewrite) ---
results = []
for col in pivot_df.columns:
    if col.lower() != "original":  # skip self-comparison
        stat, p = wilcoxon(pivot_df["Original"], pivot_df[col])
        results.append((col, stat, p))

# Correct for multiple testing (Bonferroni or FDR)
cols, stats, pvals = zip(*results)
reject, pvals_corrected, _, _ = multipletests(pvals, method="bonferroni")

# Save pairwise results
pairwise_df = pd.DataFrame({
    "Comparison": [f"Original vs {c}" for c in cols],
    "Wilcoxon_Statistic": stats,
    "Raw_p": pvals,
    "Corrected_p": pvals_corrected,
    "Significant": reject
})
pairwise_df.to_csv("query_score_significance.csv", index=False)

print("\nPairwise Wilcoxon tests (Original vs Rewrites):")
print(pairwise_df)


Friedman test:
  statistic=955.5232, p-value=0.000000

Pairwise Wilcoxon tests (Original vs Rewrites):
              Comparison  Wilcoxon_Statistic         Raw_p   Corrected_p  \
0  Original vs Rewrite 1              9395.5  2.756789e-45  1.378394e-44   
1  Original vs Rewrite 2              7340.0  2.518162e-50  1.259081e-49   
2  Original vs Rewrite 3              1172.0  4.949241e-67  2.474620e-66   
3  Original vs Rewrite 4              6638.0  4.497181e-52  2.248591e-51   
4  Original vs Rewrite 5              2210.0  1.139091e-62  5.695456e-62   

   Significant  
0         True  
1         True  
2         True  
3         True  
4         True  


# Intent Match and Answer Qualty

In [None]:
Intent Match


In [None]:
import pandas as pd
import numpy as np
from scipy.stats import friedmanchisquare, wilcoxon
import itertools
from statsmodels.stats.multitest import multipletests


OUTPUT_FILE = "intent_match_significance.csv"
df = pd.read_csv("evaluated_queries.csv")
# Filter only rewrites
rewrites = df[df["Query Type"].str.contains("Rewrite")]

# Pivot so each row = query, each column = a rewrite’s intent score
pivot = rewrites.pivot(index="Query ID", columns="Query Type", values="Intent Match")

# ---- Friedman Test ----
stat, p = friedmanchisquare(*[pivot[col] for col in pivot.columns])
print(f"Friedman test: statistic={stat:.4f}, p-value={p:.6f}")

# ---- Pairwise Wilcoxon Tests ----
comparisons = list(itertools.combinations(pivot.columns, 2))
results = []
for c1, c2 in comparisons:
    stat, pval = wilcoxon(pivot[c1], pivot[c2])
    results.append([f"{c1} vs {c2}", stat, pval])

# Correct p-values (Holm)
comparisons_df = pd.DataFrame(results, columns=["Comparison","Wilcoxon_Statistic","Raw_p"])
comparisons_df["Corrected_p"] = multipletests(comparisons_df["Raw_p"], method="holm")[1]
comparisons_df["Significant"] = comparisons_df["Corrected_p"] < 0.05

print(comparisons_df)
comparisons_df.to_csv(OUTPUT_FILE, index=False)


# Answer Quality

In [None]:
import pandas as pd
from scipy.stats import friedmanchisquare, wilcoxon
from statsmodels.stats.multitest import multipletests
from itertools import combinations

# --- Load data ---
df = pd.read_csv("evaluated_queries.csv")

# Pivot so each row = Query ID, columns = Original/Rewrite1...Rewrite5
pivot_df = df.pivot(index="Query ID", columns="Query Type", values="Answer Quality")

# Drop rows with missing values (if any queries are incomplete)
pivot_df = pivot_df.dropna()

# --- 1. Friedman Test (global test across all query types) ---
scores = [pivot_df[col].values for col in pivot_df.columns]
friedman_stat, friedman_p = friedmanchisquare(*scores)
print("Friedman test:")
print(f"  statistic={friedman_stat:.4f}, p-value={friedman_p:.6f}")

# --- 2. All Pairwise Wilcoxon tests ---
results = []
for c1, c2 in combinations(pivot_df.columns, 2):
    try:
        stat, p = wilcoxon(pivot_df[c1], pivot_df[c2])
        results.append((f"{c1} vs {c2}", stat, p))
    except ValueError:
        # Happens if the two columns are identical (all differences = 0)
        results.append((f"{c1} vs {c2}", None, 1.0))

# --- Multiple testing correction (Holm is usually better than Bonferroni) ---
comparisons, stats, pvals = zip(*results)
reject, pvals_corrected, _, _ = multipletests(pvals, method="holm")

# --- Save pairwise results ---
pairwise_df = pd.DataFrame({
    "Comparison": comparisons,
    "Wilcoxon_Statistic": stats,
    "Raw_p": pvals,
    "Corrected_p": pvals_corrected,
    "Significant": reject
})

# Round for readability
pairwise_df = pairwise_df.round(6)

# Save to CSV
pairwise_df.to_csv("answer_quality_significance.csv", index=False)

print("\nPairwise Wilcoxon tests (all query type comparisons):")
print(pairwise_df)


In [13]:
import pandas as pd
import ollama
import time
import re
import tqdm
import os

# Load CSV
INPUT_FILE = 'Query_statistics.csv'
OUTPUT_FILE = 'evaluated queries.csv'
OLLAMA_MODEL = 'mistral'
BATCH_SIZE = 300  # Number of rows to process at a time

if os.path.exists(OUTPUT_FILE):
    df = pd.read_csv(OUTPUT_FILE)  # Resume from saved file
    print(f"[INFO] Resuming from {OUTPUT_FILE}")
else:
    df = pd.read_csv(INPUT_FILE)
    # Add new columns for final scores
    df['Query Score'] = None
    print("[INFO] Starting fresh evaluation.")

# Process only rows where Query Type != "Original" and not yet scored
rows_to_process = df[(df['Query Score'].isna())].head(BATCH_SIZE)

if rows_to_process.empty:
    print("[INFO] No more rows to process.")
    exit()


# Define evaluation prompt templates
def build_prompt(row):
    return f"""
You are evaluating a refined query and its answer.

Original User Query: {row['Original Query']}
Refined Query: {row['Refined Query']}
LLM Answer: {row['LLM Answer']}

1. Rate how well the **refined query** matches the intent of the original query (from 1 to 5).
2. Rate how well the **LLM Answer** answers the intent of the user (from 1 to 5).

Return your result in JSON format like:
{{"intent_match": x, "answer_quality": y}}
    """.strip()

# Store results
intent_scores = []
answer_scores = []

for idx in tqdm(rows_to_process.index, total=len(rows_to_process)):
    prompt = build_prompt(row)

    try:
        response = ollama.chat(
            model=OLLAMA_MODEL,
            messages=[
                {"role": "system", "content": "You are an honest and concise evaluator. Return only JSON."},
                {"role": "user", "content": prompt}
            ]
        )
        content = response["message"]["content"].strip()
        print(f"\nRow {idx} response:\n{content}\n")

        # Extract numbers using regex
        intent_match = re.search(r'"intent_match"\s*:\s*(\d)', content)
        answer_quality = re.search(r'"answer_quality"\s*:\s*(\d)', content)

        # Save directly into dataframe
        df.loc[idx, "Intent Match (1-5)"] = float(intent_match.group(1)) if intent_match else None
        df.loc[idx, "Answer Quality (1-5)"] = float(answer_quality.group(1)) if answer_quality else None

    except Exception as e:
        print(f"[ERROR] Row {idx}: {e}")
        df.loc[idx, "Intent Match (1-5)"] = None
        df.loc[idx, "Answer Quality (1-5)"] = None

    time.sleep(DELAY)

# ==========================
# SAVE PROGRESS
# ==========================
df.to_csv(OUTPUT_FILE, index=False)
print(f"[INFO] Evaluation complete. Progress saved to {OUTPUT_FILE}")


[INFO] Starting fresh evaluation.


TypeError: 'module' object is not callable

In [12]:
!pip install tqdm




In [5]:
import pandas as pd
import ollama
import json
import time

# ========================
# CONFIGURATION
# ========================
INPUT_FILE = "Query_statistics.csv"
OUTPUT_FILE = "evaluated_queries.csv"
EVALUATOR_MODEL = "mistral"   # <- use different model from generator
BATCH_SIZE = 300
DELAY = 2  # seconds between batches, to avoid overloading

# ========================
# PROMPT TEMPLATE
# ========================
def build_batch_prompt(batch):
    """Builds a single evaluation prompt for multiple queries in one batch"""
    prompt = """
You are an impartial evaluator. 
Rate refined queries and answers according to the following scale:

- 1 = very poor
- 2 = poor
- 3 = fair
- 4 = good
- 5 = excellent

For each case, provide two ratings:
1. intent_match → how well the refined query matches the intent of the original
2. answer_quality → how well the LLM answer addresses the user’s intent

⚠️ Return ONLY valid JSON (list of objects). 
Do not add explanations, comments, or text outside JSON.
""".strip()

    cases = []
    for idx, row in batch.iterrows():
        cases.append({
            "id": int(idx),
            "original_query": row["Original Query"],
            "refined_query": row["Refined Query"],
            "llm_answer": row["LLM Answer"]
        })

    prompt += f"\n\nEvaluate these cases:\n{json.dumps(cases, indent=2)}\n\n"
    prompt += """Return your answer as a JSON list, where each element corresponds to a case:
[
  {"id": <row_id>, "intent_match": <1-5>, "answer_quality": <1-5>},
  ...
]
"""
    return prompt


# ========================
# MAIN LOOP
# ========================
def main():
    df = pd.read_csv(INPUT_FILE)
    results = []

    for start in range(0, len(df), BATCH_SIZE):
        batch = df.iloc[start:start + BATCH_SIZE]
        print(f"Processing batch {start} – {start + len(batch) - 1}")

        prompt = build_batch_prompt(batch)

        try:
            response = ollama.chat(
                model=EVALUATOR_MODEL,
                messages=[
                    {"role": "system", "content": "You are a strict JSON-only evaluator."},
                    {"role": "user", "content": prompt}
                ]
            )

            content = response["message"]["content"].strip()
            # Try parsing JSON safely
            parsed = None
            try:
                parsed = json.loads(content)
            except json.JSONDecodeError:
                # Sometimes models wrap output in ```json ... ```
                if "```" in content:
                    cleaned = content.split("```")[1]
                    cleaned = cleaned.replace("json", "").strip()
                    parsed = json.loads(cleaned)

            if parsed is None or not isinstance(parsed, list):
                raise ValueError("Evaluator did not return a JSON list")

            results.extend(parsed)

        except Exception as e:
            print(f"[ERROR] Failed batch {start}: {e}")
            # Fill missing rows with None
            for idx in batch.index:
                results.append({"id": int(idx), "intent_match": None, "answer_quality": None})

        time.sleep(DELAY)

    # Merge results back into dataframe
    res_df = pd.DataFrame(results).set_index("id")
    df = df.join(res_df, how="left")
    df.to_csv(OUTPUT_FILE, index=False)

    print(f"✅ Evaluation complete. Saved as {OUTPUT_FILE}")


if __name__ == "__main__":
    main()


Processing batch 0 – 299
Processing batch 300 – 599
Processing batch 600 – 899
Processing batch 900 – 1199
Processing batch 1200 – 1499
Processing batch 1500 – 1799
Processing batch 1800 – 2099
Processing batch 2100 – 2399
Processing batch 2400 – 2699
Processing batch 2700 – 2999
Processing batch 3000 – 3299
Processing batch 3300 – 3599
Processing batch 3600 – 3899
Processing batch 3900 – 4199
Processing batch 4200 – 4499
Processing batch 4500 – 4799
Processing batch 4800 – 5099
Processing batch 5100 – 5399
Processing batch 5400 – 5699
Processing batch 5700 – 5999
Processing batch 6000 – 6035
✅ Evaluation complete. Saved as evaluated_queries.csv
