In [23]:
import dotenv
import os
import json
import re
from datetime import datetime
from google import genai
from google.genai import types
import base64
import pathlib

# Load environment variables from the local .env file before reading any of them.
dotenv.load_dotenv(".env")

# API key handed to the Gemini client; os.getenv returns None when GOOGLE_API_KEY is unset.
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")
# Multiplier that calculate_cost applies to the USD cost to produce the LKR cost.
EXCHANGE_RATE = 310.13
# NOTE(review): not referenced in the visible cells — presumably a directory for label files; confirm.
LABELS_DIR = "labels"

In [24]:
# Shared Gemini client used by call_gemini; pinned to the v1alpha API version and authenticated with GEMINI_API_KEY.
client = genai.Client(http_options={'api_version': 'v1alpha'}, api_key=GEMINI_API_KEY)

In [25]:
def calculate_cost(response, exchange_rate):
    model = response.model_version
    match model:
        case 'gemini-3-flash-preview':
            prompt_tokens = response.usage_metadata.prompt_token_count
            candidates_tokens = response.usage_metadata.candidates_token_count
            thoughts_token_count = response.usage_metadata.thoughts_token_count

            IMPUT_PRICE = 0.5 / 1000000
            OUTPUT_PRICE = 3 / 1000000

            if thoughts_token_count:
                total_cost = (prompt_tokens) * IMPUT_PRICE + (candidates_tokens + thoughts_token_count) * OUTPUT_PRICE
            else:   
                total_cost = (prompt_tokens) * IMPUT_PRICE + (candidates_tokens) * OUTPUT_PRICE

            return {"model_id": model, "cost_USD": total_cost, "cost_LKR": total_cost * exchange_rate}
        case _:
            raise ValueError(f"Please update the cost calculation for this model: {model}")

In [26]:
def call_gemini(
        pdf_path,
        model,
        thinking_level: types.ThinkingLevel,
        temperature: float,
        top_p: float,
        media_resolution:types.MediaResolution,
        system_instruction: str,
        prompt: str
        ):
    """Send one prompt (optionally with an inline PDF) to Gemini and return the raw response.

    Args:
        pdf_path: Path of a PDF to attach inline, or ``None`` for a text-only request.
        model: Model id passed to ``generate_content``.
        thinking_level: Value forwarded into ``types.ThinkingConfig``.
        temperature: Sampling temperature forwarded to the generation config.
        top_p: Nucleus-sampling value forwarded to the generation config.
        media_resolution: Resolution level attached to the PDF part; ignored when
            ``pdf_path`` is ``None``.
        system_instruction: System prompt for the request.
        prompt: User prompt text; always sent as the first part.

    Returns:
        The object returned by ``client.models.generate_content``.
    """
    # The text prompt always goes first; the PDF, when given, follows it.
    request_parts = [types.Part(text=prompt)]
    if pdf_path is not None:
        pdf_bytes = pathlib.Path(pdf_path).read_bytes()
        request_parts.append(
            types.Part(
                inline_data=types.Blob(
                    mime_type="application/pdf",
                    data=pdf_bytes,
                ),
                media_resolution={"level": media_resolution}
            )
        )

    generation_config = types.GenerateContentConfig(
        systemInstruction=system_instruction,
        thinking_config=types.ThinkingConfig(thinking_level=thinking_level),
        temperature=temperature,
        top_p=top_p
    )

    # Uses the module-level `client`.
    return client.models.generate_content(
        model=model,
        config=generation_config,
        contents=[types.Content(parts=request_parts)]
    )

In [27]:
# System prompt for the extraction call: frames the model as an annotator completing the answer JSON.
# NOTE(review): SYSTEM_INSTRUCTION is rebound further down for the refinement call, so re-running the
# extraction cell after that point silently uses the wrong system prompt.
SYSTEM_INSTRUCTION = "You are a experienced annotater who can identify student answers for a student's paper and complete the given JSON."

In [28]:
# Extraction prompt: instructions plus the JSON template the model must fill in.
# In this schema "is_illigible" is "true" when the answer IS readable, "false" when it is not,
# and "" when the answer is empty (see instruction 2 below).
# NOTE(review): test "9" is not zero-padded unlike "01"-"08", test "10" has no sub-answer "1",
# and the last entry is an empty placeholder — left as-is; confirm against the ground-truth files.
PROMPT = """
1. Read the uploaded document and extract the answers and reshape them according to the following structure.
2. For is_illigible use "true" if the answer is readable and not empty, otherwise "false". If the answer is empty, leave it blank: "is_illigible": ""

Here is the structure that you should complete:
```
{
    "paper_title": "",
    "questions": [
        {
            "test_number": "01",
            "instruction": "Fill in the blanks using the letters a-g.",
            "question_type": "Fill in the blanks",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "02",
            "instruction": "Choose the correct answer from the given two answers for each question.",
            "question_type": "Fill in the blanks",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "03",
            "instruction": "Fill in the blanks using appropriate nouns.",
            "question_type": "Fill in the blanks",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                },
                "7": {
                    "answer": "",
                    "is_illigible": ""
                },
                "8": {
                    "answer": "",
                    "is_illigible": ""
                },
                "9": {
                    "answer": "",
                    "is_illigible": ""
                },
                "10": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "04",
            "instruction": "Write correct letter (a-f) in the blank provided",
            "question_type": "Fill in the blanks",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "05",
            "instruction": "complete the dialogue using the given event poster.",
            "question_type": "Fill in the blanks",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "06",
            "instruction": "write a note",
            "question_type": "Essay",
            "student_answers": ""
        },
        {
            "test_number": "07",
            "instruction": "Read the given text and answer the questions",
            "question_type": "Question Answering",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "08",
            "instruction": "write one of the following topics",
            "question_type": "essay",
            "student_answers": {
                "answer": "",
                "is_illigible": ""
            }
        },
        {
            "test_number": "9",
            "instruction": "Identify the correct answer for each question.",
            "question_type": "Underline",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "10",
            "instruction": "write reported speech",
            "question_type": "essay",
            "student_answers": {
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "11",
            "instruction": "Fill in the blanks using the given words",
            "question_type": "Fill in the blanks",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                },
                "6": {
                    "answer": "",
                    "is_illigible": ""
                },
                "7": {
                    "answer": "",
                    "is_illigible": ""
                },
                "8": {
                    "answer": "",
                    "is_illigible": ""
                },
                "9": {
                    "answer": "",
                    "is_illigible": ""
                },
                "10": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "12",
            "instruction": "Answer the given questions (for questions (1) need underlining put the full answer underlined)",
            "question_type": "Question Answering",
            "student_answers": {
                "1": {
                    "answer": "",
                    "is_illigible": ""
                },
                "2": {
                    "answer": "",
                    "is_illigible": ""
                },
                "3": {
                    "answer": "",
                    "is_illigible": ""
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "a": {
                        "answer": "",
                        "is_illigible": ""
                    },
                    "b": {
                        "answer": "",
                        "is_illigible": ""
                    }
                }
            }
        },
        {
            "test_number": "13",
            "instruction": "Answer the questions (for questions (1) i) and ii) need underlining put the answer letter only)",
            "question_type": "Question Answering",
            "student_answers": {
                "1": {
                    "i": {
                        "answer": "",
                        "is_illigible": ""
                    },
                    "ii": {
                        "answer": "",
                        "is_illigible": ""
                    }
                },
                "2": {
                    "i": {
                        "answer": "",
                        "is_illigible": ""
                    },
                    "ii": {
                        "answer": "",
                        "is_illigible": ""
                    }
                },
                "3": {
                    "i": {
                        "answer": "",
                        "is_illigible": ""
                    },
                    "ii": {
                        "answer": "",
                        "is_illigible": ""
                    }
                },
                "4": {
                    "answer": "",
                    "is_illigible": ""
                },
                "5": {
                    "answer": "",
                    "is_illigible": ""
                }
            }
        },
        {
            "test_number": "",
            "instruction": "",
            "question_type": "",
            "student_answers": {
                "answer": "",
                "is_illigible": ""
            }
        }
    ]
}
```

Output must be only the completed JSON object, wrapped in a ```json code fence.
"""

In [29]:
# Generation settings forwarded to call_gemini for the extraction run.
# calculate_cost only has pricing for responses that report this model id as their model_version.
model = "gemini-3-flash-preview"
thinking_level = types.ThinkingLevel.MEDIUM
temperature = 1
top_p = 0.95
# Attached to the inline PDF part inside call_gemini.
media_resolution = types.MediaResolution.MEDIA_RESOLUTION_HIGH

In [30]:
# Run the extraction on one scanned paper; the response is parsed into `pred` in a later cell.
# NOTE(review): the PDF path is hard-coded here and the matching ground-truth path is hard-coded
# separately where `gt` is loaded — keep the two in sync when switching papers.
call_gemini_response = call_gemini(
    pdf_path="./data/set 1/4.pdf", 
    model=model, 
    thinking_level=thinking_level, 
    temperature=temperature, 
    top_p=top_p, 
    media_resolution=media_resolution, 
    system_instruction=SYSTEM_INSTRUCTION, 
    prompt=PROMPT
)
    
# Cost of the call above as a dict with model_id / cost_USD / cost_LKR.
# NOTE(review): `data` is a vague name and is not read by any visible cell.
data = calculate_cost(
    response=call_gemini_response,
    exchange_rate=EXCHANGE_RATE
)

In [31]:
# Parse the model reply into `pred`. The reply may or may not be wrapped in a ```json fence
# (the prompt asks for the bare JSON object), so fall back to the raw text when no fence is
# found instead of crashing on `match.group`.
response_text = call_gemini_response.text or ""
match = re.search(r'```json\s*(.*?)\s*```', response_text, re.DOTALL)
json_text = match.group(1) if match else response_text.strip()
pred = json.loads(json_text)

In [32]:
# Ground-truth annotation for the same paper that was sent to the model.
with open("./data/set 1/4.json", "r", encoding='utf-8') as gt_file:
    gt = json.load(gt_file)

In [33]:
from difflib import SequenceMatcher

def word_diff(gt, pred):
    """Return the word-level differences between two texts, ignoring case.

    Both texts are split on whitespace and aligned on their lower-cased words.
    Each non-matching segment is reported as ``(tag, gt_words, pred_words)``,
    where ``tag`` is 'replace', 'delete' or 'insert' and the word lists keep
    the original casing.
    """
    gt_tokens = gt.split()
    pred_tokens = pred.split()

    opcodes = SequenceMatcher(
        None,
        [token.lower() for token in gt_tokens],
        [token.lower() for token in pred_tokens],
    ).get_opcodes()

    # 'equal' segments are matches; everything else is a difference.
    return [
        (op, gt_tokens[gt_lo:gt_hi], pred_tokens[pred_lo:pred_hi])
        for op, gt_lo, gt_hi, pred_lo, pred_hi in opcodes
        if op != 'equal'
    ]
    

In [34]:
def iterate_answers(gt_ans, pred_ans):
    """Walk the ground-truth answer tree and yield (gt_leaf, pred_counterpart) pairs.

    A leaf is any ground-truth dict containing an "answer" key; it is yielded
    with whatever sits at the same position in the prediction. Branches whose
    key is missing from the prediction (or whose prediction is not a dict) are
    skipped.
    """
    if not isinstance(gt_ans, dict):
        return

    if "answer" in gt_ans:
        yield gt_ans, pred_ans
        return

    if not isinstance(pred_ans, dict):
        return

    for key, gt_child in gt_ans.items():
        if key in pred_ans:
            yield from iterate_answers(gt_child, pred_ans[key])


def _as_text(value):
    """Return ``value`` as a string, mapping ``None`` to the empty string."""
    return "" if value is None else str(value)


def _as_answer_dict(pred_leaf):
    """Coerce a predicted leaf into a dict so ``.get`` lookups cannot fail."""
    if isinstance(pred_leaf, dict):
        return pred_leaf
    if isinstance(pred_leaf, str):
        return {"answer": pred_leaf}
    return {}


def _record_word_errors(gt_text, pred_text, replaced_word_pairs, inserted_words):
    """Diff two answers word-by-word, recording replace/insert segments in place.

    Returns ``(error_segments, gt_word_count)``. Deleted segments are not counted.
    """
    error_segments = 0
    for tag, gt_words, pred_words in word_diff(gt_text, pred_text):
        if tag == "replace":
            replaced_word_pairs.append((gt_words, pred_words))
            error_segments += 1
        elif tag == "insert":
            inserted_words.append(pred_words)
            error_segments += 1
    return error_segments, len(gt_text.split())


def _print_hallucination_report(summary, max_items=20):
    """Print the human-readable report for a summary dict built by calculate_hallucinations."""
    replaced_word_pairs = summary["replaced_word_pairs"]
    inserted_words = summary["inserted_words"]

    print("=" * 60)
    print("HALLUCINATION ANALYSIS REPORT")
    print("=" * 60)

    print(f"\n1. Fabricated hallucinations: {summary['fabricated_hallucinations']}")
    print("   (Reading text where none exists)")

    print(f"\n2. Crossed-out text hallucinations: {summary['crossed_out_hallucinations']}")
    print("   (Reading text that was crossed out)")

    print(f"\n3. Illegibility hallucinations: {summary['illegibility_hallucinations']}")
    print("   (Claiming readable when actually illegible)")

    print(f"\n4. Word-level hallucination rate: {summary['word_level_hallucination_rate']:.2%}")
    print(f"   ({summary['total_hallucinated_words']}/{summary['total_gt_words']} words)")

    print("\n" + "=" * 60)
    print("DETAILED WORD-LEVEL ERRORS")
    print("=" * 60)

    print("\n--- Replaced Words ---")
    for gtw, prw in replaced_word_pairs[:max_items]:
        print(f"  {gtw}  →  {prw}")
    if len(replaced_word_pairs) > max_items:
        print(f"  ... and {len(replaced_word_pairs) - max_items} more")

    print("\n--- Inserted (Hallucinated) Words ---")
    for w in inserted_words[:max_items]:
        print(f"  {w}")
    if len(inserted_words) > max_items:
        print(f"  ... and {len(inserted_words) - max_items} more")


def calculate_hallucinations(gt, pred, verbose=True):
    """
    Calculate various types of hallucinations:
    1. Fabricated hallucinations (reading text where none exists)
    2. Word-level hallucinations (replacements and insertions)
    3. Crossed-out text hallucinations (reading crossed-out text)
    4. Illegibility hallucinations (claiming readable when illegible)

    Args:
        gt: Ground-truth paper dict with a "questions" list; every question has
            "test_number" and "student_answers".
        pred: Predicted paper dict with the same layout. Questions are matched
            by "test_number"; questions missing from ``pred`` are skipped.
        verbose: When True (default) the report is also printed.

    Returns:
        dict with the counters, the word-level rate and the lists of replaced
        word pairs and inserted words.

    Notes:
        ``total_hallucinated_words`` counts diff *segments* (one per replace or
        insert opcode), while ``total_gt_words`` counts individual words.
        NOTE(review): a multi-word segment therefore counts once — confirm this
        is the intended definition of the rate.
    """
    fabricated_hallucinations = 0
    crossed_out_hallucinations = 0
    illegibility_hallucinations = 0

    total_gt_words = 0
    total_hallucinated_words = 0

    replaced_word_pairs = []
    inserted_words = []

    # Index both papers by test number so questions can be paired up.
    gt_questions = {q["test_number"]: q for q in gt["questions"]}
    pred_questions = {q["test_number"]: q for q in pred["questions"]}

    for tnum, gtq in gt_questions.items():
        predq = pred_questions.get(tnum)
        if predq is None:
            continue

        gt_ans = gtq["student_answers"]
        pred_ans = predq.get("student_answers", "")

        # -------- Essay level hallucination --------
        if isinstance(gt_ans, str):
            # The model sometimes may return {"answer": ...} or null where the ground
            # truth holds a plain string; unwrap so both sides are comparable.
            if isinstance(pred_ans, dict) and "answer" in pred_ans:
                pred_ans = _as_text(pred_ans["answer"])
            elif pred_ans is None:
                pred_ans = ""

            # Fabricated hallucination: AI reads text where there is none
            if gt_ans == "" and pred_ans != "":
                fabricated_hallucinations += 1

            # Word-level hallucination for essays
            if isinstance(pred_ans, str) and gt_ans.strip() != "":
                errors, words = _record_word_errors(
                    gt_ans, pred_ans, replaced_word_pairs, inserted_words
                )
                total_hallucinated_words += errors
                total_gt_words += words

            continue

        # -------- Structured QA hallucination --------
        for gtqa, raw_predqa in iterate_answers(gt_ans, pred_ans):
            # A predicted leaf can be a bare string or null instead of a dict.
            predqa = _as_answer_dict(raw_predqa)
            gt_answer = _as_text(gtqa["answer"])
            pred_answer = _as_text(predqa.get("answer", ""))

            # 1. Fabricated hallucination
            if gt_answer == "" and pred_answer != "":
                fabricated_hallucinations += 1

            # 2. Crossed-out text hallucination: a crossed-out ground-truth word
            #    appears (as a substring) in the predicted answer.
            crossed_out_text = gtqa.get("crossedout_text")
            if crossed_out_text and pred_answer:
                # presumably a list of words; a lone string would otherwise be
                # iterated character by character — TODO confirm the schema
                if isinstance(crossed_out_text, str):
                    crossed_out_text = [crossed_out_text]
                pred_answer_lower = pred_answer.lower()
                for crossed_word in crossed_out_text:
                    if crossed_word.lower() in pred_answer_lower:
                        crossed_out_hallucinations += 1

            # 3. Illegibility hallucination
            # Flags are normalised through str() so JSON booleans/null do not crash .lower().
            gt_illegible = _as_text(gtqa.get("is_illigible", "")).strip().lower()
            pred_illegible = _as_text(predqa.get("is_illigible", "")).strip().lower()

            # Ground truth says NOT readable ("false" or "") and has no answer,
            # but the prediction claims readable ("true") or provides text.
            if gt_illegible in ["false", ""] and gt_answer == "":
                if pred_illegible == "true" or pred_answer != "":
                    illegibility_hallucinations += 1

            # 4. Word-level hallucination (for readable text)
            if gt_answer != "" and pred_answer != "":
                errors, words = _record_word_errors(
                    gt_answer, pred_answer, replaced_word_pairs, inserted_words
                )
                total_hallucinated_words += errors
                total_gt_words += words

    # -------- Calculate rates --------
    hallucination_rate = (
        total_hallucinated_words / total_gt_words
        if total_gt_words > 0 else 0
    )

    summary = {
        "fabricated_hallucinations": fabricated_hallucinations,
        "crossed_out_hallucinations": crossed_out_hallucinations,
        "illegibility_hallucinations": illegibility_hallucinations,
        "word_level_hallucination_rate": hallucination_rate,
        "total_hallucinated_words": total_hallucinated_words,
        "total_gt_words": total_gt_words,
        "replaced_word_pairs": replaced_word_pairs,
        "inserted_words": inserted_words
    }

    if verbose:
        _print_hallucination_report(summary)

    return summary


# Score the parsed prediction against the ground truth; this call also prints the report below.
results = calculate_hallucinations(gt, pred)

HALLUCINATION ANALYSIS REPORT

1. Fabricated hallucinations: 1
   (Reading text where none exists)

2. Crossed-out text hallucinations: 0
   (Reading text that was crossed out)

3. Illegibility hallucinations: 1
   (Claiming readable when actually illegible)

4. Word-level hallucination rate: 15.66%
   (13/83 words)

DETAILED WORD-LEVEL ERRORS

--- Replaced Words ---
  ['e']  →  ['g']
  ['g']  →  ['e']
  ['c']  →  ['f']
  ['f']  →  ['c']
  ['rrings']  →  ['rings']
  ['b']  →  ['in', 'a', 'classroom']
  ['ike', 'most.I']  →  ['like', 'most.', 'I']
  ['members']  →  ['members.']
  ['lovelies.']  →  ['lovelies']
  ['mour']  →  ['in', 'our']
  ['hearest,']  →  ['nearest,']
  ['bikth.']  →  ['birth.']

--- Inserted (Hallucinated) Words ---
  ['was']


In [35]:
# Display the raw metrics dict returned by calculate_hallucinations.
results

{'fabricated_hallucinations': 1,
 'crossed_out_hallucinations': 0,
 'illegibility_hallucinations': 1,
 'word_level_hallucination_rate': 0.1566265060240964,
 'total_hallucinated_words': 13,
 'total_gt_words': 83,
 'replaced_word_pairs': [(['e'], ['g']),
  (['g'], ['e']),
  (['c'], ['f']),
  (['f'], ['c']),
  (['rrings'], ['rings']),
  (['b'], ['in', 'a', 'classroom']),
  (['ike', 'most.I'], ['like', 'most.', 'I']),
  (['members'], ['members.']),
  (['lovelies.'], ['lovelies']),
  (['mour'], ['in', 'our']),
  (['hearest,'], ['nearest,']),
  (['bikth.'], ['birth.'])],
 'inserted_words': [['was']]}

In [36]:
# System prompt for the report-refinement call.
# NOTE(review): this overwrites the extraction-stage SYSTEM_INSTRUCTION defined earlier, so the
# extraction cell cannot be re-run correctly after this point without re-running that definition.
SYSTEM_INSTRUCTION = "You are very good at detecting hallucinations in student's answers."

In [37]:
# Refinement prompt: asks the model to re-judge the programmatically computed word-level errors.
# The report is embedded as real JSON (not a Python repr with tuples and single quotes) because
# the reply is requested in JSON and parsed with json.loads.
# NOTE(review): this overwrites the extraction-stage PROMPT defined earlier in the notebook.
PROMPT = f"""
Below is a hallucination report. Its word-level errors were calculated programmatically, so some differences that do not seem to be hallucinations were still counted as word-level errors, which inflates the word-level hallucination rate. I need you to correct the word-level hallucination rate and the DETAILED WORD-LEVEL ERRORS ("total_hallucinated_words", "replaced_word_pairs" and "inserted_words").

REPORT:
```
{json.dumps(results, indent=2, ensure_ascii=False)}
```

The output should be the same as the given REPORT but with the corrected word-level hallucination rate and DETAILED WORD-LEVEL ERRORS.
Only output the corrected report in JSON format, wrapped in a ```json code fence, and nothing else.
"""

In [38]:
# Text-only refinement call: with pdf_path=None call_gemini sends just the prompt, so the
# media_resolution argument is passed but not used.
call_gemini_response_refine = call_gemini(
    pdf_path=None, 
    model=model, 
    thinking_level=types.ThinkingLevel.MEDIUM, 
    temperature=1, 
    top_p=0.95, 
    media_resolution=media_resolution, 
    system_instruction=SYSTEM_INSTRUCTION, 
    prompt=PROMPT
)

In [39]:
# Show the raw model reply (including any markdown fence) before it is parsed.
print(call_gemini_response_refine.text)

```json
{
  "fabricated_hallucinations": 1,
  "crossed_out_hallucinations": 0,
  "illegibility_hallucinations": 1,
  "word_level_hallucination_rate": 0.024096385542168676,
  "total_hallucinated_words": 2,
  "total_gt_words": 83,
  "replaced_word_pairs": [
    [
      ["b"],
      ["in", "a", "classroom"]
    ]
  ],
  "inserted_words": [
    ["was"]
  ]
}
```


In [40]:
# Parse the refined report. Accept both a ```json-fenced reply and bare JSON so a fence-less
# reply does not crash on `match_refine.group`.
refine_text = call_gemini_response_refine.text or ""
match_refine = re.search(r'```json\s*(.*?)\s*```', refine_text, re.DOTALL)
json_text_refine = match_refine.group(1) if match_refine else refine_text.strip()
result_refined = json.loads(json_text_refine)

In [41]:
# Display the model-corrected report parsed from the refinement reply.
result_refined

{'fabricated_hallucinations': 1,
 'crossed_out_hallucinations': 0,
 'illegibility_hallucinations': 1,
 'word_level_hallucination_rate': 0.024096385542168676,
 'total_hallucinated_words': 2,
 'total_gt_words': 83,
 'replaced_word_pairs': [[['b'], ['in', 'a', 'classroom']]],
 'inserted_words': [['was']]}

In [42]:
# Inspect the parsed extraction result (dumps the full nested structure).
pred

{'paper_title': 'Grade 10 - English - First Term Evaluation - 2018 - Department of Education - Western Province',
 'questions': [{'test_number': '01',
   'instruction': 'Fill in the blanks using the letters a-g.',
   'question_type': 'Fill in the blanks',
   'student_answers': {'1': {'answer': 'd', 'is_illigible': 'true'},
    '2': {'answer': 'g', 'is_illigible': 'true'},
    '3': {'answer': 'c', 'is_illigible': 'true'},
    '4': {'answer': 'e', 'is_illigible': 'true'},
    '5': {'answer': 'a', 'is_illigible': 'true'},
    '6': {'answer': 'f', 'is_illigible': 'true'}}},
  {'test_number': '02',
   'instruction': 'Choose the correct answer from the given two answers for each question.',
   'question_type': 'Fill in the blanks',
   'student_answers': {'1': {'answer': "didn't", 'is_illigible': 'true'},
    '2': {'answer': 'was', 'is_illigible': 'true'},
    '3': {'answer': 'was suffering', 'is_illigible': 'true'},
    '4': {'answer': 'is', 'is_illigible': 'true'},
    '5': {'answer': 'asks