In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load required libraries
from transformers import AutoModelForCausalLM, AutoTokenizer
import json
import time
import numpy as np
import torch

In [3]:
# Load model and tokenizer
model_path = "/content/drive/Shareddrives/517 nlp project/Models/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Ensure the tokenizer has a pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

In [4]:
def generate_qa_prompt(question, context):
    prompt = f"""<s>[INST] You are a helpful AI assistant. Read the following context and answer the question accurately and concisely.

Context:
{context}

Question: {question}

Answer the question based only on the context provided. [/INST]"""
    return prompt

def normalize_answer(answer):
    """Normalize answer string for comparison"""
    if answer is None:
        return ""
    # Remove punctuation and convert to lowercase
    return str(answer).lower().strip().replace('.', '').replace(',', '')

In [9]:
# Load evaluation dataset
dataset_path = "/content/drive/Shareddrives/517 nlp project/data/2WikiMultihopQA/test.json"
with open(dataset_path, 'r') as f:
    eval_data = json.load(f)

# Limit to 300 samples
eval_data = eval_data[:300]

print(f"Total evaluation examples: {len(eval_data)}")

Total evaluation examples: 300


In [10]:
# Run evaluation
correct_answers = 0
total_questions = len(eval_data)
start_time = time.time()

# ANSI color codes for output formatting
BLUE = '\033[94m'
GREEN = '\033[92m'
RED = '\033[91m'
YELLOW = '\033[93m'
CYAN = '\033[96m'
PURPLE = '\033[95m'
ENDC = '\033[0m'

results = []

for idx, item in enumerate(eval_data):
    question = item['question']
    context = item['original_context']

    prompt = generate_qa_prompt(question, context)

    # Generate response
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024).to("cuda")
    with torch.no_grad():
        output = model.generate(
            inputs.input_ids,
            max_new_tokens=256,
            temperature=0.3,
            do_sample=True,
            num_beams=4,
            top_p=0.9,
            repetition_penalty=1.2
        )
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract model's answer (everything after the instruction)
    model_answer = response.split("[/INST]")[-1].strip()
    normalized_model_answer = normalize_answer(model_answer)

    # Get correct answer
    correct_answer = item.get('answer', '')
    normalized_correct_answer = normalize_answer(correct_answer)

    # Check if answer is correct
    is_correct = normalized_model_answer == normalized_correct_answer
    if is_correct:
        correct_answers += 1

    # Store result
    results.append({
        'question': question,
        'context': context,
        'correct_answer': correct_answer,
        'model_answer': model_answer,
        'is_correct': is_correct
    })

    # Print progress and statistics
    questions_done = idx + 1
    questions_left = total_questions - questions_done
    elapsed_time = time.time() - start_time
    avg_time_per_question = elapsed_time / questions_done
    estimated_time_left = questions_left * avg_time_per_question
    current_accuracy = (correct_answers / questions_done) * 100

    print(f"\n{CYAN}Progress: {questions_done}/{total_questions} questions{ENDC}")
    print(f"{CYAN}Current accuracy: {current_accuracy:.1f}%{ENDC}")
    print(f"{CYAN}Average time per question: {avg_time_per_question:.1f} seconds{ENDC}")
    print(f"{CYAN}Estimated time remaining: {estimated_time_left/60:.1f} minutes{ENDC}\n")

    correct_color = GREEN if is_correct else RED
    print(f"\n{YELLOW}Question {idx}:{ENDC}")
    print(f"{BLUE}Context (truncated):{ENDC}")
    print(f"{BLUE}{context[:200]}...{ENDC}")
    print(f"{BLUE}Question: {question}{ENDC}")
    print(f"{YELLOW}Model answer:{ENDC}")
    print(f"{PURPLE}{model_answer}{ENDC}")
    print(f"{YELLOW}Correct answer:{ENDC}")
    print(f"{PURPLE}{correct_answer}{ENDC}")
    print(f"{correct_color}Correct: {is_correct}{ENDC}")

    # Save intermediate results every 10 questions
    if questions_done % 10 == 0:
        with open('evaluation_results_2wiki_intermediate.json', 'w') as f:
            json.dump(results, f, indent=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 1/300 questions[0m
[96mCurrent accuracy: 100.0%[0m
[96mAverage time per question: 1.3 seconds[0m
[96mEstimated time remaining: 6.5 minutes[0m


[93mQuestion 0:[0m
[94mContext (truncated):[0m
[94mMichael GovanMichael Govan( born 1963) is the director of the Los Angeles County Museum of Art since 2006. Prior to this, Govan worked as the director of the Dia Art Foundation in New York City.
John ...[0m
[94mQuestion: Michael GovanMichael Govan( born 1963) is the director of the Los Angeles County Museum of Art since 2006. Prior to this, Govan worked as the director of the Dia Art Foundation in New York City.
John DonatichJohn Donatich is the Director of Yale University Press.
Peter LevinPeter Levin is an American director of film, television and theatre.
Ian Barry (director)Ian Barry is an Australian director of film and TV.
John Farrell (businessman)John Farrell is the director of YouTube in Latin America.
One Law for the WomanOne Law for the Woman is a 1924 A

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 2/300 questions[0m
[96mCurrent accuracy: 50.0%[0m
[96mAverage time per question: 3.1 seconds[0m
[96mEstimated time remaining: 15.2 minutes[0m


[93mQuestion 1:[0m
[94mContext (truncated):[0m
[94mRalph MurphyRalph Murphy( May 1, 1895 – February 10, 1967) was an American film director. Born in Rockville, Connecticut, Murphy was active in films from 1931 through 1962, with some work in televisio...[0m
[94mQuestion: Ralph MurphyRalph Murphy( May 1, 1895 – February 10, 1967) was an American film director. Born in Rockville, Connecticut, Murphy was active in films from 1931 through 1962, with some work in television. From 1941 – 44 he was married to Gloria Dickson, whom he directed in" I Want a Divorce". His films include:
W. Augustus BarrattW. Augustus Barratt( 1873- 1947) was a Scottish- born, later American, songwriter and musician.
Claude WeiszClaude Weisz is a French film director born in Paris.
Jacques DécombeJacques Décombe is a French author, actor and d

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 3/300 questions[0m
[96mCurrent accuracy: 33.3%[0m
[96mAverage time per question: 4.9 seconds[0m
[96mEstimated time remaining: 24.1 minutes[0m


[93mQuestion 2:[0m
[94mContext (truncated):[0m
[94mHenry Erskine, 12th Earl of BuchanHenry David Erskine, 12th Earl of Buchan( July 1783 – 13 September 1857) was the grandson of the 10th Earl of Buchan. On 28 September 1809 he married Elizabeth Cole S...[0m
[94mQuestion: Henry Erskine, 12th Earl of BuchanHenry David Erskine, 12th Earl of Buchan( July 1783 – 13 September 1857) was the grandson of the 10th Earl of Buchan. On 28 September 1809 he married Elizabeth Cole Shipley( d.1828), daughter of Major- General Sir Charles Shipley. They had one child, David Stuart Erskine, 13th Earl of Buchan( 1815–1898). In 1833 he was living at 47 Minto Street in southern Edinburgh.
James Erskine, 6th Earl of BuchanJames Erskine, 6th Earl of Buchan( died 1640), was the eldest son of John Erskine, Earl of Mar, by his second wife, M

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 4/300 questions[0m
[96mCurrent accuracy: 25.0%[0m
[96mAverage time per question: 5.7 seconds[0m
[96mEstimated time remaining: 27.9 minutes[0m


[93mQuestion 3:[0m
[94mContext (truncated):[0m
[94mLa Bestia humanaLa Bestia humana is a 1957 Argentine film whose story is based on the novel" La Bête Humaine" by the French writer Émile Zola.
ZakhmZakhm( English:" Wound") is a 1998 Indian Hindi- lan...[0m
[94mQuestion: La Bestia humanaLa Bestia humana is a 1957 Argentine film whose story is based on the novel" La Bête Humaine" by the French writer Émile Zola.
ZakhmZakhm( English:" Wound") is a 1998 Indian Hindi- language drama film produced by Mukesh Bhatt for Pooja Bhatt Productions and directed by Mahesh Bhatt. The lead roles are played by Ajay Devgn, Pooja Bhatt, Sonali Bendre, Kunal Khemu and Nagarjuna, with music composed by M. M. Keeravani. The film won the Nargis Dutt Award for Best Feature Film on National Integration. However, the film was a commercially

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 5/300 questions[0m
[96mCurrent accuracy: 20.0%[0m
[96mAverage time per question: 6.3 seconds[0m
[96mEstimated time remaining: 31.0 minutes[0m


[93mQuestion 4:[0m
[94mContext (truncated):[0m
[94mOlav AaraasOlav Aaraas( born 10 July 1950) is a Norwegian historian and museum director. He was born in Fredrikstad. From 1982 to 1993 he was the director of Sogn Folk Museum, from 1993 to 2010 he was...[0m
[94mQuestion: Olav AaraasOlav Aaraas( born 10 July 1950) is a Norwegian historian and museum director. He was born in Fredrikstad. From 1982 to 1993 he was the director of Sogn Folk Museum, from 1993 to 2010 he was the director of Maihaugen and from 2001 he has been the director of the Norwegian Museum of Cultural History. In 2010 he was decorated with the Royal Norwegian Order of St. Olav.
George Montgomery (set decorator)George Montgomery (September 23, 1899 – March 5, 1951) was an American set decorator. He was nominated for an Academy Award in the category B

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 6/300 questions[0m
[96mCurrent accuracy: 16.7%[0m
[96mAverage time per question: 6.6 seconds[0m
[96mEstimated time remaining: 32.4 minutes[0m


[93mQuestion 5:[0m
[94mContext (truncated):[0m
[94mLegend of the Amazon WomenLegend of the Amazon Women is a beat' em up video game developed by SilverTime Inc and published by U.S. Gold and Mastertronic for Amstrad CPC, Commodore 64 and ZX Spectrum i...[0m
[94mQuestion: Legend of the Amazon WomenLegend of the Amazon Women is a beat' em up video game developed by SilverTime Inc and published by U.S. Gold and Mastertronic for Amstrad CPC, Commodore 64 and ZX Spectrum in 1986.
Hercules and the Amazon WomenHercules and the Amazon Women is the first movie- length pilot episode of the television series and marked the debut of Kevin Sorbo as the titular character Hercules and co-starred Anthony Quinn, Michael Hurst, Roma Downey and Lucy Lawless.
Thor and the Amazon WomenThor and the Amazon Women( Italian: Le gladiatrici,

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 7/300 questions[0m
[96mCurrent accuracy: 14.3%[0m
[96mAverage time per question: 6.9 seconds[0m
[96mEstimated time remaining: 33.7 minutes[0m


[93mQuestion 6:[0m
[94mContext (truncated):[0m
[94mLiving Very HappilyLiving Very Happily is Cantopop artist Miriam Yeung's fourteenth Cantonese studio album. It was released by Amusic on 16 October 2009. The album includes eleven new songs. The secon...[0m
[94mQuestion: Living Very HappilyLiving Very Happily is Cantopop artist Miriam Yeung's fourteenth Cantonese studio album. It was released by Amusic on 16 October 2009. The album includes eleven new songs. The second edition of the album came out on 2 December 2009. The second album includes a DVD with three music videos.
A Town South of BakersfieldA Town South of Bakersfield was a series of three compilation CDs showcasing New Country musicians in the late 1980s and early 1990s. The first album came out in 1986 and featured acts such as Dwight Yoakam. The Loneso

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



[96mProgress: 8/300 questions[0m
[96mCurrent accuracy: 12.5%[0m
[96mAverage time per question: 7.1 seconds[0m
[96mEstimated time remaining: 34.8 minutes[0m


[93mQuestion 7:[0m
[94mContext (truncated):[0m
[94mMarie of Luxembourg, Queen of FranceMarie of Luxembourg( 1304 – 26 March 1324), was by birth member of the House of Luxembourg and by marriage Queen of France and Navarre. She was the daughter of Henr...[0m
[94mQuestion: Marie of Luxembourg, Queen of FranceMarie of Luxembourg( 1304 – 26 March 1324), was by birth member of the House of Luxembourg and by marriage Queen of France and Navarre. She was the daughter of Henry VII, Holy Roman Emperor and Margaret of Brabant. Her two siblings were John of Luxembourg and Beatrice of Luxembourg, Queen of Hungary.
Lyon CohenLyon Cohen( 1868–1937) was a Polish- born Canadian businessman and a philanthropist. He was the grandfather of singer/ poet Leonard Cohen.
John WestleyRev. John Westley( 1636 – 78) was an English nonconform

KeyboardInterrupt: 

In [None]:
# Calculate and display final metrics
final_accuracy = (correct_answers / total_questions) * 100

print("=== Final Evaluation Metrics ===")
print(f"Total examples evaluated: {total_questions}")
print(f"Correct answers: {correct_answers}")
print(f"Accuracy: {final_accuracy:.1f}%")

# Save final results
timestamp = time.strftime("%Y%m%d-%H%M%S")
output_path = f'evaluation_results_2wiki_{timestamp}.json'
with open(output_path, 'w') as f:
    json.dump({
        'metrics': {
            'total_questions': total_questions,
            'correct_answers': correct_answers,
            'accuracy': final_accuracy
        },
        'results': results
    }, f, indent=2)
print(f"\nDetailed results saved to {output_path}")