In [3]:
import os
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import time

# Runtime configuration: GPU selection and outbound proxy.
# CUDA_VISIBLE_DEVICES is assigned before the first torch.cuda call below,
# so only the GPU with index 1 is exposed to this process.
os.environ.update(CUDA_VISIBLE_DEVICES="1")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# The same proxy URL is used for both schemes; this mapping is handed to the
# `proxies=` argument of the Hugging Face `from_pretrained` calls.
proxy = "http://sisproxy.hkg.agoda.local:3128"
proxy_config = dict.fromkeys(("http", "https"), proxy)

def load_qwen():
    """Download (or read from cache) the Qwen chat model and its tokenizer.

    Both `from_pretrained` calls are routed through the module-level
    `proxy_config`. The model is loaded first, then the tokenizer.

    Returns:
        A `(model, tokenizer)` tuple.
    """
    checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

    # Options shared by the model and tokenizer loaders.
    # NOTE(review): trust_remote_code=True allows code shipped in the model
    # repository to be executed locally -- confirm it is actually required
    # for this checkpoint.
    hub_options = {"proxies": proxy_config, "trust_remote_code": True}

    # "auto" lets the library choose the dtype and the device placement.
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        device_map="auto",
        **hub_options,
    )
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, **hub_options)

    return model, tokenizer

def load_squad_data(squad_file: str):
    """Load a SQuAD-format JSON file and index its questions by question ID.

    Args:
        squad_file: Path to a JSON file whose top-level ``data`` list contains
            articles; each article has ``paragraphs``, and each paragraph has
            a ``context`` string plus a ``qas`` list of entries carrying
            ``id`` and ``question``.

    Returns:
        A dict mapping each question ID to
        ``{'question': <question text>, 'context': <paragraph text>}``, in
        file order. If an ID occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an expected key is missing from the structure.
    """
    # Decode explicitly as UTF-8: the dataset contains non-ASCII text, and the
    # default encoding of open() depends on the platform locale.
    with open(squad_file, 'r', encoding='utf-8') as f:
        squad_data = json.load(f)

    qa_info = {}
    for article in squad_data['data']:
        for paragraph in article['paragraphs']:
            # Every question of a paragraph shares that paragraph's context.
            context = paragraph['context']
            for qa in paragraph['qas']:
                qa_info[qa['id']] = {
                    'question': qa['question'],
                    'context': context
                }

    return qa_info

def get_qwen_prediction(
    model,
    tokenizer,
    question: str,
    context: str,
    *,
    max_new_tokens: int = 50,
    do_sample: bool = True,
    temperature: float = 0.7,
    top_p: float = 0.9,
) -> str:
    """Ask the chat model one reading-comprehension question.

    Builds a system + user chat prompt from ``context`` and ``question``,
    generates a continuation, and returns only the newly generated text.

    Args:
        model: Object exposing ``.device`` and ``.generate(**inputs, ...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        question: The question text.
        context: The passage the answer should be drawn from.
        max_new_tokens: Upper bound on generated tokens (answers are short).
        do_sample: Whether to sample. The default ``True`` keeps the original
            behaviour, which makes outputs vary between runs unless the RNG is
            seeded; pass ``False`` for repeatable evaluation.
        temperature: Sampling temperature; only forwarded when ``do_sample``.
        top_p: Nucleus-sampling threshold; only forwarded when ``do_sample``.

    Returns:
        The decoded answer with surrounding whitespace stripped.
    """
    messages = [
        {
            "role": "system",
            "content": "You are a helpful expert at reading comprehension. Your task is to answer the given question based on the provided context. Only provide the exact answer text, no explanations."
        },
        {
            "role": "user",
            "content": f"""Context: {context}

Question: {question}

Answer the question based on the context above. Provide only the exact answer text, no explanations."""
        }
    ]

    # Render the chat as a plain prompt string ending with the assistant turn
    # opener, so that generation produces the answer.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Sampling-only knobs are forwarded only when sampling is enabled, so a
    # greedy call does not pass settings that would be ignored.
    generation_kwargs = {"max_new_tokens": max_new_tokens, "do_sample": do_sample}
    if do_sample:
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    with torch.inference_mode():
        generated_ids = model.generate(**model_inputs, **generation_kwargs)

    # generate() returns prompt + continuation; drop the prompt prefix so only
    # the newly generated tokens are decoded.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return response.strip()

def create_qwen_predictions(squad_file: str, output_file: str):
    """Run Qwen over every question in a SQuAD file and save the answers.

    Args:
        squad_file: Path to the SQuAD JSON file to read questions from.
        output_file: Path the ``{question_id: answer}`` JSON is written to.

    Returns:
        The dict mapping each question ID to the model's answer string.
    """
    print("Loading Qwen model...")
    model, tokenizer = load_qwen()

    print("Loading SQuAD data...")
    qa_info = load_squad_data(squad_file)

    print("Generating Qwen predictions...")
    # One generation call per question; tqdm reports progress over the items.
    predictions = {
        question_id: get_qwen_prediction(
            model, tokenizer, entry['question'], entry['context']
        )
        for question_id, entry in tqdm(qa_info.items())
    }

    print("Saving predictions...")
    # Results are written once, after all questions have been answered.
    with open(output_file, 'w') as out_handle:
        json.dump(predictions, out_handle, indent=2)

    return predictions

def main():
    """Generate Qwen predictions for the SQuAD dev set and print a summary.

    Prints the first three predictions and, when CUDA is available, the
    allocated/reserved memory of CUDA device 0 in MB. Any exception is
    reported on stdout and then re-raised.
    """
    # File paths
    squad_file = "dev-v1.1.json"
    output_file = "qwen_direct_predictions.json"

    try:
        predictions = create_qwen_predictions(squad_file, output_file)

        # Show a few answers as a quick sanity check.
        print("\nSample predictions:")
        sample_ids = list(predictions)[:3]
        for qid in sample_ids:
            print(f"\nQuestion ID: {qid}")
            print(f"Prediction: {predictions[qid]}")

        # Memory statistics are only meaningful when a CUDA device is present.
        if not torch.cuda.is_available():
            return

        bytes_per_mb = 1024**2
        memory_allocated = torch.cuda.memory_allocated(0) / bytes_per_mb
        memory_reserved = torch.cuda.memory_reserved(0) / bytes_per_mb
        print(f"\nGPU Memory Usage:")
        print(f"Allocated: {memory_allocated:.2f} MB")
        print(f"Reserved: {memory_reserved:.2f} MB")

    except Exception as e:
        # Report the failure, then propagate it so the caller still sees it.
        print(f"Error: {str(e)}")
        raise

# Entry-point guard: run the full pipeline when this code is executed directly
# (the condition also holds when the cell is run in a notebook kernel), but
# not when the code is imported as a module.
if __name__ == "__main__":
    main()

Loading Qwen model...
Loading SQuAD data...
Generating Qwen predictions...


 93%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎             | 9822/10570 [43:19<02:36,  4.79it/s]IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [4]:
# Score qwen_direct_predictions.json (the output file written by main() above)
# against dev-v1.1.json using the evaluate-v2.0.py script.
!python evaluate-v2.0.py dev-v1.1.json qwen_direct_predictions.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{
  "exact": 71.9205298013245,
  "f1": 83.09935420932386,
  "total": 10570,
  "HasAns_exact": 71.9205298013245,
  "HasAns_f1": 83.09935420932386,
  "HasAns_total": 10570
}


In [7]:
# Score the T5 predictions with the same script and dataset for comparison.
# NOTE(review): t5_base_predictions.json is not produced by any cell visible
# here -- presumably generated elsewhere; confirm before re-running.
!python evaluate-v2.0.py dev-v1.1.json t5_base_predictions.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{
  "exact": 80.54872280037843,
  "f1": 89.39400002730301,
  "total": 10570,
  "HasAns_exact": 80.54872280037843,
  "HasAns_f1": 89.39400002730301,
  "HasAns_total": 10570
}
