In [4]:
import os
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from typing import Dict, List
import time

# Runtime configuration
# Expose only the GPU with index 1 to CUDA; this is set before the first CUDA query below.
os.environ.update(CUDA_VISIBLE_DEVICES="1")
# Pick the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# The same HTTP proxy endpoint is used for both plain and TLS downloads.
proxy = "http://sisproxy.hkg.agoda.local:3128"
proxy_config = dict.fromkeys(("http", "https"), proxy)

def load_qwen():
    """Fetch the Qwen2.5-1.5B-Instruct checkpoint and its tokenizer.

    Both downloads go through the module-level ``proxy_config``.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

    # Options shared by the model and the tokenizer download.
    # NOTE(review): trust_remote_code=True allows code shipped with the hub
    # repository to run locally -- confirm it is actually required here.
    hub_options = {"proxies": proxy_config, "trust_remote_code": True}

    # dtype and device placement are left to the library ("auto").
    qwen_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        device_map="auto",
        **hub_options,
    )
    qwen_tokenizer = AutoTokenizer.from_pretrained(checkpoint, **hub_options)

    return qwen_model, qwen_tokenizer

def load_predictions_and_squad(prediction_files: List[str], squad_file: str) -> tuple:
    """Load per-model predictions and the SQuAD questions/contexts.

    Args:
        prediction_files: Paths to JSON files, each mapping a question id to
            that model's answer. Files are labelled by position using the
            built-in label list; any file beyond that list is labelled
            ``"Model<position>"`` instead of being silently ignored.
        squad_file: Path to a SQuAD-format JSON file (``data`` ->
            ``paragraphs`` -> ``qas``).

    Returns:
        A 2-tuple ``(predictions_by_id, qa_info)`` where
        ``predictions_by_id`` maps question id -> list of
        ``(model_label, answer)`` tuples in file order, and ``qa_info`` maps
        question id -> ``{'question': ..., 'context': ...}``.
    """
    model_names = ["Model1 (LLaMA)", "Model2 (T5)", "Model3 (Gemma)"]

    predictions_by_id = {}
    for position, file_path in enumerate(prediction_files):
        # Fall back to a positional label so extra files are still ensembled.
        if position < len(model_names):
            model_name = model_names[position]
        else:
            model_name = f"Model{position + 1}"

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            predictions = json.load(f)

        for qid, answer in predictions.items():
            predictions_by_id.setdefault(qid, []).append((model_name, answer))

    with open(squad_file, 'r', encoding='utf-8') as f:
        squad_data = json.load(f)

    # Flatten article -> paragraph -> qa into one lookup keyed by question id.
    qa_info = {
        qa['id']: {'question': qa['question'], 'context': paragraph['context']}
        for article in squad_data['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    }

    return predictions_by_id, qa_info

def get_ensemble_prediction(
    model,
    tokenizer,
    question: str,
    context: str,
    predictions: List[tuple]
) -> str:
    """Ask the chat model to pick (or write) the best answer for one question.

    Args:
        model: Object exposing ``generate`` and ``device``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        question: The question text.
        context: The passage the question refers to.
        predictions: ``(model_label, answer)`` pairs from the base models.

    Returns:
        The generated reply with surrounding whitespace stripped. Sampling is
        enabled, so repeated calls may return different text.
    """
    # One "<label>: <answer>" line per candidate.
    candidate_lines = []
    for source_name, candidate in predictions:
        candidate_lines.append(f"{source_name}: {candidate}")
    pred_str = "\n".join(candidate_lines)

    system_prompt = "You are a helpful expert at reading comprehension and answer analysis. Your task is to determine the most accurate answer based on the given context and model predictions, but feel free to give other answer if you think none of the given predictions are correct."
    user_prompt = f"""Context: {context}

Question: {question}

Different model predictions:
{pred_str}

Based on the context and these predictions, what is the most accurate answer? Only output the exact answer text, no explanations."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render the conversation to a prompt string that ends with the assistant turn marker.
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    sampling_options = dict(
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    with torch.inference_mode():
        full_sequences = model.generate(**model_inputs, **sampling_options)

    # generate() returns prompt + continuation; drop the prompt prefix of each row.
    continuations = []
    for prompt_ids, sequence_ids in zip(model_inputs.input_ids, full_sequences):
        continuations.append(sequence_ids[len(prompt_ids):])

    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
    return decoded[0].strip()

def create_ensemble_predictions(squad_file: str, prediction_files: List[str], output_file: str):
    """Run the Qwen arbiter over every predicted question and save the result.

    Args:
        squad_file: Path to the SQuAD-format JSON file.
        prediction_files: Paths to the base models' prediction JSON files.
        output_file: Path the final ``{question id: answer}`` JSON is written to.

    Returns:
        The ``{question id: answer}`` dictionary that was written. Question
        ids that have predictions but are absent from the SQuAD file are
        skipped.
    """
    print("Loading Qwen model...")
    model, tokenizer = load_qwen()

    print("Loading predictions and SQuAD data...")
    predictions_by_id, qa_info = load_predictions_and_squad(prediction_files, squad_file)

    print("Generating ensemble predictions...")
    final_predictions = {}
    for qid, candidates in tqdm(predictions_by_id.items()):
        # Only questions known to the SQuAD file can be prompted.
        if qid in qa_info:
            entry = qa_info[qid]
            final_predictions[qid] = get_ensemble_prediction(
                model,
                tokenizer,
                entry['question'],
                entry['context'],
                candidates,
            )

    # Results are written once, after the whole loop has finished.
    print("Saving predictions...")
    with open(output_file, 'w') as out_handle:
        json.dump(final_predictions, out_handle, indent=2)

    return final_predictions

def main():
    """Build the Qwen ensemble for the three base models and report on it.

    Prints the first three predictions and, when CUDA is available, the
    allocated/reserved memory of device 0 in MB. Any exception is printed
    and then re-raised.
    """
    # Inputs and output of this run.
    squad_file = "dev-v1.1.json"
    prediction_files = [
        "predictions_llama.json",
        "t5_base_predictions.json",
        "dev-v1.1-gemma-it-qa-3-processed.json",
    ]
    output_file = "qwen_ensemble_predictions.json"

    try:
        predictions = create_ensemble_predictions(squad_file, prediction_files, output_file)

        # Show a small sample so the run can be sanity-checked by eye.
        print("\nSample predictions:")
        for qid, answer in list(predictions.items())[:3]:
            print(f"\nQuestion ID: {qid}")
            print(f"Prediction: {answer}")

        if torch.cuda.is_available():
            bytes_per_mb = 1024**2
            memory_allocated = torch.cuda.memory_allocated(0) / bytes_per_mb
            memory_reserved = torch.cuda.memory_reserved(0) / bytes_per_mb
            print("\nGPU Memory Usage:")
            print(f"Allocated: {memory_allocated:.2f} MB")
            print(f"Reserved: {memory_reserved:.2f} MB")

    except Exception as err:
        # Surface the message, then let the original traceback propagate.
        print("Error: " + str(err))
        raise

# Entry point: calls main() when this code runs as the top-level module
# (which includes executing the cell in a notebook kernel).
if __name__ == "__main__":
    main()

Loading Qwen model...
Loading predictions and SQuAD data...
Generating ensemble predictions...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10570/10570 [48:53<00:00,  3.60it/s]

Saving predictions...

Sample predictions:

Question ID: 56be4db0acb8001400a502ec
Prediction: Denver Broncos

Question ID: 56be4db0acb8001400a502ed
Prediction: Carolina Panthers

Question ID: 56be4db0acb8001400a502ee
Prediction: Levi's Stadium in the San Francisco Bay Area

GPU Memory Usage:
Allocated: 2956.16 MB
Reserved: 6714.00 MB





In [None]:
import os
import torch
import json
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from typing import Dict, List
import time

# Runtime configuration
# Expose only the GPU with index 1 to CUDA; this is set before the first CUDA query below.
os.environ.update(CUDA_VISIBLE_DEVICES="1")
# Pick the GPU when CUDA reports one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# The same HTTP proxy endpoint is used for both plain and TLS downloads.
proxy = "http://sisproxy.hkg.agoda.local:3128"
proxy_config = dict.fromkeys(("http", "https"), proxy)

def load_qwen():
    """Fetch the Qwen2.5-1.5B-Instruct checkpoint and its tokenizer.

    Both downloads go through the module-level ``proxy_config``.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    checkpoint = "Qwen/Qwen2.5-1.5B-Instruct"

    # Options shared by the model and the tokenizer download.
    # NOTE(review): trust_remote_code=True allows code shipped with the hub
    # repository to run locally -- confirm it is actually required here.
    hub_options = {"proxies": proxy_config, "trust_remote_code": True}

    # dtype and device placement are left to the library ("auto").
    qwen_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype="auto",
        device_map="auto",
        **hub_options,
    )
    qwen_tokenizer = AutoTokenizer.from_pretrained(checkpoint, **hub_options)

    return qwen_model, qwen_tokenizer

def load_predictions_and_squad(prediction_files: List[str], squad_file: str) -> tuple:
    """Load per-model predictions (with explanations) and the SQuAD questions/contexts.

    Args:
        prediction_files: Paths to JSON files, each mapping a question id to
            a record whose ``'res'`` field holds that model's output. Files
            are labelled by position using the built-in label list; any file
            beyond that list is labelled ``"Model<position>"`` instead of
            being silently ignored.
        squad_file: Path to a SQuAD-format JSON file (``data`` ->
            ``paragraphs`` -> ``qas``).

    Returns:
        A 2-tuple ``(predictions_by_id, qa_info)`` where
        ``predictions_by_id`` maps question id -> list of
        ``(model_label, res)`` tuples in file order, and ``qa_info`` maps
        question id -> ``{'question': ..., 'context': ...}``.

    Raises:
        KeyError: If a prediction record has no ``'res'`` field.
    """
    model_names = ["LLaMA-3.2B", "Phi-3.5"]

    predictions_by_id = {}
    for position, file_path in enumerate(prediction_files):
        # Fall back to a positional label so extra files are still ensembled.
        if position < len(model_names):
            model_name = model_names[position]
        else:
            model_name = f"Model{position + 1}"

        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            predictions = json.load(f)

        for qid, result in predictions.items():
            predictions_by_id.setdefault(qid, []).append((model_name, result['res']))

    with open(squad_file, 'r', encoding='utf-8') as f:
        squad_data = json.load(f)

    # Flatten article -> paragraph -> qa into one lookup keyed by question id.
    qa_info = {
        qa['id']: {'question': qa['question'], 'context': paragraph['context']}
        for article in squad_data['data']
        for paragraph in article['paragraphs']
        for qa in paragraph['qas']
    }

    return predictions_by_id, qa_info

def get_ensemble_prediction(
    model,
    tokenizer,
    question: str,
    context: str,
    predictions: List[tuple]
) -> str:
    """Ask the chat model to pick (or write) the best answer for one question.

    This variant's prompt tells the model that each candidate comes with an
    explanation.

    Args:
        model: Object exposing ``generate`` and ``device``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        question: The question text.
        context: The passage the question refers to.
        predictions: ``(model_label, output)`` pairs from the base models.

    Returns:
        The generated reply with surrounding whitespace stripped. Sampling is
        enabled, so repeated calls may return different text.
    """
    # One "<label>: <output>" line per candidate.
    candidate_lines = []
    for source_name, candidate in predictions:
        candidate_lines.append(f"{source_name}: {candidate}")
    pred_str = "\n".join(candidate_lines)

    system_prompt = "You are a helpful expert at reading comprehension and answer analysis. Your task is to determine the most accurate answer based on the given context, model predictions, and their explanations. Each model provides both an answer and its reasoning. If none of the predictions are satisfactory, you should provide your own answer based on the context."
    user_prompt = f"""Context: {context}

Question: {question}

Different model predictions (including their answers and explanations):
{pred_str}

Based on the context and the model predictions (considering both their answers and explanations), what is the most accurate answer? Only output the exact answer text, no explanations."""

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Render the conversation to a prompt string that ends with the assistant turn marker.
    prompt_text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt_text], return_tensors="pt").to(model.device)

    sampling_options = dict(
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    with torch.inference_mode():
        full_sequences = model.generate(**model_inputs, **sampling_options)

    # generate() returns prompt + continuation; drop the prompt prefix of each row.
    continuations = []
    for prompt_ids, sequence_ids in zip(model_inputs.input_ids, full_sequences):
        continuations.append(sequence_ids[len(prompt_ids):])

    decoded = tokenizer.batch_decode(continuations, skip_special_tokens=True)
    return decoded[0].strip()

def create_ensemble_predictions(squad_file: str, prediction_files: List[str], output_file: str):
    """Run the Qwen arbiter over every predicted question and save the result.

    Args:
        squad_file: Path to the SQuAD-format JSON file.
        prediction_files: Paths to the base models' prediction JSON files.
        output_file: Path the final ``{question id: answer}`` JSON is written to.

    Returns:
        The ``{question id: answer}`` dictionary that was written. Question
        ids that have predictions but are absent from the SQuAD file are
        skipped.
    """
    print("Loading Qwen model...")
    model, tokenizer = load_qwen()

    print("Loading predictions and SQuAD data...")
    predictions_by_id, qa_info = load_predictions_and_squad(prediction_files, squad_file)

    print("Generating ensemble predictions...")
    final_predictions = {}
    for qid, candidates in tqdm(predictions_by_id.items()):
        # Only questions known to the SQuAD file can be prompted.
        if qid in qa_info:
            entry = qa_info[qid]
            final_predictions[qid] = get_ensemble_prediction(
                model,
                tokenizer,
                entry['question'],
                entry['context'],
                candidates,
            )

    # Results are written once, after the whole loop has finished.
    print("Saving predictions...")
    with open(output_file, 'w') as out_handle:
        json.dump(final_predictions, out_handle, indent=2)

    return final_predictions

def main():
    """Build the Qwen ensemble for the two explanation-tuned models and report on it.

    Prints the first three predictions and, when CUDA is available, the
    allocated/reserved memory of device 0 in MB. Any exception is printed
    and then re-raised.
    """
    # Inputs and output of this run.
    squad_file = "dev-v1.1.json"
    prediction_files = [
        "llama-3.2-3b-explanation-tuned.json",
        "phi-3.5-mini-instruct-explanation-tuned.json",
    ]
    output_file = "qwen_ensemble_predictions_2.json"

    try:
        predictions = create_ensemble_predictions(squad_file, prediction_files, output_file)

        # Show a small sample so the run can be sanity-checked by eye.
        print("\nSample predictions:")
        for qid, answer in list(predictions.items())[:3]:
            print(f"\nQuestion ID: {qid}")
            print(f"Prediction: {answer}")

        if torch.cuda.is_available():
            bytes_per_mb = 1024**2
            memory_allocated = torch.cuda.memory_allocated(0) / bytes_per_mb
            memory_reserved = torch.cuda.memory_reserved(0) / bytes_per_mb
            print("\nGPU Memory Usage:")
            print(f"Allocated: {memory_allocated:.2f} MB")
            print(f"Reserved: {memory_reserved:.2f} MB")

    except Exception as err:
        # Surface the message, then let the original traceback propagate.
        print("Error: " + str(err))
        raise

# Entry point: calls main() when this code runs as the top-level module
# (which includes executing the cell in a notebook kernel).
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Loading Qwen model...




Loading predictions and SQuAD data...
Generating ensemble predictions...


 45%|██████████████████████████████████████████████████████████████████████████████████████▎                                                                                                           | 4704/10570 [25:15<31:55,  3.06it/s]

In [1]:
# Run the evaluate-v2.0.py script on dev-v1.1.json with the Qwen ensemble predictions file.
!python evaluate-v2.0.py dev-v1.1.json qwen_ensemble_predictions.json

{
  "exact": 78.81740775780511,
  "f1": 88.60037451520424,
  "total": 10570,
  "HasAns_exact": 78.81740775780511,
  "HasAns_f1": 88.60037451520424,
  "HasAns_total": 10570
}


In [7]:
# Run the same evaluate-v2.0.py script on the T5 predictions file, for comparison with the ensemble.
!python evaluate-v2.0.py dev-v1.1.json t5_base_predictions.json

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


{
  "exact": 80.54872280037843,
  "f1": 89.39400002730301,
  "total": 10570,
  "HasAns_exact": 80.54872280037843,
  "HasAns_f1": 89.39400002730301,
  "HasAns_total": 10570
}
