In [1]:
import sys
sys.path.append("/home/mingzhe/Projects/Afterburner")

import os
import time
import json
import utils
from openai import OpenAI
from datasets import load_dataset, Dataset

In [2]:
# Nebius client, accessed through the OpenAI-compatible SDK.
# The API key is read from the NEBIUS_API_KEY environment variable so that no
# credential is stored in the notebook. A missing variable raises KeyError
# here, rather than surfacing later as an authentication failure.
client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.environ["NEBIUS_API_KEY"],
)

In [2]:
# Prompt sent to the model for every problem. It is filled in with str.format
# and expects three fields: {target_lang}, {question} and {starter_code}.
GENERATION_TEMPLATE = """
## Instructions
You are an expert competitive programmer who excels at solving algorithm problems in multiple programming languages.
Your task is to implement a solution to the following problem in {target_lang}.

## Problem Description
{question}

## Starter Code
{starter_code}

## Output Format
- Provide the complete solution code in **one markdown code block** with appropriate language identifier.
- Implement the function with the exact signature (name, parameters, etc.) specified in the starter code.
- EXCLUDE ALL explanations, code comments, import/package/library statements, additional classes or functions outside of the starter code scope, or starting code like `if __name__ == "__main__":` or `func main()` or `package main` or `using namespace std;`.
"""

In [7]:
# Benchmark to run. 'venus' selects the Venus dataset and Hub repository;
# any other value (here 'apps') falls through to the Apps ones wherever the
# notebook branches on `task`.
task = 'apps'

In [4]:
# Load the "test" split of both benchmarks from the Hugging Face Hub.
# Both are loaded regardless of `task`; the request-building cell picks one.
venus_dataset = load_dataset("Elfsong/Venus_Python", split="test")
apps_dataset = load_dataset("Elfsong/Apps_Python", split="test")

In [None]:
# Resolve, once, which dataset to iterate and which column holds the problem
# statement: Venus rows use 'question_content', Apps rows use 'problem_content'.
if task == 'venus':
    selected_dataset, question_key = venus_dataset, 'question_content'
else:
    selected_dataset, question_key = apps_dataset, 'problem_content'

# Build one chat-completion batch request per problem. The custom_id embeds
# the problem_id so that results can be matched back to problems later.
request_list = []
for instance in selected_dataset:
    prompt = GENERATION_TEMPLATE.format(
        target_lang="python",
        question=instance[question_key],
        starter_code=utils.wrap_code_block("python", instance['code_prompt']),
    )
    request_body = {
        "custom_id": f"request-{instance['problem_id']}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "Qwen/QwQ-32B",
            "messages": [{"role": "user", "content": prompt}],
            "max_completion_tokens": 8192*4,
        },
    }
    request_list.append(request_body)

# Serialize the requests as JSON Lines: one JSON object per line.
with open("nebius_batch_requests.jsonl", "w") as f:
    for request in request_list:
        f.write(f"{json.dumps(request)}\n")

In [None]:
# Upload the file
# The handle is opened in a `with` block so it is closed as soon as the upload
# call returns, instead of staying open for the lifetime of the kernel.
with open("nebius_batch_requests.jsonl", "rb") as batch_file:
    batch_requests = client.files.create(
        file=batch_file,
        purpose="batch"
    )
print(batch_requests)

In [None]:
# Create a batch
# Submits the uploaded request file as a chat-completions batch job; the
# description records which benchmark (`task`) the job belongs to.
batch_metadata = {"description": f"QwQ-32B {task} Generation"}
response = client.batches.create(
    input_file_id=batch_requests.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata=batch_metadata,
)
print(response)

In [4]:
# Print every batch job returned by the API, one per line.
all_batches = client.batches.list()
for batch in all_batches:
    print(batch)

Batch(id='batch_e99adb9c-d359-47e5-978a-7e820c0a2698', completion_window='24h', created_at=1745697027, endpoint='/v1/chat/completions', input_file_id='file-d5cf6f24-ba7d-4da4-b265-ba7a8a2f8b27', object='batch', status='done', cancelled_at=None, cancelling_at=None, completed_at=1745700933, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=1745700933, in_progress_at=1745697031, metadata={'description': 'Asynchronous job'}, output_file_id='39b7c2c6-1bc0-4b5b-9f4e-1bc7e63ed238', request_counts=BatchRequestCounts(completed=300, failed=0, total=300))
Batch(id='batch_65c6c74f-e794-4c7d-b8fc-d0ac03a60098', completion_window='24h', created_at=1746189986, endpoint='/v1/chat/completions', input_file_id='file-60afa97b-7760-4c87-ad9e-f4694871f801', object='batch', status='done', cancelled_at=None, cancelling_at=None, completed_at=1746190930, error_file_id=None, errors=None, expired_at=None, expires_at=None, failed_at=None, finalizing_at=1746190929, in_

In [None]:
# Check the status of the batch
# NOTE(review): the batch ID is hard-coded rather than taken from `response.id`
# returned by the batch-creation cell — confirm this is the batch to poll.
batch_id = "batch_2e840e54-eb30-4e07-97ed-e00aabfec2a0"
while True:
    batch_info = client.batches.retrieve(batch_id)
    status = batch_info.status
    print(f'Batch status: {status} - {batch_info.request_counts}')
    if status == "done":
        break
    # The batch object carries failed_at / cancelled_at / expired_at
    # timestamps. Once any of them is set the batch is presumably never going
    # to reach "done", so stop here instead of polling forever.
    terminal_timestamps = (batch_info.failed_at, batch_info.cancelled_at, batch_info.expired_at)
    if any(stamp is not None for stamp in terminal_timestamps):
        raise RuntimeError(f"Batch {batch_id} ended without completing (status: {status})")
    time.sleep(15)

Batch status: done - BatchRequestCounts(completed=300, failed=0, total=300)


In [5]:
# Get the results of the batch
# `batch_info` is the last status object retrieved by the polling cell; its
# output_file_id names the file that holds the batch output.
output_file_id = batch_info.output_file_id
batch_result = client.files.content(output_file_id)

In [8]:
# Parse the batch output (one JSON object per line) into
# {"problem_id", "solution"} records and push them to the Hugging Face Hub.
solutions = []
for line in batch_result.iter_lines():
    # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
    if not line.strip():
        continue
    instance = json.loads(line)
    # custom_id was written as "request-<problem_id>". Split on the first
    # hyphen only, so a problem_id that itself contains '-' is kept whole.
    problem_id = instance['custom_id'].split('-', 1)[1]
    model_response = instance['response']['choices'][0]['message']['content']

    try:
        code = utils.extract_code_blocks(model_response)[0]['code']
    except Exception:
        # Deliberate best-effort fallback: if no code block can be extracted,
        # store the raw model response so the problem still gets a record.
        print("[-] No code blocks found. Will return the whole response.")
        code = model_response

    solutions.append({"problem_id": problem_id, "solution": code})

ds = Dataset.from_list(solutions)
# Same config name and visibility for both benchmarks; only the repo differs.
if task == 'venus':
    hub_repo = "Elfsong/Venus_Model_Evaluation"
else:
    hub_repo = "Elfsong/Apps_Model_Evaluation"
ds.push_to_hub(hub_repo, 'qwq_32b', private=True)


[-] No code blocks found. Will return the whole response.
[-] No code blocks found. Will return the whole response.
[-] No code blocks found. Will return the whole response.
[-] No code blocks found. Will return the whole response.
[-] No code blocks found. Will return the whole response.
[-] No code blocks found. Will return the whole response.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/4.77k [00:00<?, ?B/s]