In [1]:
from datasets import load_dataset

ds = load_dataset("RZ412/PokerBench")

In [2]:
ds.shape

{'train': (563200, 2), 'test': (11000, 2)}

In [3]:
import os
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("OPENROUTER_API_KEY")
ds_key = os.getenv("DEEPSEEK_API_KEY")

In [4]:
test_data = ds['test']

In [5]:
import requests
def query_llm(prompt, model):
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "prompt": prompt,
       # "temperature": 0.1,
        #"top_p": 0.95
    }
    response = requests.post("https://openrouter.ai/api/v1/completions", headers=headers, json=payload)
    response_data = response.json()
    if "choices" not in response_data:
        print(response_data)
        return
    txt = response.json()["choices"][0]["text"]
    return txt

In [68]:
from openai import OpenAI

client = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")

def query_deepseek(prompt):
    try:
        response = client.chat.completions.create(
            model="deepseek/deepseek-r1-distill-qwen-32b",
            messages=[{"role": "user", "content": prompt}],
            #temperature=0.1, 
            #top_p=0.95
        )
        if response.choices:
            return response.choices[0].message.content
        else:
            print("No response choices available.")
            return None

    except Exception as e:
        print(f"Error querying DeepSeek API: {e}")
        return None


In [7]:
model_deepseek = "deepseek/deepseek-r1"
model_llama = "meta-llama/llama-3.3-70b-instruct"
model_gemini = "google/gemini-2.0-flash-lite-preview-02-05:free"

In [8]:
instructions = test_data['instruction']
outputs = test_data['output']

In [69]:
import json
import time

def batch_query_deepseek(test_subset, output_file="results.json", save_every=50):
    """Batch queries DeepSeek and saves responses to a JSON file."""
    
    results = []
    
    for i, entry in enumerate(test_subset):
        response = query_deepseek(entry["instruction"])
        results.append({"Index": i, "Response": response})

        if (i + 1) % save_every == 0:
            with open(output_file, "w") as f:
                json.dump(results, f, indent=4)
            print(f"✅ Saved {i+1}/{len(test_subset)} results to {output_file}")

    with open(output_file, "w") as f:
        json.dump(results, f, indent=4)
    
    print("✅ Done! Results saved to:", output_file)


In [72]:
np.random.seed(42)
test_data_array = np.array(test_data)
random_indices = np.random.choice(len(test_data_array), size=400, replace=False)
test_subset = test_data_array[random_indices] 

In [73]:
batch_query_deepseek(test_subset, output_file="deepseek_results.json", save_every=20)

✅ Saved 20/400 results to deepseek_results.json
✅ Saved 40/400 results to deepseek_results.json
✅ Saved 60/400 results to deepseek_results.json
✅ Saved 80/400 results to deepseek_results.json
No response choices available.
✅ Saved 100/400 results to deepseek_results.json
No response choices available.
✅ Saved 120/400 results to deepseek_results.json
No response choices available.
No response choices available.
✅ Saved 140/400 results to deepseek_results.json
✅ Saved 160/400 results to deepseek_results.json
No response choices available.
✅ Saved 180/400 results to deepseek_results.json
No response choices available.
No response choices available.
No response choices available.
✅ Saved 200/400 results to deepseek_results.json
✅ Saved 220/400 results to deepseek_results.json
✅ Saved 240/400 results to deepseek_results.json
✅ Saved 260/400 results to deepseek_results.json
✅ Saved 280/400 results to deepseek_results.json
✅ Saved 300/400 results to deepseek_results.json
✅ Saved 320/400 resul

In [97]:
import re

def extract_action(response):
    if not response:
        return "fail"
    # Define the regex pattern to match the action and optional value
    pattern = r"(bet|raise|call|fold|check)\s*(\d*)"
    
    # Search for the pattern in the response
    match = re.search(pattern, response, re.IGNORECASE)
    
    if match:
        action = match.group(1).lower()  # Extract the action (e.g., "bet")
        value = match.group(2) if match.group(2) else None  # Extract the value (e.g., "4")
        
        # Return the action and value (if applicable)
        if value:
            return f"{action} {value}"
        else:
            return action
    else:
        return "fail"

In [101]:
instructions = test_data['instruction']
outputs = test_data['output']

predictions = [query_llm(instruction, model_gemini) for instruction in instructions[:18]]

aa, em = evaluate_model(predictions, outputs[:18])

print(f"Action Accuracy (AA): {aa:.2f}%")
print(f"Exact Match Accuracy (EM): {em:.2f}%")

Check
Check
Check
Bet 10 chips
Check
Call
Call
Check
Call
Call
Call
Bet 20 chips
Check
Check
Check
Fold
Call
Call
Action Accuracy (AA): 83.33%
Exact Match Accuracy (EM): 77.78%


In [93]:
with open("deepseek_results.json", "r") as f:
    deepseek_results_data = json.load(f)
    
deepseek_results = [entry["Response"] for entry in deepseek_results_data]

In [103]:
subset_outputs = [entry['output'] for entry in test_subset]

deepseek_processed_results = [extract_action(res) for res in deepseek_results]
aa, em = evaluate_model(deepseek_processed_results, subset_outputs)
print(f"Action Accuracy (AA): {aa:.2f}%")
print(f"Exact Match Accuracy (EM): {em:.2f}%")

Action Accuracy (AA): 43.00%
Exact Match Accuracy (EM): 37.75%


In [91]:
def evaluate_model(predictions, ground_truths):
    action_correct = 0
    exact_match_correct = 0
    total = len(ground_truths)

    for pred, true in zip(predictions, ground_truths):
        pred_action = pred.lower().split()[0]
        true_action = true.lower().split()[0]
        
        # Action Accuracy: Does the action match?
        if pred_action == true_action:
            action_correct += 1
        
        # Exact Match Accuracy: Does the entire response match?
        if pred.lower() == true.lower():
            exact_match_correct += 1

    action_accuracy = (action_correct / total) * 100
    exact_match_accuracy = (exact_match_correct / total) * 100

    return action_accuracy, exact_match_accuracy