In [1]:
import os
import platform
import random
import json
import torch
import mlx_lm
import math
import pandas as pd
from tqdm import tqdm
from abc import ABC, abstractmethod
IS_MAC = platform.system() == "Darwin"
HAS_CUDA = False if IS_MAC else torch.cuda.is_available()

class ModelLoader(ABC):
    @abstractmethod
    def load(self, model_name: str, adapter_path: str = None):
        pass

    @abstractmethod
    def generate(self, model, tokenizer, prompt: str, max_tokens: int = 500):
        pass

class MLXLoader(ModelLoader):
    def load(self, model_name, adapter_path=None):
        from mlx_lm import load
        return load(model_name, adapter_path=adapter_path)
    
    def generate(self, model, tokenizer, prompt, max_tokens=500, verbose=False):
        from mlx_lm import generate
        return generate(model, tokenizer, prompt=prompt, max_tokens=max_tokens, verbose=verbose)

class CUDALoader(ModelLoader):
    def load(self, model_name, adapter_path=None):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch
        
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float16
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        if adapter_path:
            from peft import PeftModel
            model = PeftModel.from_pretrained(model, adapter_path)
            
        return model, tokenizer
    
    def generate(self, model, tokenizer, prompt, max_tokens=500):
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

def get_model_loader():
    if IS_MAC:
        return MLXLoader()
    return CUDALoader()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False
    
# Load the test data
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return data

def read_first_message(data_path='../data/poker-preflop/test.jsonl'):
    with open(data_path, 'r') as f:
        first_line = f.readline()
        message = json.loads(first_line)
        message = message['messages']
        return message


def read_line_with_index(line_index, data_path='../data/poker-preflop/test.jsonl'):
    with open(data_path, 'r') as f:
        for idx, line in enumerate(f):
            if idx == line_index:
                message = json.loads(line)
                message = message['messages']
                return message
    return None


def read_random_message(data_path='../data/poker-preflop/test.jsonl'):
    messages = []
    with open(data_path, 'r') as f:
        messages = [line for line in f]
    random_line = random.choice(messages)
    message = json.loads(random_line)
    message = message['messages']
    return message


def calculate_aa_em(predictions, ground_truths, alpha=5):
    aa = 0
    em = 0
    for i in range(len(predictions)):
        action_predicted = predictions[i].strip().split()[0].lower()
        action_ground_truth = ground_truths[i].strip().split()[0].lower()
        if action_predicted == action_ground_truth:
            aa += 1
            if action_ground_truth == 'bet' or action_ground_truth == 'raise':
                amount_predicted = predictions[i].strip().split()[-1]
                amount_ground_truth = ground_truths[i].strip().split()[-1]
                # check if amount is a number
                if is_number(amount_predicted) and is_number(amount_ground_truth):
                    real_error = abs(float(amount_predicted) - float(amount_ground_truth))/float(amount_ground_truth)
                    em += math.exp(-alpha * (real_error ** 2))
            else:
                em += 1
    return aa / len(predictions), em / len(predictions)

In [3]:
models_list = ["meta-llama/Meta-Llama-3.1-8B-Instruct",
                "meta-llama/Meta-Llama-3.1-8B-Instruct",
                "meta-llama/Meta-Llama-3-8B-Instruct",
                "meta-llama/Meta-Llama-3-8B-Instruct",
                "meta-llama/Llama-3.2-3B-Instruct",
                "meta-llama/Llama-3.2-3B-Instruct",
                "meta-llama/Llama-3.2-3B-Instruct",
                "google/gemma-2-9b-it",
                "google/gemma-2-9b-it",
                "Qwen/Qwen2.5-7B-Instruct-1M",
                "Qwen/Qwen2.5-7B-Instruct-1M",
                "Qwen/Qwen2.5-14B-Instruct-1M",
                "Qwen/Qwen2.5-14B-Instruct-1M",
                ]

adapters_list = ["Meta-Llama-3.1-8B-Instruct", #original
                "lora-Meta-Llama-3.1-8B-Instruct",
                "Meta-Llama-3-8B-Instruct", #original
                "lora-Meta-Llama-3-8B-Instruct-lr-6",
                "Llama-3.2-3B-Instruct",  # original
                "lora-Llama-3.2-3B-Instruct-lr-5",
                "lora-Llama-3.2-3B-Instruct-lr-6",
                 "gemma-2-9b-it",  # original
                 "lora-gemma-2-9b-it",
                 "Qwen2.5-7B-Instruct-1M",  # original
                 "lora-Qwen2.5-7B-Instruct-1M", 
                 "Qwen2.5-14B-Instruct-1M",  # original
                 "lora-Qwen2.5-14B-Instruct-1M", 
                ]
index = 10
testing_set = "postflop"
data_path = f'../data/poker-{testing_set}/test.jsonl'
loader = get_model_loader()
model_name = models_list[index]
adapter_name = adapters_list[index]
print(f"Loading model {model_name} with adapter {adapter_name}")
model, tokenizer = loader.load(model_name
, adapter_path=f"../adapters/{adapter_name}"
)

Loading model Qwen/Qwen2.5-7B-Instruct-1M with adapter lora-Qwen2.5-7B-Instruct-1M


Fetching 12 files: 100%|██████████| 12/12 [00:00<00:00, 100865.03it/s]


In [4]:
if False:
    message = read_random_message(data_path=data_path)
    print(message[-1]['content'])
    message.pop()
    prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True)
    response = loader.generate(model, tokenizer, prompt)
    print(response)

In [5]:
# ground_truths = []
# predictions = []
# message2 = read_random_message()
# print(message2[-1]['content'])
# ground_truths.append(message2[-1]['content'])
# message2.pop()
# prompt2 = tokenizer.apply_chat_template(message2, add_generation_prompt=True)

# response = loader.generate(model, tokenizer, prompt)
# print(response)
# predictions.append(response)
# aa, em = calculate_aa_em(predictions, ground_truths)
# print(f"AA: {aa}, EM: {em}")

In [6]:
ground_truths = []
predictions = []

# ground_truths.append("raise 22.0")
# predictions.append("Raise 20.0")

# aa, em = calculate_aa_em(predictions, ground_truths)
# test = "raise 22.0"

# number =test.strip().split()[-1].lower()
# print(is_number(number))

# print(f"AA: {aa}, EM: {em}")

In [7]:
# Load test data
test_data = load_jsonl(data_path)
print(f"Loaded {len(test_data)} examples from test.jsonl")

# for i in tqdm(range(10), desc="Processing"):
for i in tqdm(range(len(test_data)), desc="Processing"):
    example = test_data[i]
    message = example['messages']
    # print(message[-1]['content'])
    ground_truths.append(message[-1]['content'])
    message.pop()
    prompt = tokenizer.apply_chat_template(message, add_generation_prompt=True)
    response = loader.generate(model, tokenizer, prompt)
    # print(response)
    predictions.append(response)

aa, em = calculate_aa_em(predictions, ground_truths)
print(f"AA: {aa}, EM: {em}")

Loaded 10000 examples from test.jsonl


Processing:  85%|████████▌ | 8548/10000 [1:47:40<18:17,  1.32it/s]  


KeyboardInterrupt: 

In [None]:
print(predictions)
print(ground_truths)

['check', 'check', 'check', 'check', 'check', 'call', 'raise 39', 'check', 'call', 'call', 'call', 'check', 'check', 'check', 'check', 'fold', 'raise 60', 'fold', 'check', 'call', 'check', 'check', 'fold', 'fold', 'raise 37', 'call', 'check', 'bet 7', 'check', 'call', 'fold', 'call', 'call', 'check', 'fold', 'call', 'call', 'raise 48', 'call', 'check', 'fold', 'call', 'call', 'call', 'check', 'check', 'check', 'fold', 'call', 'check', 'check', 'check', 'call', 'fold', 'call', 'fold', 'check', 'check', 'fold', 'fold', 'call', 'call', 'fold', 'fold', 'call', 'fold', 'raise 40', 'fold', 'call', 'call', 'fold', 'call', 'call', 'fold', 'fold', 'call', 'fold', 'call', 'check', 'fold', 'call', 'call', 'fold', 'check', 'fold', 'fold', 'check', 'check', 'fold', 'check', 'fold', 'check', 'check', 'check', 'call', 'call', 'fold', 'fold', 'bet 5', 'check', 'fold', 'fold', 'fold', 'call', 'call', 'check', 'fold', 'fold', 'raise 30', 'raise 40', 'check', 'raise 40', 'check', 'call', 'call', 'check',

In [12]:
if True:
    # Create a DataFrame for predictions and ground truths
    results_df = pd.DataFrame({
        "Prediction": predictions,
        "Ground Truth": ground_truths
    })

    output_path = "../testing-results"
    # Save the DataFrames to CSV files
    results_df.to_csv(f"{output_path}/{adapter_name}-{testing_set}_predictions.csv", index=False)
    # metrics_df.to_csv(f"{output_path}/{adapter_name}_metrics.csv", index=False)