In [None]:
%%capture
# Install dependencies (the %%capture cell magic above hides pip's output)
# --no-deps installs only the named packages and skips their own dependency lists;
# xformers is the only package pinned to an exact version here.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
# These packages are installed with normal dependency resolution
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
# Mount Google Drive so the dataset files under MyDrive are reachable from this runtime
from google.colab import drive
drive.mount('/content/drive')
cwd = '/content/drive/MyDrive'  # base directory; later cells build the dataset path from it

Mounted at /content/drive


In [None]:
from unsloth import FastLanguageModel
from tqdm import tqdm
from transformers import TextStreamer
from datasets import load_dataset
import random
import pandas as pd
import torch
import re
import json

In [None]:
# Budget forcing parameters (used as the default arguments of run_inference_with_budget_forcing)
MAX_TOKENS_THINKING = 300  # Max tokens for thinking phase; also embedded in the results CSV filenames
NUM_IGNORE = 1  # How many times to ignore end-of-thinking and continue reasoning

In [None]:
# Model configuration
pretrained_model_name = "unsloth/Meta-Llama-3.1-8B-Instruct"
max_seq_length = 2048  # Choose any! Unsloth supports RoPE Scaling internally
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage

# Load the pretrained model and tokenizer
# Both names are module-level globals that the inference functions below read directly.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=pretrained_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# As the last expression of the cell, the value returned here is echoed as the cell output.
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

==((====))==  Unsloth 2025.3.14: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
    (layers): ModuleList(
      (0): LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((409

In [None]:
# read test set as pre-flop and post-flop
dataset_dir = f"{cwd}/Data"

# The files are decoded as UTF-8 explicitly so that parsing does not depend on
# the runtime's locale-default text encoding.
with open(f'{dataset_dir}/postflop_10k_test_set_prompt_and_label.json', 'r', encoding='utf-8') as f:
  postflop_test_set = json.load(f)

with open(f'{dataset_dir}/preflop_1k_test_set_prompt_and_label.json', 'r', encoding='utf-8') as f:
  preflop_test_set = json.load(f)

In [None]:
# Sample 100 instances from each of the datasets
from sklearn.model_selection import train_test_split

# Function to categorize actions
def categorize_action(output):
    """Map a ground-truth action string to a coarse action category.

    The match is a case-insensitive substring test, so sized actions such as
    'bet 10' or 'raise 22' collapse to their verb. Keywords are tried in the
    order 'bet', 'raise', 'check', 'fold', 'call'; the first one found wins.

    Args:
        output: The action text of a dataset item (its 'output' field).

    Returns:
        One of 'bet', 'raise', 'check', 'fold', 'call', or 'other' when no
        keyword occurs in the text.
    """
    text = output.lower()
    # 'call' is one of the five decisions the system prompt allows, so it gets
    # its own category rather than falling through to 'other'.
    for action in ('bet', 'raise', 'check', 'fold', 'call'):
        if action in text:
            return action
    return 'other'  # No known action keyword present

# Function to perform stratified sampling over action categories
def stratified_sample(dataset, sample_size=100):
    """Draw a sample whose action-category mix mirrors the full dataset.

    Args:
        dataset: Sequence of items, each a mapping with an 'output' field
            holding the action text.
        sample_size: Number of items to draw; forwarded to train_test_split
            as test_size.

    Returns:
        A list of the sampled items. When sample_size is not smaller than the
        dataset, a list of all items is returned instead.

    Raises:
        ValueError: Propagated from train_test_split when stratification is
            impossible (for example a category with a single member).
    """
    # train_test_split rejects a test_size that leaves no training rows, so a
    # request for at least the whole dataset is answered with every item.
    if sample_size >= len(dataset):
        return list(dataset)

    # Category label per item drives the stratification
    action_categories = [categorize_action(item['output']) for item in dataset]

    # Only the "test" side of the split is kept; the fixed random_state keeps
    # the sample reproducible across runs.
    _, sampled_indices = train_test_split(
        range(len(dataset)),
        test_size=sample_size,
        stratify=action_categories,
        random_state=42,
    )

    return [dataset[i] for i in sampled_indices]

# Draw a 100-example stratified sample from each test set
postflop_sampled = stratified_sample(postflop_test_set, 100)
preflop_sampled = stratified_sample(preflop_test_set, 100)

# Persist both samples next to the source data
for sample_filename, sampled_items in (
    ("postflop_100_sample.json", postflop_sampled),
    ("preflop_100_sample.json", preflop_sampled),
):
    with open(f'{dataset_dir}/{sample_filename}', 'w') as f:
        json.dump(sampled_items, f, indent=4)

In [None]:
# Define prompt template
def create_prompt(instruction):
    """Assemble the prompt: a system turn, the user turn, and an open assistant turn.

    Args:
        instruction: The game-state description placed in the user turn.

    Returns:
        The prompt string, ending right after the assistant header so that
        generation continues as the assistant.
    """
    system_message = (
        "You are a specialist in playing 6-handed No Limit Texas Holdem. "
        "You first step through your decision-making process and then answer. "
        "Your final decision must be ONE word: 'bet', 'check', 'raise', 'fold', or 'call'."
    )
    system_turn = "<|im_start|>system\n" + system_message + "<|im_end|>\n"
    user_turn = f"<|im_start|>user\n{instruction}<|im_end|>\n"
    assistant_header = "<|im_start|>assistant\n"
    return system_turn + user_turn + assistant_header

# Function to run inference with budget forcing
def _resolve_stop_token_ids(markers):
    """Return the ids of the tokenizer's EOS token plus every marker that is a single vocabulary token."""
    stop_ids = set()
    if tokenizer.eos_token_id is not None:
        stop_ids.add(tokenizer.eos_token_id)
    unk_id = getattr(tokenizer, "unk_token_id", None)
    for marker in markers:
        marker_id = tokenizer.convert_tokens_to_ids(marker)
        # A marker missing from the vocabulary comes back as None or as the
        # unknown-token id. It is skipped: tokenizing the marker as plain text
        # instead would turn ordinary sub-word pieces into stop tokens.
        if marker_id is not None and marker_id != unk_id:
            stop_ids.add(marker_id)
    return sorted(stop_ids)


def _generate_continuation(prompt, max_new_tokens, stop_token_ids):
    """Greedy-decode a continuation of `prompt`; return (text without trailing stop tokens, tokens generated)."""
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)
    prompt_length = encoded["input_ids"].shape[1]

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=stop_token_ids or None,
    )

    # Slice by the length of the ids actually fed to the model, and count the
    # generated ids directly: re-encoding decoded text would add a BOS token
    # and can tokenize differently, skewing the budget.
    new_token_ids = outputs[0][prompt_length:].tolist()
    num_generated = len(new_token_ids)

    # Drop the stop token that ended generation so it is never fed back into
    # the prompt as an end-of-turn signal.
    stop_set = set(stop_token_ids or ())
    while new_token_ids and new_token_ids[-1] in stop_set:
        new_token_ids.pop()

    text = tokenizer.decode(new_token_ids, skip_special_tokens=False)
    return text, num_generated


def run_inference_with_budget_forcing(instruction, max_tokens_thinking=MAX_TOKENS_THINKING, num_ignore=NUM_IGNORE):
    """Generate step-by-step reasoning under a token budget, then a one-word final answer.

    The model first reasons for up to `max_tokens_thinking` tokens. If it stops
    early, it is nudged to keep reasoning up to `num_ignore` times, as long as
    budget remains. A final cue then asks for the decision.

    NOTE(review): the prompt uses ChatML-style `<|im_start|>` / `<|im_end|>`
    markers; whether these are special tokens depends on the loaded tokenizer.
    Confirm against the model's chat template.

    Args:
        instruction: Game-state description passed to create_prompt.
        max_tokens_thinking: Total number of generated tokens allowed across
            the initial reasoning and all continuations.
        num_ignore: Maximum number of times an early stop is overridden.

    Returns:
        dict with keys:
            "full_trace": the complete prompt followed by the final answer,
            "thinking": everything between the base prompt and the final
                'Final Answer:' cue,
            "final_answer": the stripped text generated after the cue.
    """
    base_prompt = create_prompt(instruction)
    prompt = base_prompt

    # Start thinking phase
    prompt += "<|im_start|>\nThink step by step through your decision-making process. Consider factors like position, stack size, pot odds, opponent tendencies, and hand strength before arriving at a final decision.\n"

    thinking_stop_ids = _resolve_stop_token_ids(("<|im_start|>", "<|im_end|>", "<|eot_id|>"))

    # Initial thinking
    thinking_text, tokens_used = _generate_continuation(prompt, max_tokens_thinking, thinking_stop_ids)
    prompt += thinking_text
    remaining_tokens = max_tokens_thinking - tokens_used

    # Budget forcing loop - continue thinking while budget remains
    for _ in range(num_ignore):
        if remaining_tokens <= 0:
            break

        prompt += "\nStep-by-step, continue reasoning: "  # Signal to continue thinking

        additional_thinking, tokens_used = _generate_continuation(prompt, remaining_tokens, thinking_stop_ids)
        prompt += additional_thinking
        remaining_tokens -= tokens_used

    # Final answer phase
    prompt += "\nNow, based on your reasoning, provide a final decision in ONE word. \n\nFinal Answer: "

    answer_stop_ids = _resolve_stop_token_ids(("<|im_end|>", "<|eot_id|>"))
    final_answer, _ = _generate_continuation(
        prompt,
        10,  # Reasonable limit for the final answer
        answer_stop_ids,
    )

    # Remove any remaining end marker that was produced as plain text
    final_answer = final_answer.replace("<|im_end|>", "").strip()

    # Split on the LAST cue (the one appended above) so a 'Final Answer:' that
    # appears inside the reasoning does not truncate it, and remove the base
    # prompt as a prefix rather than by search-and-replace.
    reasoning_part = prompt.rsplit('Final Answer:', 1)[0]
    thinking = reasoning_part[len(base_prompt):].replace("<|im_start|>think", "")

    return {
        "full_trace": prompt + final_answer,
        "thinking": thinking,
        "final_answer": final_answer
    }


# Run experiment on a sample or the full dataset
def run_experiment(dataset, num_samples=10, random_seed=0):
    """Run budget-forced inference on a seeded random subset of `dataset`.

    Args:
        dataset: Sequence of items with 'instruction' and 'output' fields.
        num_samples: How many items to evaluate; capped at len(dataset).
        random_seed: Seed applied to the global `random` module before the
            subset is drawn.

    Returns:
        A list with one dict per evaluated item, holding the keys "idx",
        "instruction", "ground_truth", "model_answer" and "thinking".
    """
    random.seed(random_seed)

    # Never ask for more examples than the dataset holds
    num_samples = min(num_samples, len(dataset))
    chosen_indices = random.sample(range(len(dataset)), num_samples)

    records = []
    for example_idx in tqdm(chosen_indices):
        example = dataset[example_idx]
        question = example['instruction']
        expected_action = example['output']

        # Run inference with budget forcing
        inference = run_inference_with_budget_forcing(question)

        # Keep the answer and the reasoning alongside the ground truth
        records.append(dict(
            idx=example_idx,
            instruction=question,
            ground_truth=expected_action,
            model_answer=inference["final_answer"],
            thinking=inference["thinking"],
        ))

    return records

In [None]:
# Run the experiment on each sampled set (5 examples to start with) and save
# every run to its own CSV file. After the loop, `results` and `results_df`
# hold the post-flop run, which is processed last.
for stage_name, stage_samples in (
    ("preflop", preflop_sampled),
    ("postflop", postflop_sampled),
):
    results = run_experiment(stage_samples, num_samples=5)  # Start with 5 samples

    results_df = pd.DataFrame(results)
    results_df.to_csv(f"cot_results_{stage_name}_sampled_{MAX_TOKENS_THINKING}.csv", index=False)

100%|██████████| 5/5 [01:31<00:00, 18.31s/it]
100%|██████████| 5/5 [01:36<00:00, 19.27s/it]
