In [None]:
# Mount Google Drive at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')
# Project root on the mounted Drive: later cells read the test-set JSON files
# from f"{cwd}/datasets" and write prediction CSVs directly under this folder.
cwd = '/content/drive/MyDrive/0258_poker_project_personal'

Mounted at /content/drive


In [None]:
# Installation
%%capture
# Normally using pip install unsloth is enough

# Temporarily as of Jan 31st 2025, Colab has some issues with Pytorch
# Using pip install unsloth will take 3 minutes, whilst the below takes <1 minute:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29 peft trl triton
!pip install --no-deps cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

In [None]:
import random
import torch
import csv
from tqdm import tqdm
import pandas as pd

In [None]:
from unsloth import FastLanguageModel

# Human-readable label for this run. In this notebook it is only interpolated
# into the predictions CSV filename; it is NOT the repo id that gets loaded.
model_name = "Meta-Llama-3.2-3B-Instruct"

# Sequence-length limit handed to the loader below.
max_seq_length = 4096

model, tokenizer = FastLanguageModel.from_pretrained(
    # Hugging Face repo to load. Change this for different models,
    # e.g. "unsloth/Meta-Llama-3.1-8B".
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    # to reduce memory usage
    load_in_4bit = True,
    # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    dtype = None,
    max_seq_length = max_seq_length,
    # Gated models (like meta-llama/Llama-2-7b-hf) additionally need:
    # token = "hf_...",
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
# Wrap the loaded model with LoRA adapters. No fine-tuning is performed in this
# notebook section.
# NOTE(review): the original author suspected this wrapping is still required
# for inference but was unsure -- confirm before removing this cell.

model = FastLanguageModel.get_peft_model(
    model,
    # --- adapter shape ---
    # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    r = 16,
    lora_alpha = 16,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",  # attention projections
        "gate_proj", "up_proj", "down_proj",     # MLP projections
    ],
    # --- settings the upstream template marks as the optimized choices ---
    lora_dropout = 0,  # Supports any, but = 0 is optimized
    bias = "none",     # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    # True or "unsloth" for very long context
    use_gradient_checkpointing = "unsloth",
    # --- optional features, left disabled ---
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
    random_state = 3407,
)

Unsloth 2025.3.9 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [None]:
# Load PokerBench (https://huggingface.co/datasets/RZ412/PokerBench).
# Each record has the form {'instruction': ..., 'output': ...}.

from datasets import load_dataset
import json

# End-of-sequence marker of the loaded tokenizer; clean_outputs() strips it
# from decoded generations.
EOS_TOKEN = tokenizer.eos_token

# Training split: used as the pool of few-shot demonstrations.
train_ds = load_dataset("RZ412/PokerBench", split="train")

# read test set as pre-flop and post-flop
dataset_dir = f"{cwd}/datasets"

# Decode explicitly as UTF-8 (the JSON interchange encoding) so the result does
# not depend on the runtime's locale-default encoding.
with open(f'{dataset_dir}/postflop_10k_test_set_prompt_and_label.json', 'r', encoding='utf-8') as f:
  postflop_test_set = json.load(f)

with open(f'{dataset_dir}/preflop_1k_test_set_prompt_and_label.json', 'r', encoding='utf-8') as f:
  preflop_test_set = json.load(f)

README.md:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

(…)lop_500k_train_set_prompt_and_label.json:   0%|          | 0.00/561M [00:00<?, ?B/s]

(…)flop_60k_train_set_prompt_and_label.json:   0%|          | 0.00/59.2M [00:00<?, ?B/s]

(…)tflop_10k_test_set_prompt_and_label.json:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

(…)reflop_1k_test_set_prompt_and_label.json:   0%|          | 0.00/921k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/563200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11000 [00:00<?, ? examples/s]

In [None]:
# function to craft a k-shot prompt.

def compose_random_fewshot_prompt(query, k=1, dataset=None, rng=None):
  """Build a k-shot prompt: k random demonstrations followed by the query.

  Args:
    query: Instruction text the model should answer.
    k: Number of demonstrations to prepend. Demonstrations are drawn
      independently, so the same record can be picked more than once.
    dataset: Indexable collection of records with 'instruction' and 'output'
      keys. Defaults to the module-level `train_ds`.
    rng: Object exposing `randrange(n)` (e.g. `random.Random(seed)`), for
      reproducible sampling. Defaults to the global `random` module, which is
      exactly what the function used before this parameter existed.

  Returns:
    The demonstrations, each rendered as "<instruction> <output>" and joined
    by newlines, followed by a newline, a single space, and `query`.
  """
  if dataset is None:
    dataset = train_ds
  if rng is None:
    rng = random

  examples = []
  for _ in range(k):
    example = dataset[rng.randrange(len(dataset))]
    examples.append(f"{example['instruction']} {example['output']}")

  prompt = "\n".join(examples)
  # NOTE(review): the space after the newline is kept byte-for-byte, because
  # changing the prompt format would change what the model sees.
  prompt += f"\n {query}"

  return prompt

In [None]:
# functions to run inference on one prompt

# Switch the module-level `model` into Unsloth's inference mode once, at cell
# execution time, before the helper functions below call model.generate().
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def clean_outputs(output):
  """Return only the assistant's reply from a decoded generation string.

  The decoded text contains the prompt as well as the completion, so this
  removes every occurrence of the module-level EOS_TOKEN, keeps whatever
  follows the last assistant header, and strips surrounding whitespace. If the
  header is absent, the whole (EOS-free) string is returned stripped.
  """
  assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
  without_eos = output.replace(EOS_TOKEN, "")
  # rpartition yields the text after the LAST header, or the full string when
  # the header does not occur at all.
  _, _, reply = without_eos.rpartition(assistant_header)
  return reply.strip()


def run_inference_on_one_prompt(prompt):
  """Generate a short completion for one prompt and return the cleaned text.

  The prompt is sent as a single user turn through the tokenizer's chat
  template, generation is capped at 8 new tokens, and the first decoded
  sequence is post-processed with clean_outputs(). Relies on the module-level
  `model` and `tokenizer`, and moves the input ids to "cuda".
  """
  chat = [{"role": "user", "content": prompt}]

  input_ids = tokenizer.apply_chat_template(
      chat,
      tokenize = True,
      add_generation_prompt = True,  # Must add for generation
      return_tensors = "pt",
  )
  input_ids = input_ids.to("cuda")

  generated = model.generate(
      input_ids = input_ids,
      max_new_tokens = 8,
      use_cache = True,
      temperature = 1.5,
      min_p = 0.1,
  )
  decoded = tokenizer.batch_decode(generated)

  # Clean every decoded sequence, then hand back the first one.
  return [clean_outputs(text) for text in decoded][0]

In [None]:
# function to run inference on a test set (pre-flop or post-flop) and store the results in a csv. not batched.

def run_testset_inference(test_set, division):
  """Run 2-shot inference over a test set and write the predictions to a CSV.

  Args:
    test_set: Sequence of records with 'instruction' and 'output' keys.
    division: Label used in the output filename -- "preflop" or "postflop".

  Writes f"{cwd}/fewshot_{model_name}_{division}_predictions.csv", overwriting
  any existing file, with a header row followed by one row per record.
  Examples are processed one at a time (not batched).
  """
  csv_name = f"{cwd}/fewshot_{model_name}_{division}_predictions.csv"

  # The csv module requires newline='' on the file object so that it alone
  # controls line endings (and embedded newlines in predictions stay quoted
  # correctly). The encoding is pinned so the file is locale-independent.
  with open(csv_name, 'w', newline='', encoding='utf-8') as preds_file:
    writer = csv.writer(preds_file)
    writer.writerow(["Ground Truth", "Prediction"])  # name the columns this for compatibility w evaluation code

    for query in tqdm(test_set):
      prompt = compose_random_fewshot_prompt(query=query["instruction"], k=2)
      pred = run_inference_on_one_prompt(prompt)

      writer.writerow([query["output"], pred])

In [None]:
# Evaluate the post-flop test set; predictions are written to
# f"{cwd}/fewshot_{model_name}_postflop_predictions.csv".
run_testset_inference(postflop_test_set, "postflop")

100%|██████████| 10000/10000 [1:22:37<00:00,  2.02it/s]
