In [None]:
import json
import os
from collections import Counter

from sklearn.model_selection import train_test_split

In [None]:
# Load the post-flop and pre-flop test sets from the "Data" directory that
# sits under the current working directory.
cwd = os.getcwd()
dataset_dir = f"{cwd}/Data"

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's default locale encoding.
with open(f'{dataset_dir}/postflop_10k_test_set_prompt_and_label.json', 'r', encoding='utf-8') as f:
    postflop_test_set = json.load(f)

with open(f'{dataset_dir}/preflop_1k_test_set_prompt_and_label.json', 'r', encoding='utf-8') as f:
    preflop_test_set = json.load(f)

In [None]:
# Sample 100 instances from each of the datasets

# Map a label to a coarse action category used for stratification
def categorize_action(output):
    """Return the first action keyword contained in ``output``.

    Keywords are tested with ``in`` (case-sensitive) in the fixed priority
    order 'bet', 'raise', 'check', 'fold'; the first one found is returned.
    If none of them is present the result is 'other'.
    """
    # Order matters: an output containing several keywords resolves to the
    # earliest one in this tuple.
    for keyword in ('bet', 'raise', 'check', 'fold'):
        if keyword in output:
            return keyword
    return 'other'

# Function to perform stratified sampling over action categories
def stratified_sample(dataset, sample_size=100):
    """Draw a sample from ``dataset`` stratified by action category.

    Each item's category is ``categorize_action(item['output'])``. The sample
    is taken as the "test" side of ``train_test_split`` with a fixed
    ``random_state`` of 42, so repeated calls on the same data return the
    same items.

    Args:
        dataset: Indexable sequence of items, each supporting ``item['output']``.
        sample_size: Forwarded to ``train_test_split`` as ``test_size``.

    Returns:
        A list with the sampled items of ``dataset``.

    Raises:
        Whatever ``train_test_split`` raises when the split cannot be made
        (e.g. a ``sample_size`` that leaves no items for the other side).
    """
    # Assign category labels
    action_categories = [categorize_action(item['output']) for item in dataset]

    # Stratified splitting needs at least two members per category; a category
    # that occurs only once would make train_test_split raise. Fold such
    # singleton categories into the most common category so they remain
    # eligible for sampling instead of aborting the whole run.
    category_counts = Counter(action_categories)
    singletons = {name for name, count in category_counts.items() if count < 2}
    if singletons:
        fallback = category_counts.most_common(1)[0][0]
        action_categories = [
            fallback if name in singletons else name
            for name in action_categories
        ]

    # Perform stratified sampling; only the "test" indices are needed
    _, sampled_indices = train_test_split(
        range(len(dataset)),
        test_size=sample_size,
        stratify=action_categories,
        random_state=42
    )

    # Select sampled examples
    return [dataset[i] for i in sampled_indices]

# Draw 100 stratified examples from each test set
postflop_sampled = stratified_sample(postflop_test_set, sample_size=100)
preflop_sampled = stratified_sample(preflop_test_set, sample_size=100)

# Write each sample to the data directory as indented JSON
# (post-flop first, then pre-flop)
for sample_path, sample in (
    (f'{dataset_dir}/postflop_100_sample.json', postflop_sampled),
    (f'{dataset_dir}/preflop_100_sample.json', preflop_sampled),
):
    with open(sample_path, 'w') as f:
        json.dump(sample, f, indent=4)