In [29]:
import os
import sys

from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset
from torch.utils.data import Subset

# Put the parent directory on the import path (once) so that the notebook's
# local imports can be resolved.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)


import importlib

In [30]:
import data.sudoku
import data.format

# Re-execute the local modules (in the same order as before) so that edits
# made on disk are picked up without restarting the kernel.
for _local_module in (data.sudoku, data.format):
    importlib.reload(_local_module)

from data.sudoku import Sudoku
from data.format import (
    chat_format_qa_instance,
    lm_format_qa_instance,
    chat_create_fewshot_prompt,
)
# Load environment variables via python-dotenv (presumably from a local .env
# file). SUDOKU_PATH and HF_TOKEN are read from os.environ further down.
load_dotenv()

# Model identifier passed to AutoTokenizer.from_pretrained in a later cell.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"

In [31]:
def load_prep_sudoku_dataset(
    tokenizer, instruction_tuned=True, few_shot=None, test_split_size=0.2, seed=None
):
    """Load the Sudoku QA data, render every example as text, and split it.

    Args:
        tokenizer: Object exposing ``apply_chat_template``. Only used when
            ``instruction_tuned`` is true.
        instruction_tuned: If true, each example is converted with
            ``chat_format_qa_instance`` (or ``chat_create_fewshot_prompt``
            when ``few_shot`` is given) and rendered to a string through the
            tokenizer's chat template. Otherwise each example is formatted
            with ``lm_format_qa_instance`` and the tokenizer is not touched.
        few_shot: Number of leading examples to reserve as few-shot
            demonstrations. They are removed from the returned data. ``None``
            disables few-shot prompting. Ignored when ``instruction_tuned``
            is false.
        test_split_size: Forwarded as ``test_size`` to
            ``Dataset.train_test_split``.
        seed: Forwarded to ``Dataset.train_test_split`` so the shuffle and
            split can be reproduced. ``None`` (the default) keeps the previous
            unseeded behavior.

    Returns:
        The result of ``Dataset.train_test_split`` on a dataset with a single
        ``formatted_text`` column (train and test splits).

    Raises:
        KeyError: If the ``SUDOKU_PATH`` environment variable is not set.
        ValueError: If ``few_shot`` is negative or does not leave at least one
            example in the dataset.
    """
    dataset = Sudoku(data_file=os.environ["SUDOKU_PATH"])

    if instruction_tuned:
        if few_shot is not None:
            total = len(dataset)
            # Fail early: an out-of-range value would otherwise surface later
            # as an opaque indexing or empty-split error.
            if not 0 <= few_shot < total:
                raise ValueError(
                    f"few_shot={few_shot} is out of range: it must be "
                    f"non-negative and smaller than the dataset size ({total})"
                )

            # Pick examples for few-shot learning; the rest are the actual data.
            fewshot_examples = Subset(dataset, range(few_shot))
            dataset = Subset(dataset, range(few_shot, total))
            print(f"Few-shot examples: {len(fewshot_examples)}")
            print(f"Remaining examples: {len(dataset)}")

            conversations = [
                chat_create_fewshot_prompt(
                    qa_instance=example, examples=fewshot_examples, num_shots=few_shot
                )
                for example in dataset
            ]
        else:
            conversations = [chat_format_qa_instance(example) for example in dataset]

        # tokenize=False: the chat template returns rendered strings, not ids.
        formatted_list = tokenizer.apply_chat_template(
            conversations, tokenize=False, add_generation_prompt=False
        )
    else:
        formatted_list = [lm_format_qa_instance(example) for example in dataset]

    dataset = Dataset.from_dict({"formatted_text": formatted_list})

    return dataset.train_test_split(test_size=test_split_size, seed=seed)

In [32]:
# Fetch the tokenizer for MODEL_NAME, authenticating with the HF_TOKEN
# environment variable (raises KeyError if it is not set).
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    token=os.environ["HF_TOKEN"],
)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [33]:
# Build the chat-formatted dataset without few-shot examples. As the last
# expression in the cell, the returned train/test split is displayed below.
load_prep_sudoku_dataset(tokenizer, instruction_tuned=True, few_shot=None)

qa_instance: {'question': 'Given the Sudoku puzzle 1..5.37..6.3..8.9......98...1.......8761..........6...........7.8.9.76.47...6.312, which has 27 clues and a difficulty rating of 2.2. Please solve for the final arrangement.', 'answer': '198543726643278591527619843914735268876192435235486179462351987381927654759864312', 'num_clues': np.int64(27)}
Given the Sudoku puzzle 1..5.37..6.3..8.9......98...1.......8761..........6...........7.8.9.76.47...6.312, which has 27 clues and a difficulty rating of 2.2. Please solve for the final arrangement.
qa_instance: {'question': 'Given the Sudoku puzzle ...81.....2........1.9..7...7..25.934.2............5...975.....563.....4......68., which has 23 clues and a difficulty rating of 0.0. Please solve for the final arrangement.', 'answer': '934817256728653419615942738176425893452398167389176542897564321563281974241739685', 'num_clues': np.int64(23)}
Given the Sudoku puzzle ...81.....2........1.9..7...7..25.934.2............5...975.....563.....4......68

DatasetDict({
    train: Dataset({
        features: ['formatted_text'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['formatted_text'],
        num_rows: 2000
    })
})