In [5]:
%%capture
# Install Unsloth plus the companion libraries this notebook needs.
# `%%capture` has to stay on the first line of the cell; it swallows the pip output.
# NOTE(review): nothing below is version-pinned and Unsloth is installed from the
# repository head, so a re-run at a later date may resolve different packages.
import torch
# CUDA compute capability of the current GPU as (major, minor). Only the major
# number is used below to choose a package set; `minor_version` is not read again
# in this cell.
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    # NOTE(review): unlike the branch above, this list has no `trl` and adds
    # `transformers` (installed with --no-deps) - confirm that is intended.
    !pip install --no-deps peft accelerate bitsandbytes transformers xformers
# No-op statement; it has no effect on the installs above.
pass

In [3]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

# Inference using Transformers + PEFT

In [4]:
# Hub repository id of the PEFT adapter that the loading cell further down attaches
# to its base model.
PEFT_MODEL = "adnaan525/div_last_move_5000_chess"

# bitsandbytes options handed to `from_pretrained` as `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,               # load the weights 4-bit quantized
    bnb_4bit_quant_type="nf4",       # 4-bit quantization type
    bnb_4bit_use_double_quant=True,  # enable the double-quantization option
    # NOTE(review): the Unsloth banner printed later in this notebook reports a
    # Tesla T4 with "Bfloat16 = FALSE" - confirm bfloat16 is the intended compute
    # dtype on that GPU.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [5]:
# Fetch the adapter's PEFT configuration; its `base_model_name_or_path` field
# names the checkpoint that is loaded as the base model and tokenizer below.
config = PeftConfig.from_pretrained(PEFT_MODEL)

# Base model, quantized according to `bnb_config` and placed via device_map="auto".
# NOTE(review): trust_remote_code=True lets code shipped in the model repository
# run locally - confirm the base repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    return_dict=True,
)

# Tokenizer of the same base checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Wrap the base model with the fine-tuned adapter weights.
# NOTE(review): the generation cells below call `model`, not `model_falcon` -
# confirm the adapter is active on that code path.
model_falcon = PeftModel.from_pretrained(model, PEFT_MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

In [8]:
# Fixed wording placed before and after the list of previous moves.
prompt_question_1 = "in a paragraph, explain the rationale behind the last move, where all previous moves are:"
prompt_question_2 = ", and last move is "

# Sample position: the moves played so far, and the move whose rationale is requested.
test_move = "e4 c5 Nf3 Nc6 Bc4 g6 Ng5 Ne5 Bb3 h6 Nf3 Bg7 Nxe5 Bxe5 O-O Qc7 Qf3 Nf6 h3 h5 c3 O-O d3 Nh7 g4 hxg4 Qxg4 d6 Qxg6+ Kh8 Bxf7 Bxh3 Re1 Rg8 Bxg8 Rxg8 Qxg8+ Kxg8 Re3"
test_last_move = "Qc8"

# Two-turn prompt: the question on the <human> line, then an open <assistant> line
# for the model to complete. The surrounding newlines are kept exactly as in the
# triple-quoted form this replaces.
setup = (
    "\n"
    f"<human>: {prompt_question_1}{test_move}{prompt_question_2}{test_last_move}\n"
    "<assistant>:\n"
)

In [9]:
# The template starts and ends with a newline; strip() removes that surrounding whitespace.
prompt = setup.strip()

# This is the model's own generation-config object (no copy is made here), so the
# overrides below are written onto `model.generation_config` itself.
generation_config = model.generation_config

# NOTE(review): `do_sample` is not set anywhere in this cell - confirm that the
# temperature / top_p values below actually take effect for this model.
for _option, _value in (
    ("max_new_tokens", 200),
    ("temperature", 0.7),
    ("top_p", 0.7),
    ("num_return_sequences", 1),
    ("pad_token_id", tokenizer.eos_token_id),
    ("eos_token_id", tokenizer.eos_token_id),
):
    setattr(generation_config, _option, _value)

In [10]:
%%time
# Generate a completion for `prompt` and print it. `%%time` has to stay on the first
# line of the cell; it reports the CPU and wall time shown under the output.
# NOTE(review): the device is hard-coded although the model was loaded with
# device_map="auto" - confirm the model's input layer really sits on cuda:0.
device = "cuda:0"

# Tokenize the prompt into PyTorch tensors and move them to `device`.
encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  # Only `input_ids` and `attention_mask` are forwarded; any other field the
  # tokenizer may return is not passed to generate().
  # NOTE(review): this calls `model`, not the PEFT-wrapped `model_falcon` created
  # in the loading cell - confirm the adapter is in effect here.
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

# Decode the first returned sequence without special tokens; as the cell output
# shows, the decoded text begins with the prompt itself.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))



<human>: in a paragraph, explain the rationale behind the last move, where all previous moves are:e4 c5 Nf3 Nc6 Bc4 g6 Ng5 Ne5 Bb3 h6 Nf3 Bg7 Nxe5 Bxe5 O-O Qc7 Qf3 Nf6 h3 h5 c3 O-O d3 Nh7 g4 hxg4 Qxg4 d6 Qxg6+ Kh8 Bxf7 Bxh3 Re1 Rg8 Bxg8 Rxg8 Qxg8+ Kxg8 Re3, and last move is Qc8
<assistant>: The last move, Qc8, was likely made to further develop the queen and put pressure on the opponent's position. By placing the queen on c8, it controls the central squares and puts pressure on the opponent's king side. Additionally, it also prepares for potential future attacks on the opponent's king side. Overall, the move Qc8 is a strategic decision to improve the position of the queen and increase the pressure on the opponent's position.
CPU times: user 13.8 s, sys: 734 ms, total: 14.6 s
Wall time: 20.3 s


# Inference using Unsloth

In [6]:
from unsloth import FastLanguageModel

In [12]:
import zipfile
import os

def unzip_file(zip_file, extract_to=".") -> None:
    """Extract every member of a zip archive into a directory.

    Args:
        zip_file: Path to the ``.zip`` archive to read.
        extract_to: Directory that receives the extracted files. Defaults to
            ``"."`` (the current working directory), so the destination may be
            omitted. Missing directories are created by ``extractall``.

    Returns:
        None.

    Raises:
        FileNotFoundError: If ``zip_file`` does not exist.
        zipfile.BadZipFile: If ``zip_file`` is not a valid zip archive.
    """
    # The context manager closes the archive even if extraction fails part-way.
    with zipfile.ZipFile(zip_file, "r") as archive:
        archive.extractall(extract_to)

# Unpack the zipped model archive. The destination is an empty string, which
# zipfile resolves relative to the current working directory.
zip_file = "gemma_model_only_ai_gen.zip"
extract_to = ""

unzip_file(zip_file=zip_file, extract_to=extract_to)


In [13]:
# Load model and tokenizer through Unsloth. This rebinds `model` and `tokenizer`,
# replacing the objects created in the Transformers section above.
# NOTE(review): "gemma_model_only_ai_gen" matches the archive name unpacked earlier,
# so it is presumably the extracted local directory - confirm.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="gemma_model_only_ai_gen",
    max_seq_length=2048,
    load_in_4bit=True,
    # NOTE(review): dtype=None presumably leaves the dtype choice to Unsloth - confirm.
    dtype=None,
)

config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Gemma patching release 2024.3
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. Xformers = 0.0.25.post1. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.57G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth 2024.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [14]:
# Switch the Unsloth-loaded model to inference mode before calling generate().
# NOTE(review): the "2x faster" figure below is the library's own claim; it is not measured in this notebook.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [17]:
import textwrap
def process_response(response, width=100):
    """Print a decoded model response without special tokens, wrapped to ``width``.

    The literal markers ``<bos>`` and ``<eos>`` are removed from ``response``,
    the text is split on newlines, and every resulting line is wrapped
    independently with ``textwrap.fill`` before being printed.

    Args:
        response: Decoded text, e.g. the result of ``tokenizer.decode``.
        width: Maximum printed line width in characters. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        None. The formatted text is written to stdout.
    """
    cleaned = response.replace("<bos>", "").replace("<eos>", "")
    # Wrap line by line so the newlines already present in the response survive;
    # textwrap would otherwise treat them as ordinary whitespace.
    wrapped = [textwrap.fill(line, width=width) for line in cleaned.split("\n")]
    print("\n".join(wrapped))

# Prompt template with four positional slots: instruction, previous moves, last
# move, and the response text (left empty at inference time so the model fills it in).
# The leading newline is part of the template.
prompt_chess = (
    "\n"
    "Instruction:{}; previous moves:{}; last move:{}.\n"
    "Response:{}"
)

In [19]:
# Author's reading of the position: Black plays the queen to c8 to protect the
# bishop from White's rook.
# Fill the template (response slot left empty), tokenize the single-prompt batch
# into PyTorch tensors and move them to the GPU.
inputs = tokenizer(
    [
        prompt_chess.format(
            "in a paragraph, explain the rationale behind the last move, where all previous moves are",
            "e4 c5 Nf3 Nc6 Bc4 g6 Ng5 Ne5 Bb3 h6 Nf3 Bg7 Nxe5 Bxe5 O-O Qc7 Qf3 Nf6 h3 h5 c3 O-O d3 Nh7 g4 hxg4 Qxg4 d6 Qxg6+ Kh8 Bxf7 Bxh3 Re1 Rg8 Bxg8 Rxg8 Qxg8+ Kxg8 Re3",
            "Qc8",
            "",
        ),
    ],
    return_tensors="pt",
).to("cuda")

# Generate up to 512 new tokens for the prompt.
outputs = model.generate(**inputs, max_new_tokens=512, use_cache=True)

# Decode the first sequence (special tokens included) and let process_response
# strip the <bos>/<eos> markers and wrap the text for display.
process_response(tokenizer.decode(outputs[0]))



Instruction:in a paragraph, explain the rationale behind the last move, where all previous moves
are; previous moves:e4 c5 Nf3 Nc6 Bc4 g6 Ng5 Ne5 Bb3 h6 Nf3 Bg7 Nxe5 Bxe5 O-O Qc7 Qf3 Nf6 h3 h5 c3
O-O d3 Nh7 g4 hxg4 Qxg4 d6 Qxg6+ Kh8 Bxf7 Bxh3 Re1 Rg8 Bxg8 Rxg8 Qxg8+ Kxg8 Re3; last move:Qc8.
Response:The last move, Qc8, was likely made to protect the bishop on h3 and prevent any potential
threats from the opponent's pieces. By moving the queen to c8, the player is also preparing to
potentially launch an attack on the opponent's king side. Additionally, the queen on c8 is now in a
more centralized position, allowing for better control of the board and potential future tactical
opportunities. Overall, the move Qc8 was a strategic decision to improve the player's position and
maintain control of the game.
