In [None]:
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git@nightly git+https://github.com/unslothai/unsloth-zoo.git

Found existing installation: unsloth 2025.2.12
Uninstalling unsloth-2025.2.12:
  Successfully uninstalled unsloth-2025.2.12
Collecting git+https://github.com/unslothai/unsloth.git@nightly
  Cloning https://github.com/unslothai/unsloth.git (to revision nightly) to /tmp/pip-req-build-zmkvv6rs
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-req-build-zmkvv6rs
  Running command git checkout -b nightly --track origin/nightly
  Switched to a new branch 'nightly'
  Branch 'nightly' set up to track remote branch 'nightly' from 'origin'.
  Resolved https://github.com/unslothai/unsloth.git to commit 0c1a808e3a5828c615921fe7d3c8c10d7de6324c
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting git+https://github.com/unslothai/unsloth-zoo.git
  Cloning https://github.com/unslothai/unsloth-zoo.git to /tmp/pip-req-bui

In [None]:
!pip install bitsandbytes
!pip install accelerate
!pip install --upgrade transformers
!pip install --upgrade peft
!pip install --upgrade datasets
!pip install trl
!pip install unsloth

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [40]:
import os
import torch
import pandas as pd
from unsloth import FastLanguageModel
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset
from unsloth import is_bfloat16_supported

# LoRA hyperparameter grid
lora_r_values = [8, 16]
lora_alpha_values = [8]

# Load base model once 🔴 DON'T RELOAD INSIDE LOOP
model_name = "unsloth/Llama-3.2-3B-Instruct"
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load and split dataset 🔴 FIXED DATASET HANDLING
dataset = load_dataset('csv', data_files="/content/drive/MyDrive/Capstone Project Model Weights/dataset.csv",
                      split='train', trust_remote_code=True)
dataset = dataset.train_test_split(test_size=0.1, seed=3407)

def format_prompt(examples, tokenizer):
    prompt_style = """
    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have exactly two keys: "question" and "sql".
    DO NOT include any extra text, explanations, or commentary outside the JSON object. The JSON must begin with '{{' and end with '}}' and nothing else.

    The output of your response is directly fed to another system, and no human is reading it. Providing anything other than JSON will cause the other system to crash.
    Format your response as follows:
    {{
      "question": "<The given question>",
      "sql": "<The SQL query>"
    }}

    Example:
    Question: What are all the rows of the table in the database?
    Response:
    {{
      "question": "What are all the rows of the table in the database?",
      "sql": "SELECT * FROM nba_roster;"
    }}

    REMEMBER: DO NOT YAP. DO NOT PROVIDE ANYTHING OTHER THAN THE JSON.
    Now, answer the following question:
    Question: {}
    SQL:
    """
    formatted_texts = []
    for q, sql in zip(examples["question"], examples["sql"]):
        full_text = prompt_style.format(q) + f'\n{{\n "question": "{q}",\n "sql": "{sql}"\n}}\n' + tokenizer.eos_token
        formatted_texts.append(full_text)
    return {"text": formatted_texts}

# Load base model once 🔴 CRITICAL FIX
base_model, original_tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit
)

# 🔴 Add Llama 3 special tokens
original_tokenizer.add_special_tokens({"pad_token": "<|eot_id|>"})
original_tokenizer.add_eos_token = False
original_tokenizer.add_bos_token = False

# Format datasets 🔴 APPLY TO BOTH SPLITS
train_dataset = dataset["train"].map(format_prompt, batched=True,
                                   fn_kwargs={"tokenizer": original_tokenizer})
eval_dataset = dataset["test"].map(format_prompt, batched=True,
                                  fn_kwargs={"tokenizer": original_tokenizer})

results = []
best_metric = float("inf")
best_model_path = ""

for r in lora_r_values:
    for alpha in lora_alpha_values:
        print(f"\n--- Training with r={r}, alpha={alpha} ---")

        # 🔴 USE BASE MODEL, DON'T RELOAD
        model = FastLanguageModel.get_peft_model(
            base_model,  # 🔴 Use original base model
            r=r,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],
            lora_alpha=alpha,
            lora_dropout=0.1,  # 🔴 Added dropout for regularization
            bias="none",
            use_gradient_checkpointing=True,  # 🔴 Use native implementation
            random_state=3407,
            use_rslora=False,
        )

        training_args = TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,  # 🔴 Better than fixed steps
            num_train_epochs=3,  # 🔴 Proper epoch-based training
            evaluation_strategy="epoch",
            learning_rate=2e-5,  # 🔴 Increased from 1e-5
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=20,
            optim="adamw_torch",
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            seed=3407,
            output_dir=f"outputs_r{r}_alpha{alpha}",
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            report_to="none",
            save_strategy="epoch",
            save_total_limit=1,
        )

        trainer = SFTTrainer(
            model=model,
            tokenizer=original_tokenizer,  # 🔴 Use original tokenizer
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            dataset_text_field="text",
            max_seq_length=max_seq_length,
            args=training_args,
            packing=False,
            neftune_noise_alpha=5,
        )

        # 🔴 Proper training with evaluation
        train_results = trainer.train()
        eval_results = trainer.evaluate()

        # 🔴 Track both training and validation metrics
        results.append({
            "r": r,
            "alpha": alpha,
            "train_loss": train_results.training_loss,
            "eval_loss": eval_results["eval_loss"],
        })

        # Save only if best model
        if eval_results["eval_loss"] < best_metric:
            best_metric = eval_results["eval_loss"]
            best_model_path = f"best_model_r{r}_alpha{alpha}"
            trainer.save_model(best_model_path)
            print(f"New best model saved to {best_model_path}")

# Print results
results_df = pd.DataFrame(results)
print("\nFinal Results:")
print(results_df.sort_values(by="eval_loss"))

if best_model_path:
    print(f"\nBest model saved at: {best_model_path}")

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Map:   0%|          | 0/281 [00:00<?, ? examples/s]

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.



--- Training with r=8, alpha=8 ---


Unsloth 2025.2.15 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


Converting train dataset to ChatML (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Converting eval dataset to ChatML (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

Truncating eval dataset (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 281 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 105
 "-____-"     Number of trainable parameters = 12,156,928


Epoch,Training Loss,Validation Loss
1,1.5931,1.305235
2,0.8654,0.839676


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


New best model saved to best_model_r8_alpha8

--- Training with r=16, alpha=8 ---




Applying chat template to train dataset (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Truncating train dataset (num_proc=2):   0%|          | 0/281 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

Truncating eval dataset (num_proc=2):   0%|          | 0/32 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 281 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 105
 "-____-"     Number of trainable parameters = 24,313,856


Epoch,Training Loss,Validation Loss
1,1.5956,1.309249
2,0.8744,0.848887



Final Results:
    r  alpha  train_loss  eval_loss
0   8      8    1.175054   0.839676
1  16      8    1.180971   0.848887

Best model saved at: best_model_r8_alpha8


In [42]:
dataset['test'][0]["question"]

'Which college has produced the most NBA players'

In [24]:
print(dataset[0])

{'question': 'What college has the most players in the NBA who are 30 years old or older', 'sql': 'SELECT COLLEGE, COUNT(*) AS count FROM nba_roster WHERE AGE >= 30 GROUP BY COLLEGE ORDER BY count DESC LIMIT 1;', 'text': '\n    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:\n\n    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").\n    NAME: A string containing the player\'s name (e.g., "Saddiq Bey", "Corey Kispert").\n    Jersey: A string containing the player\'s jersey number (e.g., "1", "55").\n    POS: A string containing the player\'s position (e.g., "SF", "SG", "G").\n    AGE: An integer containing the player\'s age (e.g., 23, 31).\n    HT: A string containing the player\'s height in feet and inches (e.g., "6\'6", "6\'2").\n    WT: A string containing the player\'s weight in pounds (e.g., "180 lbs", "185 lbs").\n    COLLEGE: A string containing the player\'s college name (e.g., "

##Inference Code

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
!pip install unsloth transformers peft accelerate bitsandbytes

Collecting unsloth
  Downloading unsloth-2025.2.15-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting unsloth_zoo>=2025.2.7 (from unsloth)
  Downloading unsloth_zoo-2025.2.7-py3-none-any.whl.metadata (16 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.16-py3-none-any.whl.metadata (9.4 kB)
Collecting datasets>=2.16.0 (from unsloth)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading trl-0.15.2-py3-none-an

In [1]:
from unsloth import FastLanguageModel
from peft import PeftModel
from transformers import AutoTokenizer
import torch
import json

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [43]:
base_model_name = "unsloth/Llama-3.2-3B-Instruct"
adapter_path = "/content/best_model_r8_alpha8"  # Replace with your path
max_seq_length = 2048

In [44]:
# Load base model
base_model, _ = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
)

==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [45]:
# Load trained adapter
model = PeftModel.from_pretrained(base_model, adapter_path)

In [46]:
# Load tokenizer from saved directory
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

In [53]:
def generate_response(question):
    # Format prompt using the same template as training
    prompt_template = """
    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have exactly two keys: "question" and "sql".
    DO NOT include any extra text, explanations, or commentary outside the JSON object. The JSON must begin with '{{' and end with '}}' and nothing else.

    The output of your response is directly fed to another system, and no human is reading it. Providing anything other than JSON will cause the other system to crash.
    Format your response as follows:
    {{
      "question": "<The given question>",
      "sql": "<The SQL query>"
    }}

    Example:
    Question: What are all the rows of the table in the database?
    Response:
    {{
      "question": "What are all the rows of the table in the database?",
      "sql": "SELECT * FROM nba_roster;"
    }}

    REMEMBER: DO NOT YAP. DO NOT PROVIDE ANYTHING OTHER THAN THE JSON.
    Now, answer the following question:
    Question: {}
    SQL:
    """
    # prompt_template = """[Your exact prompt template from training code]"""
    formatted_prompt = prompt_template.format(question) + tokenizer.eos_token

    # Tokenize input
    inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")

    FastLanguageModel.for_inference(model)

    # Generate response
    outputs = model.generate(
        **inputs,
        max_new_tokens=200,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        do_sample=True,
    )

    # Decode and extract JSON
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    json_start = full_response.find('{')
    json_end = full_response.rfind('}') + 1

    if json_start == -1 or json_end == 0:
        return {"error": "No JSON found in response"}, full_response

    try:
        return json.loads(full_response[json_start:json_end]), full_response
    except json.JSONDecodeError:
        return {"error": "Invalid JSON format"}, full_response

In [58]:
#Eg 1 (Actual test dataset)
question = "How old is Lebron James?"
response,raw_output = generate_response(question)

print("Raw model output:")
print(raw_output)
print("\nParsed response:")
print(response)

Raw model output:

    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have

In [57]:
#Eg 2 (ficticious team)
question = "What are the names of all players on the Atlanta Hawks?"
response,raw_output = generate_response(question)

print("Raw model output:")
print(raw_output)
print("\nParsed response:")
print(response)

Raw model output:

    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have

In [50]:
# Another type of code for getting output
# Eg1 (A question from training dataset)
from transformers import TextStreamer
prompt_template = """
    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have exactly two keys: "question" and "sql".
    DO NOT include any extra text, explanations, or commentary outside the JSON object. The JSON must begin with '{{' and end with '}}' and nothing else.

    The output of your response is directly fed to another system, and no human is reading it. Providing anything other than JSON will cause the other system to crash.
    Format your response as follows:
    {{
      "question": "<The given question>",
      "sql": "<The SQL query>"
    }}

    Example:
    Question: What are all the rows of the table in the database?
    Response:
    {{
      "question": "What are all the rows of the table in the database?",
      "sql": "SELECT * FROM nba_roster;"
    }}

    REMEMBER: DO NOT YAP. DO NOT PROVIDE ANYTHING OTHER THAN THE JSON.
    Now, answer the following question:
    Question: What is the average age of all players in the NBA
    SQL:
    """
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
formatted_prompt = prompt_template.format(question) + tokenizer.eos_token
inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)

<|begin_of_text|>
    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have 

In [59]:
# Another type of code for getting output
# Eg2
from transformers import TextStreamer
prompt_template = """
    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have exactly two keys: "question" and "sql".
    DO NOT include any extra text, explanations, or commentary outside the JSON object. The JSON must begin with '{{' and end with '}}' and nothing else.

    The output of your response is directly fed to another system, and no human is reading it. Providing anything other than JSON will cause the other system to crash.
    Format your response as follows:
    {{
      "question": "<The given question>",
      "sql": "<The SQL query>"
    }}

    Example:
    Question: What are all the rows of the table in the database?
    Response:
    {{
      "question": "What are all the rows of the table in the database?",
      "sql": "SELECT * FROM nba_roster;"
    }}

    REMEMBER: DO NOT YAP. DO NOT PROVIDE ANYTHING OTHER THAN THE JSON.
    Now, answer the following question:
    Question: What's the average age of the Trail Blazers?
    SQL:
    """
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
formatted_prompt = prompt_template.format(question) + tokenizer.eos_token
inputs = tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_seq_length,
    ).to("cuda")
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64)

<|begin_of_text|>
    You are a data analyst bot working with a database. The table nba_roster has the following columns and metadata:

    Team: A string containing the team name (e.g., "Atlanta Hawks", "Washington Wizards").
    NAME: A string containing the player's name (e.g., "Saddiq Bey", "Corey Kispert").
    Jersey: A string containing the player's jersey number (e.g., "1", "55").
    POS: A string containing the player's position (e.g., "SF", "SG", "G").
    AGE: An integer containing the player's age (e.g., 23, 31).
    HT: A string containing the player's height in feet and inches (e.g., "6'6", "6'2").
    WT: A string containing the player's weight in pounds (e.g., "180 lbs", "185 lbs").
    COLLEGE: A string containing the player's college name (e.g., "Utah", "Toledo").
    SALARY: A string containing the player's salary, formatted with a dollar sign (e.g., "$8,195,122", "$1,719,864").

    You must generate a response that is ONLY a JSON object. The JSON object must have 