In [None]:
!pip install -q -U transformers
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U peft
!pip install -q -i https://pypi.org/simple/ bitsandbytes
!pip install -q -U trl
!pip install rouge-score

In [1]:
from huggingface_hub import login

hf_token = "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"

login(hf_token)

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments, # Note: SFTConfig from TRL is used later
                          pipeline,
                          logging)

# Explicitly import Gemma3ForCausalLM
from transformers.models.gemma3 import Gemma3ForCausalLM

from datasets import Dataset
from peft import LoraConfig, PeftConfig, PeftModel

import bitsandbytes as bnb

from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)

from sklearn.model_selection import train_test_split

# Check transformers version
print(f"transformers=={transformers.__version__}")

2025-05-18 20:10:18.443262: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747599018.466553     482 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747599018.473466     482 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


transformers==4.51.3


In [5]:
def define_device():
    """Determine and return the optimal PyTorch device based on availability."""

    print(f"PyTorch version: {torch.__version__}", end=" -- ")

    # Check if MPS (Metal Performance Shaders) is available for macOS
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
        print("using MPS device on macOS")
        return torch.device("mps")

    # Check for CUDA availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"using {device}")
    return device

In [6]:
# Determine optimal computation dtype based on GPU capability
# Use bfloat16 if Compute Capability >= 8.0, otherwise float16
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability()[0] >= 8 else torch.float16
print(f"Using compute dtype {compute_dtype}")

# Select the best available device (CPU, CUDA, or MPS)
device = define_device()
print(f"Operating on {device}")

# Path to the pre-trained model (adjust if necessary)
GEMMA_PATH = "/kaggle/input/gemma-3/transformers/gemma-3-1b-it/1"

# Load the model with optimized settings
model = Gemma3ForCausalLM.from_pretrained(
    GEMMA_PATH,
    torch_dtype=compute_dtype,
    attn_implementation="eager", # Specify attention implementation
    low_cpu_mem_usage=True,      # Reduces CPU RAM usage during loading
    device_map=device            # Automatically map model layers to the device
)

# Define maximum sequence length for the tokenizer
max_seq_length = 8192 # Gemma 3 supports long contexts

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    GEMMA_PATH,
    max_seq_length=max_seq_length,
    device_map=device # Map tokenizer operations if relevant (less common)
)

# Store the EOS token for later use in prompts
EOS_TOKEN = tokenizer.eos_token

Using compute dtype torch.float16
PyTorch version: 2.6.0+cu124 -- using cuda
Operating on cuda


In [7]:
# Check if all model parameters are on the CUDA device
is_on_gpu = all(param.device.type == 'cuda' for param in model.parameters())
print("Model is on GPU:", is_on_gpu)

Model is on GPU: True


In [8]:
def apply_qa_template_train(row):
    return f"""<|system|>
Ты помощник, который помогает переводить вопросы на русском языке в SQL-запросы к базе данных.</s>
<|user|>
Контекст таблицы:
{row['context']}

Вопрос:
{row['translated_question']}</s>
<|assistant|>
{row['answer']}"""

def transform_train_data(data):
  return apply_qa_template_train(data)

In [9]:
def apply_qa_template_eval(row):
    return f"""<|system|>
Ты — специалист по базам данных. На основе контекста и вопроса на русском языке сгенерируй только SQL-запрос. Никаких пояснений, только SQL.</s>
<|user|>
Контекст таблицы:
{row['context']}

Запрос:
{row['translated_question']}</s>
<|assistant|>
"""

def transform_eval_data(data):
  return apply_qa_template_eval(data)

In [10]:
data = pd.read_csv("/kaggle/input/dataset/data_translated.csv")

In [11]:
data.head()

Unnamed: 0,context,question,answer,translated_question
0,CREATE TABLE head (age INTEGER),How many heads of the departments are older th...,SELECT COUNT(*) FROM head WHERE age > 56,Сколько руководителей департаментов старше 56 ...
1,"CREATE TABLE head (name VARCHAR, born_state VA...","List the name, born state and age of the heads...","SELECT name, born_state, age FROM head ORDER B...","Укажите фамилию, имя, отчество и возраст руков..."
2,"CREATE TABLE department (creation VARCHAR, nam...","List the creation year, name and budget of eac...","SELECT creation, name, budget_in_billions FROM...","Укажите год создания, название и бюджет каждог..."
3,CREATE TABLE department (budget_in_billions IN...,What are the maximum and minimum budget of the...,"SELECT MAX(budget_in_billions), MIN(budget_in_...",Каковы максимальный и минимальный бюджет депар...
4,CREATE TABLE department (num_employees INTEGER...,What is the average number of employees of the...,SELECT AVG(num_employees) FROM department WHER...,Каково среднее число сотрудников департаментов...


In [12]:
train_df, eval_df = train_test_split(data, test_size=0.1, random_state=77)

In [13]:
train_df["text"] = train_df.apply(transform_train_data, axis=1)

In [14]:
eval_df["text"] = eval_df.apply(transform_eval_data, axis=1)

In [15]:
train_dataset = Dataset.from_pandas(train_df[["text"]]).select(range(10000))
eval_dataset = Dataset.from_pandas(eval_df[["text"]]).select(range(1000))

In [16]:
print(eval_df.iloc[0]["text"])

<|system|>
Ты — специалист по базам данных. На основе контекста и вопроса на русском языке сгенерируй только SQL-запрос. Никаких пояснений, только SQL.</s>
<|user|>
Контекст таблицы:
CREATE TABLE table_name_5 (date VARCHAR, opponents VARCHAR)

Запрос:
Турнир с соперницей Келли Де Бир Евой Пера был сыгран в какой день?
</s>
<|assistant|>



In [17]:
print(train_df.iloc[0]["text"])

<|system|>
Ты помощник, который помогает переводить вопросы на русском языке в SQL-запросы к базе данных.</s>
<|user|>
Контекст таблицы:
CREATE TABLE table_name_97 (crowd INTEGER, home_team VARCHAR)

Вопрос:
Какая самая маленькая толпа была на домашнем матче "Эссендона"?
</s>
<|assistant|>
SELECT MIN(crowd) FROM table_name_97 WHERE home_team = "essendon"


In [18]:
def predict_sql(eval_df, model, tokenizer, device="cuda", max_new_tokens=128, temperature=0.0):
    """Генерация SQL-запросов по текстовому промпту в формате ChatML."""
    predicted_sql = []

    model.eval()

    for i in tqdm(range(len(eval_df)), desc="Генерация SQL"):
        prompt = eval_df[i]["text"]  # поле без gold answer

        # Токенизация
        input_ids = tokenizer(prompt, return_tensors="pt").to(device)

        # Генерация
        with torch.no_grad():
            outputs = model.generate(
                **input_ids,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                do_sample=False
            )

        # Декодирование всего текста
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Извлекаем только SQL часть (после <|assistant|>)
        if "<|assistant|>" in decoded:
            predicted = decoded.split("<|assistant|>")[-1].strip()
        else:
            predicted = decoded.strip()

        predicted_sql.append(predicted)

    return predicted_sql

In [19]:
# eval_predicted = predict_sql(eval_df=eval_dataset, model=model, tokenizer=tokenizer)

In [20]:
print(eval_df.iloc[3]["text"])

<|system|>
Ты — специалист по базам данных. На основе контекста и вопроса на русском языке сгенерируй только SQL-запрос. Никаких пояснений, только SQL.</s>
<|user|>
Контекст таблицы:
CREATE TABLE table_24172157_3 (date_of_vacancy VARCHAR, table VARCHAR, team VARCHAR)

Запрос:
Какова дата освобождения места для команды "Ливерпуль", таблица которой называется предсезонной?
</s>
<|assistant|>



In [None]:
print(eval_df.iloc[3]["answer"])

In [None]:
eval_predicted[3]

In [None]:
eval_predicted

In [21]:
import re

def clean_sql(text):
    """
    Удаляет обёртки ```sql ... ``` и обрезает по первой строке, если нужно.
    """
    if not isinstance(text, str):
        return ""

    # Убираем блоки ```sql и ```
    cleaned = re.sub(r"```sql\s*", "", text, flags=re.IGNORECASE)
    cleaned = re.sub(r"```", "", cleaned)
    return cleaned.strip()

In [None]:
clean_predicts = list(map(clean_sql, eval_predicted))

In [None]:
answers = eval_df.head(50)["answer"].to_list()

In [22]:
from rouge_score import rouge_scorer
import numpy as np

def compute_rouge_scores(predictions, references, use_stemmer=True):
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=use_stemmer)

    scores = {"rouge1": [], "rouge2": [], "rougeL": []}

    for pred, ref in zip(predictions, references):
        result = scorer.score(ref, pred)
        for key in scores:
            scores[key].append(result[key].fmeasure)

    return {
        "ROUGE-1": round(np.mean(scores["rouge1"]), 4),
        "ROUGE-2": round(np.mean(scores["rouge2"]), 4),
        "ROUGE-L": round(np.mean(scores["rougeL"]), 4),
    }

In [None]:
compute_rouge_scores(clean_predicts, answers)

In [23]:
from transformers import TrainingArguments
from peft import LoraConfig

# LoRA Configuration
peft_config = LoraConfig(
    lora_alpha=16,                           # Scaling factor for LoRA
    lora_dropout=0.05,                       # Add slight dropout for regularization
    r=64,                                    # Rank of the LoRA update matrices
    bias="none",                             # No bias reparameterization
    task_type="CAUSAL_LM",                   # Task type: Causal Language Modeling
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],  # Target modules for LoRA
)


In [24]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  
)

In [25]:
training_arguments = TrainingArguments(
    output_dir="output",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    optim="adamw_torch",
    num_train_epochs=2,
    logging_steps=0.2,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    group_by_length=True,
    report_to="none",
    remove_unused_columns=False, 
)

In [26]:
def tokenize(example):
    out = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=1024,
    )
    out["labels"] = out["input_ids"].copy()
    return out

tokenized_train = train_dataset.map(tokenize, remove_columns=["text"])
tokenized_eval = eval_dataset.map(tokenize, remove_columns=["text"])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [27]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [2]:
trainer.train()

NameError: name 'trainer' is not defined

In [None]:
model.save_pretrained("gemma3:1b-text2sql-lora")
tokenizer.save_pretrained("gemma3:1b-text2sql-lora")

In [None]:
from huggingface_hub import HfApi

# Название репозитория
repo_name = "your-username/gemma3:1b-text2sql-lora"

# Создаём репозиторий (если ещё нет)
api = HfApi()
api.create_repo(repo_name, repo_type="model", private=False)  # или private=True

# Загружаем
from huggingface_hub import upload_folder

upload_folder(
    repo_id=repo_name,
    folder_path="gemma3-text2sql-lora",
    repo_type="model"
)

In [None]:
# model.push_to_hub("Arsench1k/gemma3:1b-text2sql-lora")
# tokenizer.push_to_hub("Arsench1k/gemma3:1b-text2sql-lora")