In [11]:
from datetime import datetime
import os
import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import Dataset


In [9]:
import json

def read_json_file(file_path):
    """
    Load and return the parsed contents of a UTF-8 encoded JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value produced by ``json.load``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON. The
            message names the offending file and the original error is
            attached as ``__cause__``.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"The file '{file_path}' does not exist.") from exc
    except json.JSONDecodeError as exc:
        # Use exc.msg, not str(exc): JSONDecodeError appends the
        # "line X column Y (char Z)" suffix itself, so embedding str(exc)
        # would repeat the position twice in the final message.
        raise json.JSONDecodeError(
            f"Invalid JSON in file '{file_path}': {exc.msg}", exc.doc, exc.pos
        ) from exc

In [13]:
# Read the raw Spider training split into memory with read_json_file.
# NOTE(review): absolute, machine-specific path — this cell will not run on
# another machine without editing it.
file_path = "/home/yyj/Desktop/yyj/thesis/code/PETSQL/database/spider_data/train_spider.json"
train_data = read_json_file(file_path)


[{'db_id': 'department_management', 'query': 'SELECT count(*) FROM head WHERE age  >  56', 'query_toks': ['SELECT', 'count', '(', '*', ')', 'FROM', 'head', 'WHERE', 'age', '>', '56'], 'query_toks_no_value': ['select', 'count', '(', '*', ')', 'from', 'head', 'where', 'age', '>', 'value'], 'question': 'How many heads of the departments are older than 56 ?', 'question_toks': ['How', 'many', 'heads', 'of', 'the', 'departments', 'are', 'older', 'than', '56', '?'], 'sql': {'from': {'table_units': [['table_unit', 1]], 'conds': []}, 'select': [False, [[3, [0, [0, 0, False], None]]]], 'where': [[False, 3, [0, [0, 10, False], None], 56.0, None]], 'groupBy': [], 'having': [], 'orderBy': [], 'limit': None, 'intersect': None, 'union': None, 'except': None}}, {'db_id': 'department_management', 'query': 'SELECT name ,  born_state ,  age FROM head ORDER BY age', 'query_toks': ['SELECT', 'name', ',', 'born_state', ',', 'age', 'FROM', 'head', 'ORDER', 'BY', 'age'], 'query_toks_no_value': ['select', 'nam

{'db_id': 'department_management',
 'query': 'SELECT count(*) FROM head WHERE age  >  56',
 'query_toks': ['SELECT',
  'count',
  '(',
  '*',
  ')',
  'FROM',
  'head',
  'WHERE',
  'age',
  '>',
  '56'],
 'query_toks_no_value': ['select',
  'count',
  '(',
  '*',
  ')',
  'from',
  'head',
  'where',
  'age',
  '>',
  'value'],
 'question': 'How many heads of the departments are older than 56 ?',
 'question_toks': ['How',
  'many',
  'heads',
  'of',
  'the',
  'departments',
  'are',
  'older',
  'than',
  '56',
  '?'],
 'sql': {'from': {'table_units': [['table_unit', 1]], 'conds': []},
  'select': [False, [[3, [0, [0, 0, False], None]]]],
  'where': [[False, 3, [0, [0, 10, False], None], 56.0, None]],
  'groupBy': [],
  'having': [],
  'orderBy': [],
  'limit': None,
  'intersect': None,
  'union': None,
  'except': None}}

In [24]:
from datasets import load_dataset

def _load_json_dataset(path, label):
    """
    Load one JSON file as a dataset, preview it, and report the outcome.

    Args:
        path: Path of the JSON file handed to ``load_dataset``.
        label: Lower-case name of the split ("training", "validation") used
            in the printed messages.

    Returns:
        The loaded dataset, or ``None`` when loading or previewing failed.
        Returning ``None`` keeps the target variable bound, so a failed
        re-run cannot silently leave a stale dataset from an earlier
        execution in the kernel.
    """
    try:
        # With a single unnamed data file the JSON loader exposes the data
        # under the default "train" split, which is why the validation file
        # is also requested with split='train'.
        loaded = load_dataset('json', data_files=path, split='train')
        print(f"{label.capitalize()} dataset loaded successfully.")
        print(loaded[0])  # Print the first sample
    except Exception as e:
        print(f"Error loading {label} dataset: {e}")
        return None
    return loaded


train_dataset = _load_json_dataset(
    '/home/yyj/Desktop/yyj/thesis/code/PETSQL/data/spider_create_context_train.json',
    'training',
)
eval_dataset = _load_json_dataset(
    '/home/yyj/Desktop/yyj/thesis/code/PETSQL/data/spider_create_context_val.json',
    'validation',
)

Downloading and preparing dataset json/default to file:///home/yyj/.cache/huggingface/datasets/json/default-a268b44d1601307f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1831.57it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 473.02it/s]
                                                        

Dataset json downloaded and prepared to file:///home/yyj/.cache/huggingface/datasets/json/default-a268b44d1601307f/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.
Error loading training dataset: Loading a dataset cached in a LocalFileSystem is not supported.




Downloading and preparing dataset json/default to file:///home/yyj/.cache/huggingface/datasets/json/default-b7eab185ececb3eb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 11244.78it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 870.19it/s]
                                                        

Dataset json downloaded and prepared to file:///home/yyj/.cache/huggingface/datasets/json/default-b7eab185ececb3eb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.
Error loading validation dataset: Loading a dataset cached in a LocalFileSystem is not supported.




In [15]:
def load_spider_dataset(data_dir):
    """
    Read the Spider train/validation JSON files located in ``data_dir``.

    The files ``spider_create_context_train.json`` and
    ``spider_create_context_val.json`` are passed to ``load_dataset`` under
    the split names "train" and "dev" respectively; its result is returned
    unchanged.
    """
    split_files = {
        split_name: os.path.join(data_dir, file_name)
        for split_name, file_name in (
            ("train", "spider_create_context_train.json"),
            ("dev", "spider_create_context_val.json"),
        )
    }
    return load_dataset("json", data_files=split_files)

In [4]:
# ## Step 4: Preprocess the Dataset
# Define a function to preprocess the dataset for fine-tuning.

# %%
def preprocess_dataset(dataset, tokenizer, max_length=512):
    """
    Tokenize a text-to-SQL dataset for fine-tuning.

    Every example becomes a prompt ``"Question: <question> Schema: <schema>"``
    and its ``query`` column is tokenized as the target. Prompts and targets
    are each truncated and padded to ``max_length``.

    Args:
        dataset: Object exposing ``map(fn, batched=True)`` whose batches
            provide the columns "question", "schema" and "query".
        tokenizer: Callable tokenizer with a ``pad_token_id`` attribute; its
            result must support item assignment and expose ``input_ids``.
        max_length: Fixed sequence length for both prompts and targets.

    Returns:
        The result of ``dataset.map``; each batch gains the tokenizer outputs
        for the prompt plus a "labels" column in which padding positions are
        replaced by -100.
    """
    def tokenize_function(examples):
        # Combine the question and schema into a single input
        prompts = [
            f"Question: {q} Schema: {s}"
            for q, s in zip(examples["question"], examples["schema"])
        ]
        targets = examples["query"]

        # Tokenize inputs and targets
        model_inputs = tokenizer(prompts, max_length=max_length, truncation=True, padding="max_length")
        target_ids = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length").input_ids

        # Replace padding token IDs with -100 to ignore them in the loss
        # calculation. The mapping is batched, so target_ids is a list of
        # token-id lists and the replacement has to happen per token; comparing
        # a whole sequence to pad_token_id never matches and masks nothing.
        pad_id = tokenizer.pad_token_id
        model_inputs["labels"] = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in target_ids
        ]
        # NOTE(review): labels are tokenized independently of the prompt. For a
        # decoder-only model the loss is presumably computed position-by-position
        # against input_ids — confirm this input/label layout is intended.

        return model_inputs

    return dataset.map(tokenize_function, batched=True)

In [5]:
# ## Step 5: Fine-Tune the Model
# Define a function to fine-tune the Code Llama model.

def fine_tune_code_llama(model_name, dataset, output_dir, epochs=3, batch_size=8, learning_rate=5e-5):
    """
    Fine-tune a causal language model on the Spider dataset and save it.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        dataset: Dataset with "train" and "dev" splits; it is tokenized with
            ``preprocess_dataset`` before training.
        output_dir: Directory for checkpoints, logs (``<output_dir>/logs``)
            and the final model and tokenizer.
        epochs: Number of training epochs.
        batch_size: Per-device batch size for training and evaluation.
        learning_rate: Learning rate given to ``TrainingArguments``.

    Returns:
        None. The fine-tuned model and tokenizer are written to ``output_dir``.
    """
    # Load the pre-trained model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # preprocess_dataset pads every sequence to a fixed length, which requires
    # a pad token. Fall back to EOS when the tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): the notebook imports the peft LoRA helpers, but this
    # function loads and trains the full model — confirm that is intended.
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Preprocess the dataset
    tokenized_dataset = preprocess_dataset(dataset, tokenizer)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=2,
        # Mixed precision needs a CUDA device; only enable it when one exists
        # so the function does not fail during argument validation elsewhere.
        fp16=torch.cuda.is_available(),
        logging_dir=os.path.join(output_dir, "logs"),
        logging_steps=10,
        report_to="none"
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["dev"],
        tokenizer=tokenizer
    )

    # Fine-tune the model
    trainer.train()

    # Save the fine-tuned model together with its tokenizer so the output
    # directory can be loaded directly with from_pretrained.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

In [6]:
def main():
    """
    Launch fine-tuning with the notebook-level settings.

    Reads the globals ``model_name``, ``dataset``, ``output_dir``, ``epochs``,
    ``batch_size`` and ``learning_rate``; all of them must be defined before
    this function is called.
    """
    # Fine-tune the model
    fine_tune_code_llama(
        model_name=model_name,
        dataset=dataset,
        output_dir=output_dir,
        epochs=epochs,
        batch_size=batch_size,
        learning_rate=learning_rate,
    )


In [16]:
# Paths and parameters for the fine-tuning run; main() reads these as globals.
# NOTE(review): data_dir and output_dir are absolute, machine-specific paths.
model_name = "codellama/CodeLlama-34b-Instruct-hf"  # Model identifier handed to from_pretrained
data_dir = "/home/yyj/Desktop/yyj/thesis/code/PETSQL/data"  # Directory containing the spider_create_context_*.json files
output_dir = "/home/yyj/Desktop/yyj/thesis/code/PETSQL/src/sources/llms"  # Directory to save the fine-tuned model
epochs = 3
batch_size = 8
learning_rate = 5e-5

In [19]:
# Load the train/dev splits from data_dir into the global `dataset`.
# NOTE(review): the recorded run of this cell ended with
# "NotImplementedError: Loading a dataset cached in a LocalFileSystem is not
# supported." — presumably a datasets/fsspec version mismatch; confirm and pin
# compatible versions before relying on this cell.
dataset = load_spider_dataset(data_dir)

Downloading and preparing dataset json/default to file:///home/yyj/.cache/huggingface/datasets/json/default-ca6ccea928c0020a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 13273.11it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 831.05it/s]
                                                        

Dataset json downloaded and prepared to file:///home/yyj/.cache/huggingface/datasets/json/default-ca6ccea928c0020a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.




NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.

In [18]:
# Delete the entire local Hugging Face datasets cache.
# NOTE(review): destructive and affects every project using this cache; the
# execution count shows it was run out of order relative to the cell above.
!rm -rf ~/.cache/huggingface/datasets
