# Part 2: Data preparation and model selection

## Data preparation

In [None]:
# Libraries import
import torch
import pandas as pd
import optuna
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import  GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments

## Device setting (GPU Activation)

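CUDA lets PyTorch run tensor operations on an NVIDIA GPU, which makes fine-tuning a transformer model much faster than on a CPU. The cell below selects the first GPU when one is available and falls back to the CPU otherwise.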

In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, else the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Report which device was selected.
print(
    "GPU is available. \nUsing GPU"
    if device.type == 'cuda'
    else "GPU is not available. \nUsing CPU"
)

GPU is available. 
Using GPU


## File loading and DataFrame manipulation 🗃️

In [3]:
# Read the semicolon-delimited query/question pairs, then preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="./data/query_question_sql_copilot.csv",
    delimiter=";",
)
df.head()

Unnamed: 0,query,question
0,SELECT count(*) FROM head WHERE age > 56,How many heads of the departments are older th...
1,"SELECT name , born_state , age FROM head ORD...","List the name, born state and age of the heads..."
2,"SELECT creation , name , budget_in_billions ...","List the creation year, name and budget of eac..."
3,"SELECT max(budget_in_billions) , min(budget_i...",What are the maximum and minimum budget of the...
4,SELECT avg(num_employees) FROM department WHER...,What is the average number of employees of the...


In [4]:
# Count missing values per column.
df.isna().sum()

query       0
question    0
dtype: int64

In [5]:
# Summarize column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88688 entries, 0 to 88687
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   query     88688 non-null  object
 1   question  88688 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [6]:
# Cast every column to str so the tokenizer only ever receives text,
# then re-check the frame summary.
df = df.astype({column: str for column in df.columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88688 entries, 0 to 88687
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   query     88688 non-null  object
 1   question  88688 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB


In [7]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.10,
    random_state=42,
    shuffle=True,
)

In [8]:
# Wrap the training split in a Hugging Face Dataset and display its schema.
train_ds = Dataset.from_pandas(df=train_df)
train_ds

Dataset({
    features: ['query', 'question', '__index_level_0__'],
    num_rows: 79819
})

In [9]:
# Wrap the held-out split in a Hugging Face Dataset and display its schema.
test_ds = Dataset.from_pandas(df=test_df)
test_ds

Dataset({
    features: ['query', 'question', '__index_level_0__'],
    num_rows: 8869
})

## Model loading

In [10]:
# Base checkpoint shared by the tokenizer and the language model.
checkpoint = 'distilgpt2'

# Tokenizer: GPT-2 ships without a padding token, so the end-of-sequence
# token is reused for padding.
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)
tokenizer.pad_token = tokenizer.eos_token

# Causal language model with its LM head.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [34]:
def preprocess_function(examples, max_length=512, separator="\n"):
    """Tokenize a batch of query/question pairs for causal-LM fine-tuning.

    A decoder-only model predicts each token from the preceding tokens of the
    *same* sequence, so ``labels`` must line up position-by-position with
    ``input_ids``. Tokenizing the query as the input and the question as the
    labels pairs unrelated tokens at every position. Each example is therefore
    turned into one sequence, ``query + separator + question + EOS``, and the
    labels are a copy of its input ids.

    Args:
        examples: Batch mapping with ``"query"`` and ``"question"`` entries,
            each a list of strings of the same length.
        max_length: Length every sequence is truncated/padded to.
        separator: Text inserted between the query and the question.

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry in which padded positions are set to -100.
    """
    eos = tokenizer.eos_token
    texts = [
        f"{query}{separator}{question}{eos}"
        for query, question in zip(examples["query"], examples["question"])
    ]
    model_inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=max_length)

    # Mask padding through the attention mask rather than by token id: this
    # notebook sets pad_token = eos_token, so masking by id would also hide the
    # real end-of-sequence token that terminates each example.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

In [35]:
# Tokenize both splits with the same settings (train first, then test) and
# drop the raw text/index columns so only model inputs remain.
tokenized_train, tokenized_test = (
    split.map(
        preprocess_function,
        batched=True,
        batch_size=2000,
        remove_columns=['question', 'query', '__index_level_0__'],
    )
    for split in (train_ds, test_ds)
)

Map: 100%|██████████| 79819/79819 [00:28<00:00, 2777.78 examples/s]
Map: 100%|██████████| 8869/8869 [00:03<00:00, 2476.27 examples/s]


In [None]:
# Collator for causal language modelling (mlm=False): it batches the examples
# and derives the labels from input_ids, ignoring padded positions.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Baseline fine-tuning configuration.
training_args = TrainingArguments(
    output_dir='./gpt2_text2sql',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=tokenized_train,         # training dataset
    eval_dataset=tokenized_test,            # evaluation dataset
    # The collator was previously created but never handed to the Trainer,
    # so it had no effect on how batches and labels were built.
    data_collator=data_collator,
)

In [None]:
# Launch fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
def objective(trial):
    """Run one Optuna trial and return the evaluation loss to minimize.

    Each trial fine-tunes a *fresh* copy of the pretrained checkpoint. Reusing
    the notebook-level ``model`` would let every trial continue training the
    weights left behind by the previous ones, so the losses of different
    hyperparameter sets would not be comparable.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The ``eval_loss`` reported by ``Trainer.evaluate()`` on ``tokenized_test``.
    """
    num_epochs = 2

    # Sample the hyperparameters. `suggest_float` replaces the deprecated
    # `suggest_loguniform` / `suggest_uniform` helpers. The former
    # focal_loss_alpha / focal_loss_gamma parameters were sampled but never
    # used, which only added noise dimensions to the search, so they are gone.
    learning_rate = trial.suggest_float("learning_rate", 1e-7, 5e-6, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.0, 0.3)

    # Training configuration built from the sampled values. `save_steps` was
    # dropped because checkpoints are written per epoch (save_strategy="epoch").
    training_args = TrainingArguments(
        output_dir="./models/finetuned_distilgpt2",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        lr_scheduler_type="cosine",
        warmup_ratio=warmup_ratio,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        save_total_limit=10,
        dataloader_num_workers=0,
        logging_strategy="steps",
        save_strategy="epoch",
        logging_steps=10,
        use_cpu=False,
        report_to="none",
    )

    # Start every trial from the same pretrained weights.
    trial_model = GPT2LMHeadModel.from_pretrained(checkpoint)

    trainer = Trainer(
        model=trial_model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        data_collator=data_collator,
    )

    trainer.train()
    eval_loss = trainer.evaluate()["eval_loss"]

    # Drop this trial's model before the next one is loaded so GPU memory
    # does not accumulate across trials.
    del trainer, trial_model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return eval_loss


# Create the study with a seeded sampler so a rerun proposes the same
# sequence of hyperparameters (an unseeded sampler differs on every run).
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Search for the hyperparameters that minimize the evaluation loss.
study.optimize(objective, n_trials=20)

# Print the best hyperparameters
print("Best hyperparameters:", study.best_params)