In [0]:
!pip install torch transformers langchain faiss-cpu
!pip install -U langchain-community sentence_transformers

In [0]:
# import necessary toolkits
import numpy as np
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import faiss

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [0]:
# import data
df = pd.read_csv('test.csv')
# Work on a subset of at most 1000 rows. The seed makes the subset identical on
# every run, and capping n at len(df) avoids a ValueError on smaller files.
df = df.sample(n=min(1000, len(df)), random_state=42)

# Hold out 10% of the sampled rows for validation (seeded for reproducibility).
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
train_questions = train_df['question'].tolist()
train_sql_queries = train_df['sql'].tolist()
val_questions = val_df['question'].tolist()
val_sql_queries = val_df['sql'].tolist()

In [0]:
# setup the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# All four text lists are encoded with the same options: pad within the list,
# truncate over-long sequences, and return PyTorch tensors.
_encode_options = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}

train_tokenized_inputs = tokenizer.batch_encode_plus(train_questions, **_encode_options)
train_tokenized_outputs = tokenizer.batch_encode_plus(train_sql_queries, **_encode_options)
val_tokenized_inputs = tokenizer.batch_encode_plus(val_questions, **_encode_options)
val_tokenized_outputs = tokenizer.batch_encode_plus(val_sql_queries, **_encode_options)

In [0]:
# set up the preprocessing
class SQLOnlineDataset(Dataset):
    """Pairs tokenized questions with tokenized SQL targets for seq2seq training.

    Args:
        tokenized_inputs: mapping holding the 'input_ids' and 'attention_mask'
            tensors of the source questions.
        tokenized_outputs: mapping holding the 'input_ids' and 'attention_mask'
            tensors of the target SQL queries.

    Each item is a dict with the keys 'input_ids', 'attention_mask', 'labels'
    and 'decoder_attention_mask'.
    """

    # Label value that the model's loss skips (see the T5 `labels` documentation).
    IGNORE_INDEX = -100

    def __init__(self, tokenized_inputs, tokenized_outputs):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']
        self.decoder_attention_mask = tokenized_outputs['attention_mask']
        # Padding positions of the targets must not contribute to the loss, so
        # they are replaced by IGNORE_INDEX. masked_fill returns a new tensor,
        # leaving the caller's tokenized_outputs untouched.
        self.labels = tokenized_outputs['input_ids'].masked_fill(
            self.decoder_attention_mask == 0, self.IGNORE_INDEX
        )

    def __len__(self):
        """Return the number of (question, SQL) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the tensors of example `idx` keyed by model argument name."""
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
            'labels': self.labels[idx],
            'decoder_attention_mask': self.decoder_attention_mask[idx]
        }

# Wrap each tokenized split in a dataset, then batch it four examples at a
# time; only the training loader is shuffled.
train_dataset = SQLOnlineDataset(
    tokenized_inputs=train_tokenized_inputs,
    tokenized_outputs=train_tokenized_outputs,
)
val_dataset = SQLOnlineDataset(
    tokenized_inputs=val_tokenized_inputs,
    tokenized_outputs=val_tokenized_outputs,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=4, shuffle=False)

In [0]:
# create the model
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of it. eps and weight_decay are set explicitly to the values the
# transformers implementation used by default, so optimisation is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

In [0]:
num_epochs = 10
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for batch in train_loader:
        # The dataset's keys are exactly the keyword names passed to the model,
        # so the whole batch can be moved to the device and unpacked as-is.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        loss = model(**batch, return_dict=True).loss
        loss.backward()

        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}')

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            val_loss += model(**batch, return_dict=True).loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}')

In [0]:
# Persist the fine-tuned weights (the model's state dict) to disk.
checkpoint_path = 'sql_model.pt'
torch.save(model.state_dict(), checkpoint_path)

In [0]:
# Rebuild the base architecture and restore the fine-tuned weights.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# map_location='cpu' lets a checkpoint written from a GPU session load on a
# machine without CUDA; the freshly built model is on the CPU anyway.
model.load_state_dict(torch.load('sql_model.pt', map_location='cpu'))
# Inference mode (disables dropout); the trailing ';' hides the long model repr.
model.eval();

In [0]:
# initialize the tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# sample question
new_question = "What team has more than 49 laps and a grid of 8?"
# Place the encoded question on whatever device the model currently lives on,
# so generation works whether the model was reloaded on the CPU or is still on
# the GPU from training.
input_ids = tokenizer.encode(new_question, return_tensors='pt').to(model.device)
# Beam search with 5 beams, limited to 150 tokens.
outputs = model.generate(
    input_ids=input_ids,
    max_length=150,
    num_beams=5,
    early_stopping=True,
    length_penalty=1.0
)
sql_query = tokenizer.decode(outputs[0], skip_special_tokens=True)

# print out the result
print(f"Question: {new_question}")
print(f"Generated SQL query: {sql_query}")