# LSTM

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, TimeDistributed, RepeatVector, Bidirectional

In [2]:
# Load the paired examples: column 'nl' holds the natural-language question,
# column 'syntax' holds the SQL template it should map to.
sql_syntax = pd.read_csv('queries_db_all.csv')
nl_texts = sql_syntax['nl']
sql_texts = sql_syntax['syntax']

# --- Natural-language side (model input) ---
# filters='' passes an empty filter string, so no characters are stripped
# from the text before it is split into tokens.
tokenizer_nl = Tokenizer(filters='')
tokenizer_nl.fit_on_texts(nl_texts)
nl_sequences = tokenizer_nl.texts_to_sequences(nl_texts)
max_nl_length = max(map(len, nl_sequences))
nl_padded = pad_sequences(nl_sequences, maxlen=max_nl_length, padding='post')

# --- SQL side (model target) ---
tokenizer_sql = Tokenizer(filters='')
tokenizer_sql.fit_on_texts(sql_texts)
sql_sequences = tokenizer_sql.texts_to_sequences(sql_texts)
max_sql_length = max(map(len, sql_sequences))
sql_padded = pad_sequences(sql_sequences, maxlen=max_sql_length, padding='post')

# One extra slot on top of the learned vocabulary: id 0 is the padding value
# (decode_sequence below skips it), so real token ids start at 1.
sql_vocab_size = len(tokenizer_sql.word_index) + 1

# Conventional names for the training arrays.
X, y = nl_padded, sql_padded

In [8]:
# Encoder-decoder stack:
#   1. embed the NL token ids,
#   2. compress the whole question into one vector (return_sequences=False),
#   3. repeat that vector once per output position (max_sql_length),
#   4. decode it into a sequence and emit a softmax over the SQL vocabulary
#      at every position.
nl_vocab_size = len(tokenizer_nl.word_index) + 1

model = Sequential([
    Embedding(input_dim=nl_vocab_size, output_dim=128),
    Bidirectional(LSTM(256, return_sequences=False)),
    RepeatVector(max_sql_length),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(sql_vocab_size, activation='softmax')),
])

# Targets are integer token ids (not one-hot), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

In [None]:
# Give the integer targets a trailing singleton axis: (samples, steps) ->
# (samples, steps, 1). The ndim guard keeps this cell idempotent: without it,
# every re-run of the cell would append yet another axis to `y` and the
# subsequent fit would fail on the shape mismatch.
if y.ndim == 2:
    y = np.expand_dims(y, axis=-1)

# Hold out the last 20% of the rows for validation.
# NOTE(review): the trained model is not saved in this cell, while a later cell
# loads 'syntaxer.keras' — confirm where that file is expected to come from.
model.fit(X, y, epochs=20, batch_size=32, validation_split=0.2)

In [3]:
# Load a saved Keras model from 'syntaxer.keras' and rebind `model` to it
# (this replaces whatever `model` was bound to by the cells above).
# NOTE(review): no visible cell writes 'syntaxer.keras' (there is no
# model.save call in this notebook) — presumably it was produced in an
# earlier session; confirm the file exists before a Restart & Run All.
model = load_model('syntaxer.keras')

In [5]:
def decode_sequence(input_seq, index_word=None):
    """Convert a sequence of token ids into a space-separated string.

    Args:
        input_seq: Iterable of integer token ids (plain ints or NumPy
            integers). Id 0 is treated as padding and skipped.
        index_word: Optional mapping from token id to token string. When
            omitted, ``tokenizer_sql.index_word`` is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        The decoded tokens joined by single spaces with leading/trailing
        whitespace stripped. Ids missing from the mapping contribute an
        empty token (so they show up as a doubled space between neighbours).
        An empty or all-padding sequence yields ``''``.
    """
    if index_word is None:
        # Resolved at call time so the function follows the notebook's
        # current SQL tokenizer.
        index_word = tokenizer_sql.index_word

    # Build the token list once and join it, instead of growing a string
    # with += inside the loop.
    words = [index_word.get(idx, '') for idx in input_seq if idx != 0]
    return ' '.join(words).strip()

# test_df = pd.read_csv('queries_db_all.csv')['nl']
# test_sentences = test_df.sample(5).values


# Single-sentence smoke test of the loaded model.
test_sentence = "how many users?"

# Encode exactly like the training inputs: same tokenizer, same padded length.
test_sequence = tokenizer_nl.texts_to_sequences([test_sentence])
test_padded = pad_sequences(test_sequence, padding='post', maxlen=max_nl_length)

# The model outputs per-position scores over the SQL vocabulary; take the
# highest-scoring id at each position for the first (only) sentence.
predicted_sequence = model.predict(test_padded)
predicted_token_ids = predicted_sequence.argmax(axis=-1)[0]
predicted_sentence = decode_sequence(predicted_token_ids)

print(f"Input: {test_sentence}")
print(f"Predicted SQL: {predicted_sentence}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Input: how many users exists?
Predicted SQL: select count(distinct [column]) [table] where [column] [column] [const] and [column] = [const];


# Transformers

In [None]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

# Read the NL/SQL pairs and keep only the first 1000 rows.
df = pd.read_csv('queries_db_all.csv').head(1000)

# Hand just the two relevant columns to a Hugging Face Dataset.
nl_sql_pairs = df[['nl', 'syntax']]
dataset = Dataset.from_pandas(nl_sql_pairs)

# Checkpoint shared by the tokenizer here and the model loaded further down.
model_checkpoint = "t5-small"  # You can use other models like "t5-base" or "t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of NL/SQL pairs for seq2seq training.

    Args:
        examples: Batch mapping with an 'nl' entry (source texts) and a
            'syntax' entry (target texts).

    Returns:
        The tokenizer output for the 'nl' texts, with the token ids of the
        'syntax' texts stored under "labels". Both sides are truncated to at
        most 128 tokens; no padding is applied here.
    """
    encoded = tokenizer(examples['nl'], max_length=128, truncation=True)
    target_encoding = tokenizer(examples['syntax'], max_length=128, truncation=True)
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Split the dataset into train and test sets.
# A fixed seed makes the split (and therefore the reported metrics)
# reproducible across kernel restarts.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

# Define the model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# preprocess_function tokenizes without padding, so the "labels" lists differ
# in length from example to example. A seq2seq-aware collator pads both the
# inputs and the labels per batch; without it the variable-length labels
# cannot be stacked into a tensor when a batch is assembled.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` (and Trainer's `tokenizer=` to `processing_class=`) —
# confirm against the installed version before changing either.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()


In [None]:
# Evaluate the model
# Runs the trainer's evaluation pass; the trainer was constructed with
# eval_dataset=test_dataset, so that held-out split is what gets scored.
eval_results = trainer.evaluate()

# Print the raw result object returned by evaluate().
print(f"Evaluation Results: {eval_results}")


In [None]:
from datasets import load_metric
import numpy as np

# Load the BLEU metric
# Fetches the "sacrebleu" metric implementation by name.
# NOTE(review): `datasets.load_metric` has been deprecated in favour of the
# separate `evaluate` package — confirm the installed `datasets` version
# still provides it.
metric = load_metric("sacrebleu")

# Generate predictions
def generate_predictions(dataset, model, tokenizer):
    """Generate one decoded prediction per entry of ``dataset['nl']``.

    Args:
        dataset: Mapping-like object whose 'nl' entry holds the input texts.
        model: Seq2seq model exposing ``device``, ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used both to encode the inputs and to decode the
            generated token ids.

    Returns:
        A list of decoded strings (special tokens removed), in input order.
        An empty 'nl' column yields an empty list.
    """
    # Materialize as a plain list so the tokenizer receives a list of strings
    # regardless of the column type the dataset returns.
    texts = list(dataset['nl'])
    if not texts:
        return []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Inputs must live on the same device as the model (the model may have
    # been moved to an accelerator during training).
    device = model.device
    input_ids = inputs.input_ids.to(device)
    # The batch is padded, so the mask is needed for generate() to ignore
    # the padding positions.
    attention_mask = inputs.attention_mask.to(device)

    # Inference mode: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=128,
            num_beams=4,
            early_stopping=True,
        )

    predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return predictions

# Decode a prediction for every held-out example.
predictions = generate_predictions(test_dataset, model, tokenizer)

# Score against the gold SQL templates. Each prediction is paired with a
# list of references; here that list holds the single 'syntax' string.
references = [[target_sql] for target_sql in test_dataset['syntax']]
bleu_score = metric.compute(references=references, predictions=predictions)

print(f"BLEU Score: {bleu_score}")