In [72]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd



In [73]:
df = pd.read_csv("./processed/final_data.csv")

train_dataset = tf.data.Dataset.from_tensor_slices((df["question_header"], df["sql"]))

train_size = int(0.8 * len(df))
train_dataset = train_dataset.take(train_size)
val_dataset = train_dataset.skip(train_size)

batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=train_size).batch(batch_size)
val_dataset = val_dataset.batch(batch_size)

type(train_dataset)
questions = ""
SQL = ""

# train_dataset.tak
for question, sql in train_dataset.take(1):
    questions = question.numpy()
    SQL = sql.numpy()
    # print("Question:", question.numpy())
    # print("SQL:", sql.numpy())
    # break
questions.shape
SQL.shape
# prints a batch

(32,)

In [74]:

# Concatenate SQL queries and natural language questions
texts = df['sql'].tolist() + df['question_header'].tolist()

# Initialize tokenizer
tokenizer = Tokenizer(filters='')

# Fit tokenizer on texts
tokenizer.fit_on_texts(texts)

# Convert texts to sequences of token IDs
sequences_sql = tokenizer.texts_to_sequences(df['sql'])
sequences_questions = tokenizer.texts_to_sequences(df['question_header'])

# Example usage of token IDs
print("Token IDs for first SQL query:", sequences_sql[0])
print("Token IDs for first question:", sequences_questions[0])

# tokenizer.save('sql_tokenizer')


Token IDs for first SQL query: [4, 200, 3, 302, 9405, 1, 256, 714]
Token IDs for first question: [253, 247, 8, 2, 200, 110, 14, 256, 714, 3595, 13244, 1555, 190, 302, 9405, 302, 63, 200]


In [75]:
len(sequences_sql)
sequences_sql[0]

[4, 200, 3, 302, 9405, 1, 256, 714]

In [76]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

In [77]:
# Load your preprocessed dataset
import pandas as pd
dataset = pd.read_csv("./processed/final_data.csv")


train_data, val_data = train_test_split(dataset, test_size=0.2)


In [78]:
# Tokenize the English sentences
tokenizer_en = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer_en.fit_on_texts(train_data['question_header'])
train_inputs = tokenizer_en.texts_to_sequences(train_data['question_header'])
val_inputs = tokenizer_en.texts_to_sequences(val_data['question_header'])

# Tokenize the SQL queries
tokenizer_sql = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer_sql.fit_on_texts(train_data['sql'])
train_outputs = tokenizer_sql.texts_to_sequences(train_data['sql'])
val_outputs = tokenizer_sql.texts_to_sequences(val_data['sql'])


In [79]:
# Pad sequences to the same length
max_length = max(len(seq) for seq in train_inputs + val_inputs)
train_inputs = pad_sequences(train_inputs, maxlen=max_length, padding='post')
val_inputs = pad_sequences(val_inputs, maxlen=max_length, padding='post')
train_outputs = pad_sequences(train_outputs, maxlen=max_length, padding='post')
val_outputs = pad_sequences(val_outputs, maxlen=max_length, padding='post')

In [80]:

# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_outputs)).shuffle(len(train_data)).batch(batch_size)
val_dataset = tf.data.Dataset.from_tensor_slices((val_inputs, val_outputs)).batch(batch_size)


In [81]:
import tensorflow as tf

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TransformerBlock, self).__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(num_heads, d_model)
        self.ffn = tf.keras.Sequential([
            tf.keras.layers.Dense(dff, activation='relu'),
            tf.keras.layers.Dense(d_model)
        ])

        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=False):
        attn_output = self.mha(inputs, inputs, inputs)  # Self-attention
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layer_norm1(inputs + attn_output)  # Residual connection and layer normalization

        ffn_output = self.ffn(out1)  # Feed-forward network
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layer_norm2(out1 + ffn_output)  # Residual connection and layer normalization

        return out2


class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()
        self.encoder = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.decoder = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.num_layers = num_layers
        self.transformer_blocks = [TransformerBlock(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs, training=True):
        enc_inputs = inputs[0]
        dec_inputs = inputs[1]
        
        enc_padding_mask = inputs[2]
        look_ahead_mask = inputs[3]
        dec_padding_mask = inputs[4]
        
        enc_inputs = enc_inputs + self.positional_encoding(enc_inputs.shape[1], d_model)
        dec_inputs = dec_inputs + self.positional_encoding(dec_inputs.shape[1], d_model)

        enc_output = self.encoder(enc_inputs)
        dec_output = self.decoder(dec_inputs)

        for i in range(self.num_layers):
            enc_output = self.transformer_blocks[i](enc_output, training=training)
            dec_output = self.transformer_blocks[i](dec_output, training=training)

        final_output = self.final_layer(dec_output)
        return final_output

# Function to create positional encodings
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
    return pos * angle_rates

def positional_encoding(position, d_model):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis], np.arange(d_model)[np.newaxis, :], d_model)
    # Apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    # Apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)


In [82]:
# Define hyperparameters
learning_rate = 0.001
num_epochs = 10 
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
input_vocab_size = len(tokenizer_en.word_index) + 1
target_vocab_size = len(tokenizer_sql.word_index) + 1
dropout_rate = 0.1
input_length = max_length
target_length = max_length



In [83]:
# Initialize and compile the model
transformer = Transformer(num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, input_length, target_length, dropout_rate)
optimizer = tf.keras.optimizers.Adam(learning_rate)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)
transformer.compile(optimizer=optimizer, loss=loss_function, run_eagerly=True)




In [84]:
transformer.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)


Epoch 1/10


IndexError: Exception encountered when calling layer 'softmax_1' (type Softmax).

tuple index out of range

Call arguments received by layer 'softmax_1' (type Softmax):
  • inputs=tf.Tensor(shape=(114, 8), dtype=float32)
  • mask=None