# In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import string
import ast
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Add, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# === Config ===
# Hyperparameters shared by preprocessing, model construction and training.
MAX_LEN = 64      # fixed sequence length (in characters) after padding/truncation
EMBED_DIM = 64    # width of the character and position embeddings
LSTM_UNITS = 128  # hidden size of the encoder and decoder LSTMs
BATCH_SIZE = 32
EPOCHS = 10

# === Vocab (ASCII + control tokens) ===
# Control tokens come first so that '<PAD>' is assigned id 0.
special_tokens = ['<PAD>', '<BOS>', '<EOS>', '<UNK>']
vocab_chars = [
    *string.ascii_lowercase,
    *string.digits,
    *string.punctuation,
    ' ',
]
vocab = [*special_tokens, *vocab_chars]

# Build the id -> token table by position, then invert it for encoding.
id_to_token = dict(enumerate(vocab))
token_to_id = {token: index for index, token in id_to_token.items()}

VOCAB_SIZE = len(vocab)

def tokenize(text):
    """Encode ``text`` as a list of character ids.

    The text is lower-cased first; any character that is not a key of
    ``token_to_id`` is encoded as the ``'<UNK>'`` id.
    """
    unknown_id = token_to_id['<UNK>']
    encoded = []
    for char in text.lower():
        encoded.append(token_to_id.get(char, unknown_id))
    return encoded

def detokenize(ids):
    """Decode an iterable of token ids back into a string.

    Ids missing from ``id_to_token`` are rendered as ``'?'``. Control tokens
    are not stripped; they appear in the result as their literal names.
    """
    pieces = (id_to_token.get(token_id, '?') for token_id in ids)
    return ''.join(pieces)

# === Load and preprocess dataset ===
def _parse_list_cell(cell):
    """Parse a stringified Python list from a CSV cell; missing cells become []."""
    return ast.literal_eval(cell) if pd.notnull(cell) else []


def _first_message(cell):
    """Return the first entry of a stringified list cell, or "" if missing/empty."""
    messages = _parse_list_cell(cell)
    # An empty list ("[]") used to raise IndexError; treat it like a missing
    # cell so the row is dropped by the empty-target filter instead.
    return messages[0] if messages else ""


def load_data(path):
    """Load the dialogue CSV and turn it into padded id matrices.

    The CSV is expected to have the columns ``personas``,
    ``previous_utterance`` and ``free_messages``, each holding a Python list
    literal stored as a string (parsed with ``ast.literal_eval``).

    The model input is the persona lines followed by the previous utterances,
    joined with spaces. The target is the first entry of ``free_messages``.
    Rows whose target is empty are dropped.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A tuple ``(X, Y)`` of integer arrays, each padded/truncated to
        ``MAX_LEN`` columns. ``X`` holds the input ids; ``Y`` holds the target
        ids wrapped in ``<BOS>`` ... ``<EOS>``.
    """
    df = pd.read_csv(path)

    # Parse list columns
    personas = df['personas'].apply(_parse_list_cell)
    previous = df['previous_utterance'].apply(_parse_list_cell)
    df['target_text'] = df['free_messages'].apply(_first_message)

    # zip over the two parsed columns instead of a row-wise DataFrame.apply.
    df['input_text'] = [" ".join(p + u) for p, u in zip(personas, previous)]

    # Filter empty targets
    df = df[df['target_text'].str.len() > 0]

    bos_id = token_to_id['<BOS>']
    eos_id = token_to_id['<EOS>']
    pad_id = token_to_id['<PAD>']

    # NOTE(review): inputs keep only their first MAX_LEN characters, so long
    # persona text can push the previous utterance out of the window.
    inputs = [tokenize(text)[:MAX_LEN] for text in df['input_text']]

    # Reserve two slots for <BOS>/<EOS> so targets never exceed MAX_LEN.
    targets = [
        [bos_id] + tokenize(text)[:MAX_LEN - 2] + [eos_id]
        for text in df['target_text']
    ]

    X = pad_sequences(inputs, maxlen=MAX_LEN, padding='post', value=pad_id)
    Y = pad_sequences(targets, maxlen=MAX_LEN, padding='post', value=pad_id)

    return X, Y

# === Positional Embedding Layer ===
class PositionalEmbedding(tf.keras.layers.Layer):
    """Add a learned position embedding to a sequence of token embeddings.

    Args:
        max_len: Number of distinct positions the embedding table holds.
        embed_dim: Width of each position vector; it is added to the input,
            so it has to match the input's last dimension.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...),
            forwarded to the base class so the layer can be rebuilt from its
            config.
    """

    def __init__(self, max_len, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.embed_dim = embed_dim
        # The layer adds element-wise and keeps the time axis intact, so the
        # mask from an upstream Embedding(mask_zero=True) can pass through
        # unchanged. Without this flag the mask is dropped here.
        self.supports_masking = True
        self.pos_emb = Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Return ``x`` plus the position embeddings for indices 0..time-1."""
        positions = tf.range(start=0, limit=tf.shape(x)[1], delta=1)
        # The position vectors broadcast across the batch axis.
        return x + self.pos_emb(positions)

    def get_config(self):
        """Return the constructor arguments so the layer can be saved/reloaded."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "embed_dim": self.embed_dim})
        return config

# === Attention ===
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention over encoder outputs, queried by one hidden state.

    Args:
        units: Width of the two projection layers used to compute the score.
        **kwargs: Standard Keras layer arguments (``name``, ``dtype``, ...),
            forwarded to the base class so the layer can be rebuilt from its
            config.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, enc_out, dec_hidden):
        """Compute the context vector and the attention weights.

        Args:
            enc_out: Encoder outputs, (batch, time, hidden).
            dec_hidden: Query state, (batch, hidden).

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``enc_out`` over the time axis, and the weights themselves
            (softmax over the time axis, last dimension of size 1).
        """
        # Insert a time axis so the query broadcasts against every step.
        dec_hidden_expanded = tf.expand_dims(dec_hidden, 1)
        score = self.V(tf.nn.tanh(self.W1(enc_out) + self.W2(dec_hidden_expanded)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * enc_out
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights

    def get_config(self):
        """Return the constructor arguments so the layer can be saved/reloaded."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

# === Build Model ===
def build_model():
    """Build and compile the character-level encoder/decoder model.

    The encoder embeds its input, adds position embeddings and runs an LSTM.
    A single attention context is computed from the encoder outputs using the
    encoder's final hidden state as the query. That context is repeated for
    every decoder step, concatenated with the decoder LSTM outputs, and
    projected to a softmax over the vocabulary.

    The model summary is printed as a side effect.

    Returns:
        A compiled Keras ``Model`` taking ``[encoder_input, decoder_input]``
        (each ``MAX_LEN`` ids long) and producing per-step token
        probabilities.
    """
    # --- Encoder ---
    encoder_inputs = Input(shape=(MAX_LEN,), name='encoder_input')
    enc_emb = Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(encoder_inputs)
    enc_emb = PositionalEmbedding(MAX_LEN, EMBED_DIM)(enc_emb)
    # NOTE(review): the encoder cell state is not used anywhere; the decoder
    # LSTM starts from its default initial state.
    lstm_out, state_h, _state_c = LSTM(
        LSTM_UNITS, return_sequences=True, return_state=True
    )(enc_emb)

    # --- Decoder ---
    decoder_inputs = Input(shape=(MAX_LEN,), name='decoder_input')
    dec_emb = Embedding(VOCAB_SIZE, EMBED_DIM, mask_zero=True)(decoder_inputs)
    dec_emb = PositionalEmbedding(MAX_LEN, EMBED_DIM)(dec_emb)
    decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=False)
    decoder_out = decoder_lstm(dec_emb)

    # --- Attention context, shared by every decoder step ---
    attention = BahdanauAttention(LSTM_UNITS)
    context_vector, _ = attention(lstm_out, state_h)

    # RepeatVector turns (batch, hidden) into (batch, MAX_LEN, hidden). It is
    # a Keras layer, so unlike raw tf.expand_dims/tf.repeat it is valid on
    # symbolic tensors in every Keras version.
    repeated = tf.keras.layers.RepeatVector(MAX_LEN)(context_vector)

    concat = Concatenate()([repeated, decoder_out])
    output = Dense(VOCAB_SIZE, activation='softmax')(concat)

    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

# === Main training workflow ===
def main():
    """Run the full workflow: load data, train the model, save artifacts.

    Reads ``your_dataset.csv``, trains with a 90/10 train/validation split,
    then writes the model to ``model.h5`` and the token table to
    ``tokenizer.json``.
    """
    print("🚀 Loading and processing data...")
    X, Y = load_data("your_dataset.csv")

    # Teacher forcing: the decoder reads the target sequence (<BOS> ...) and
    # at every step is trained to predict the *next* token. The labels are
    # therefore Y shifted left by one, padded at the end to keep MAX_LEN
    # columns. Feeding X to the decoder would never show it the target.
    decoder_in = Y
    pad_column = np.full((Y.shape[0], 1), token_to_id['<PAD>'], dtype=Y.dtype)
    decoder_target = np.concatenate([Y[:, 1:], pad_column], axis=1)

    # sparse_categorical_crossentropy expects labels with a trailing axis.
    Y_out = np.expand_dims(decoder_target, -1)

    # Split all three arrays together so rows stay aligned.
    X_train, X_val, dec_train, dec_val, Y_train, Y_val = train_test_split(
        X, decoder_in, Y_out, test_size=0.1
    )

    print("🧠 Building model...")
    model = build_model()

    print("🎯 Training...")
    model.fit([X_train, dec_train], Y_train, validation_data=([X_val, dec_val], Y_val),
              batch_size=BATCH_SIZE, epochs=EPOCHS)

    print("💾 Saving model...")
    model.save("model.h5")

    print("🧬 Saving tokenizer...")
    with open("tokenizer.json", "w") as f:
        json.dump(token_to_id, f)

    print("✅ All done!")

# Run the training workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


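# Cell output: ModuleNotFoundError: No module named 'pandas'
# (pandas must be installed in the active environment, e.g. `pip install pandas`, before running this cell.)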