In [2]:
import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd

class Encoder:
    """Turn a raw input string into the encoder model's output.

    The model and tokenizer passed to the constructor are the ones used by
    ``predict``; nothing is read from module-level globals.
    """

    def __init__(self,encoder_model,tokenizer):
        # encoder_model: object exposing ``predict(sequences)``.
        # tokenizer: object exposing ``texts_to_sequences(list_of_str)``.
        self.encoder_model = encoder_model
        self.tokenizer = tokenizer

    def predict(self,input_string):
        """Encode one string and return ``self.encoder_model.predict(...)``.

        The string is wrapped in the '#' (start) and '@' (end) markers,
        tokenized, then padded/truncated at the end to exactly 10 positions.
        The return value is whatever the wrapped model's ``predict`` returns.
        """
        input_string = '#' + input_string + '@'
        # Use the instance's tokenizer/model rather than same-named globals,
        # so the objects handed to __init__ are the ones actually used.
        input_sequence = self.tokenizer.texts_to_sequences([input_string])
        input_sequence = pad_sequences(input_sequence, maxlen=10, padding='post', truncating='post')
        return self.encoder_model.predict(input_sequence)

class Decoder:
    """Greedy character-by-character decoder around a step-wise decoder model.

    The model and tokenizer passed to the constructor are the ones used by
    ``predict``; nothing is read from module-level globals.
    """

    def __init__(self,decoder_model,tokenizer):
        # decoder_model: object whose ``predict([token, state_h, state_c])``
        #   returns ``(output_tokens, state_h, state_c)``.
        # tokenizer: object exposing ``word_index`` (char -> int) and
        #   ``index_word`` (int -> char) mappings.
        self.decoder_model = decoder_model
        self.tokenizer = tokenizer

    def predict(self,input_tensor):
        """Decode a string from the initial decoder states.

        Args:
            input_tensor: sequence of the two initial states
                ``[state_h, state_c]`` fed to the decoder on the first step.

        Returns:
            The decoded string (without the '#'/'@' markers). At most 10
            decoding steps are run; decoding stops early when the end marker
            '@' is predicted, or when the predicted index has no character in
            ``index_word`` (e.g. the padding index).
        """
        # Use the instance's tokenizer rather than a same-named global.
        word_index = self.tokenizer.word_index
        index_word = self.tokenizer.index_word
        end_index = word_index['@']

        # Decoding is seeded with the start marker '#'.
        target_seq = np.zeros((1, 1), dtype=np.int32)
        target_seq[0, 0] = word_index['#']
        states = list(input_tensor)
        decoded_chars = []

        for _ in range(10):
            output_tokens, state_h, state_c = self.decoder_model.predict([target_seq] + states)
            # Greedy choice: most probable token at the last time step.
            sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

            if sampled_token_index == end_index:
                break

            sampled_char = index_word.get(sampled_token_index)
            if sampled_char is None:
                # No character maps to this index, so there is nothing to
                # append; treat it as end of output instead of raising KeyError.
                break

            decoded_chars.append(sampled_char)
            # Feed the sampled token and the updated states into the next step.
            target_seq = np.zeros((1, 1), dtype=np.int32)
            target_seq[0, 0] = sampled_token_index
            states = [state_h, state_c]

        return ''.join(decoded_chars)

# Directory holding the cached artifacts (the current working directory).
MODEL_PATH = os.getcwd()
# Every one of these files is loaded by the cached branch, so all of them must
# exist before training can be skipped.
ARTIFACT_NAMES = ('encoder.h5', 'decoder.h5', 'tokenizer.pkl')

if all(os.path.isfile(os.path.join(MODEL_PATH, name)) for name in ARTIFACT_NAMES):
    # Cached artifacts are complete: load the inference models and tokenizer.
    encoder_model = load_model(os.path.join(MODEL_PATH, 'encoder.h5'))
    decoder_model = load_model(os.path.join(MODEL_PATH, 'decoder.h5'))
    # NOTE(security): pickle.load executes arbitrary code from the file; only
    # load a tokenizer.pkl that this script itself produced.
    with open(os.path.join(MODEL_PATH, 'tokenizer.pkl'), 'rb') as fp:
        tokenizer = pickle.load(fp)

else:
    # --- Load the CSVs: column 0 is the source, column 1 the target. ---
    # The first non-empty line is treated as a header and dropped.
    with open("train_data.csv") as fp:
        train_data = fp.read().split("\n")
    sentences = [line.split(",")[0] for line in train_data if len(line) > 0]
    sentences = sentences[1:]
    transformed_sentences = [line.split(",")[1] for line in train_data if len(line) > 0]
    transformed_sentences = transformed_sentences[1:]

    with open("eval_data.csv") as fp:
        eval_data = fp.read().split("\n")
    eval_sentences = [line.split(",")[0] for line in eval_data if len(line) > 0]
    eval_sentences = eval_sentences[1:]
    eval_transformed_sentences = [line.split(",")[1] for line in eval_data if len(line) > 0]
    eval_transformed_sentences = eval_transformed_sentences[1:]

    # Wrap every string in the start ('#') and end ('@') markers.
    sentences = ['#' + sentence + '@' for sentence in sentences]
    transformed_sentences = ['#' + sentence + '@' for sentence in transformed_sentences]
    eval_sentences = ['#' + sentence + '@' for sentence in eval_sentences]
    eval_transformed_sentences = ['#' + sentence + '@' for sentence in eval_transformed_sentences]

    # Character-level tokenizer, fitted on the training source and target only.
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(sentences + transformed_sentences)

    # Every sequence is padded/truncated at the end to exactly 10 positions.
    sentences_seq = tokenizer.texts_to_sequences(sentences)
    transformed_sentences_seq = tokenizer.texts_to_sequences(transformed_sentences)
    sentences_seq = pad_sequences(sentences_seq, maxlen=10, padding='post', truncating='post')
    transformed_sentences_seq = pad_sequences(transformed_sentences_seq, maxlen=10, padding='post', truncating='post')
    eval_sentences_seq = tokenizer.texts_to_sequences(eval_sentences)
    eval_transformed_sentences_seq = tokenizer.texts_to_sequences(eval_transformed_sentences)
    eval_sentences_seq = pad_sequences(eval_sentences_seq, maxlen=10, padding='post', truncating='post')
    eval_transformed_sentences_seq = pad_sequences(eval_transformed_sentences_seq, maxlen=10, padding='post',
                                                   truncating='post')

    # +1 because the embedding/output size must also cover index 0.
    vocab_size = len(tokenizer.word_index) + 1
    embedding_dim = 4096
    hidden_units = 256
    dropout_rate = 0.2  # Dropout rate

    def create_dataset(encoder_input, decoder_input):
        """Build a dataset of ((encoder_input, decoder_input), target) examples.

        The target is the decoder input shifted one position to the left, so
        each position is trained to predict the following token.
        NOTE: tf.roll wraps around, so the last target position holds the first
        decoder-input token rather than padding.
        """
        target = tf.roll(decoder_input, shift=-1, axis=1)
        dataset = tf.data.Dataset.from_tensor_slices(((encoder_input, decoder_input), target))
        return dataset

    BUFFER_SIZE = len(sentences_seq)
    BATCH_SIZE = 16

    # The training set repeats indefinitely; steps_per_epoch bounds each epoch.
    train_dataset = create_dataset(sentences_seq, transformed_sentences_seq).shuffle(
        BUFFER_SIZE).repeat().batch(BATCH_SIZE, drop_remainder=True)

    # drop_remainder=True discards a final eval batch smaller than BATCH_SIZE.
    eval_dataset = create_dataset(eval_sentences_seq, eval_transformed_sentences_seq).batch(
        BATCH_SIZE, drop_remainder=True)

    # --- Encoder: embedding -> bidirectional LSTM. ---
    encoder_inputs = Input(shape=(10,), name="encoder_input")
    encoder_inputs_embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_inputs)
    encoder_rnn = Bidirectional(LSTM(hidden_units, return_sequences=True, return_state=True, dropout=dropout_rate,
                                     recurrent_dropout=dropout_rate))
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_rnn(encoder_inputs_embedded)
    # Forward and backward final states are concatenated, giving states of
    # width hidden_units * 2 that seed the decoder.
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
    encoder_state = [state_h, state_c]

    # --- Decoder: embedding -> LSTM (hidden_units * 2) -> softmax over vocab. ---
    decoder_inputs = Input(shape=(None,), name="decoder_input")
    decoder_inputs_embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(decoder_inputs)
    decoder_lstm = LSTM(hidden_units * 2, return_sequences=True, return_state=True, dropout=dropout_rate,
                         recurrent_dropout=dropout_rate)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs_embedded, initial_state=encoder_state)
    decoder_dense = Dense(vocab_size, activation='softmax')
    predictions = decoder_dense(decoder_outputs)

    # --- Training model: both inputs in, per-position token distribution out. ---
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=predictions)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    STEPS_PER_EPOCH = len(sentences_seq) // BATCH_SIZE
    EPOCHS = 50

    # Stop after 5 epochs without val_loss improvement and keep the best weights.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    history = model.fit(
        train_dataset,
        steps_per_epoch=STEPS_PER_EPOCH,
        validation_data=eval_dataset,
        epochs=EPOCHS,
        callbacks=[early_stopping]
    )

    # --- Inference models, built from the same layers used for training. ---
    encoder_model = Model(inputs=encoder_inputs, outputs=encoder_state)

    # The inference decoder takes its states as explicit inputs and returns the
    # updated states, so it can be called repeatedly one step at a time.
    decoder_state_input_h = Input(shape=(hidden_units * 2,), name="decoder_state_input_h")
    decoder_state_input_c = Input(shape=(hidden_units * 2,), name="decoder_state_input_c")
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs_embedded, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    predictions = decoder_dense(decoder_outputs)
    decoder_model = Model(inputs=[decoder_inputs] + decoder_states_inputs, outputs=[predictions] + decoder_states)

    # Persist everything the cached branch needs on the next run.
    encoder_model.save(os.path.join(MODEL_PATH, 'encoder.h5'))
    decoder_model.save(os.path.join(MODEL_PATH, 'decoder.h5'))
    with open(os.path.join(MODEL_PATH, 'tokenizer.pkl'), 'wb') as fp:
        pickle.dump(tokenizer, fp)

# Function to check how many characters match in the two strings
def check(pred: str, true: str):
    """Score a prediction against the expected string.

    One point is earned for every position (up to the shorter string's
    length) where the characters agree. One point is then deducted for each
    character by which ``pred`` is longer than ``true``. The result is never
    negative.
    """
    matches = sum(1 for got, want in zip(pred, true) if got == want)
    overshoot = len(pred) - len(true)
    if overshoot > 0:
        # Extra predicted characters are penalised one point each.
        matches -= overshoot
    return matches if matches > 0 else 0

# Function to score the model's performance
def evaluate(encoder, decoder):
    """Score the encoder/decoder pair on the training and evaluation CSVs.

    For each split this prints a histogram of per-example scores and the
    total points, and writes predictions to ``results_train.csv`` /
    ``results_eval.csv``. Marks are additionally printed for the eval split.

    Args:
        encoder: object whose ``predict(str)`` output is accepted by
            ``decoder.predict``.
        decoder: object whose ``predict(...)`` returns the decoded string.
    """
    _evaluate_split(
        encoder,
        decoder,
        "train_data.csv",
        "results_train.csv",
        "Obtaining results for training data:",
        "Train dataset results:",
    )

    _evaluate_split(
        encoder,
        decoder,
        "eval_data.csv",
        "results_eval.csv",
        "Obtaining metrics for eval data:",
        "Eval dataset results:",
        report_marks=True,
    )


def _evaluate_split(encoder, decoder, data_path, results_path, intro, header, *, report_marks=False):
    """Score one two-column CSV, print the summary and save the predictions."""
    print(intro)
    data = pd.read_csv(data_path).to_numpy()
    results = {
        "pred": [],
        "true": [],
        "score": [],
    }
    # correct[k] counts examples whose score is k; scores 0..8 are representable.
    correct = [0 for _ in range(9)]
    for x, y in data:
        pred = decoder.predict(encoder.predict(x))
        score = check(pred, y)
        results["pred"].append(pred)
        results["true"].append(y)
        results["score"].append(score)

        correct[score] += 1
    print(header)
    for num_chr, count in enumerate(correct):
        print(
            f"Number of predictions with {num_chr} correct predictions: {count}"
        )
    # Scores of 4-5 earn half a point each; scores of 6 and above earn one point.
    points = sum(correct[4:6]) * 0.5 + sum(correct[6:])
    print(f"Points: {points}")
    if report_marks:
        # Scale points so that 1400 maps to the 2-mark cap.
        marks = round(min(2, points / 1400 * 2) * 2) / 2  # Rounds to the nearest 0.5
        print(f"Marks: {marks}")
    # Save predictions and true sentences to inspect manually if required.
    pd.DataFrame.from_dict(results).to_csv(results_path, index=False)

# Wrap the module-level encoder_model, decoder_model and tokenizer in the
# inference helpers, then score them on train_data.csv and eval_data.csv.
encoder = Encoder(encoder_model,tokenizer)
decoder = Decoder(decoder_model,tokenizer)
evaluate(encoder,decoder)


Obtaining metrics for eval data:
Eval dataset results:
Number of predictions with 0 correct predictions: 0
Number of predictions with 1 correct predictions: 0
Number of predictions with 2 correct predictions: 1
Number of predictions with 3 correct predictions: 1
Number of predictions with 4 correct predictions: 21
Number of predictions with 5 correct predictions: 103
Number of predictions with 6 correct predictions: 297
Number of predictions with 7 correct predictions: 649
Number of predictions with 8 correct predictions: 928
Points: 1936.0
Marks: 2.0
