In [1]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
from sacrebleu import corpus_bleu

In [2]:
# Step 1: Load and preprocess the data
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

def preprocess_data(data):
    input_texts = []
    target_texts = []
    for entry in data:
        if 'src' in entry and 'tgt' in entry:
            input_texts.append(entry['src'])
            target_texts.append(entry['tgt'])
    return input_texts, target_texts

train_data = load_data('train.json')
valid_data = load_data('valid.json')
test_data = load_data('test.json')

train_input_texts, train_target_texts = preprocess_data(train_data)
valid_input_texts, valid_target_texts = preprocess_data(valid_data)
test_input_texts, test_target_texts = preprocess_data(test_data)

In [3]:
# Step 2: Tokenize the input and target texts
input_tokenizer = Tokenizer()
input_tokenizer.fit_on_texts(train_input_texts)
train_input_sequences = input_tokenizer.texts_to_sequences(train_input_texts)
valid_input_sequences = input_tokenizer.texts_to_sequences(valid_input_texts)
test_input_sequences = input_tokenizer.texts_to_sequences(test_input_texts)

target_tokenizer = Tokenizer()
target_tokenizer.fit_on_texts(train_target_texts)
train_target_sequences = target_tokenizer.texts_to_sequences(train_target_texts)
valid_target_sequences = target_tokenizer.texts_to_sequences(valid_target_texts)
test_target_sequences = target_tokenizer.texts_to_sequences(test_target_texts)

max_input_length = max(len(sequence) for sequence in train_input_sequences)
max_target_length = max(len(sequence) for sequence in train_target_sequences)

train_input_sequences = pad_sequences(train_input_sequences, maxlen=max_input_length, padding='post')
valid_input_sequences = pad_sequences(valid_input_sequences, maxlen=max_input_length, padding='post')
test_input_sequences = pad_sequences(test_input_sequences, maxlen=max_input_length, padding='post')

train_target_sequences = pad_sequences(train_target_sequences, maxlen=max_target_length+1, padding='post')
valid_target_sequences = pad_sequences(valid_target_sequences, maxlen=max_target_length+1, padding='post')
test_target_sequences = pad_sequences(test_target_sequences, maxlen=max_target_length+1, padding='post')

In [4]:
# Step 3: Define the NMT model
latent_dim = 256

# Encoder
encoder_inputs = Input(shape=(max_input_length,))
encoder_embedding = Embedding(input_dim=len(input_tokenizer.word_index) + 1, output_dim=latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_target_length,))
decoder_embedding = Embedding(input_dim=len(target_tokenizer.word_index) + 1, output_dim=latent_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
attention = Attention()([decoder_outputs, encoder_outputs])
decoder_concat_attention = tf.keras.layers.Concatenate(axis=-1)([decoder_outputs, attention])
decoder_dense = Dense(len(target_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_attention)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [5]:
# Step 4: Compile and train the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',metrics=['accuracy'], run_eagerly=True)
model.fit([train_input_sequences, train_target_sequences[:, :-1]], train_target_sequences[:, 1:], epochs=10, batch_size=128, validation_data=([valid_input_sequences, valid_target_sequences[:, :-1]], valid_target_sequences[:, 1:]))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x2a32a1d04c0>

In [12]:
import json
import zipfile
# Step 2: Make predictions on the test data
def make_predictions(model, test_data):
    predictions = []
    for entry in test_data:
        # Assuming your model.predict() function takes input data and returns predictions
        input_data = entry.get('valid_data', None)  # Get input data from entry, or use a default value if not present
        if input_data is not None:
            prediction = model.predict(input_data)  # Replace 'input_data' with your actual input data key
            predictions.append([{"id": entry['id'], "hyp": prediction}])  # Format prediction
    return predictions

# Load test data (replace 'test_data' with your actual test data)
with open('test.json', 'r') as f:
    test_data = json.load(f)

# Step 3: Generate predictions
predictions = make_predictions(model, test_data)

# Step 4: Write predictions to a JSON file
def write_predictions_to_json(predictions, output_file):
    with open(output_file, 'w') as f:
        json.dump(predictions, f)

write_predictions_to_json(predictions, 'predictions.json')

# Step 5: Archive the run file
def archive_run_file(run_file):
    with zipfile.ZipFile('run.zip', 'w') as zipf:
        zipf.write(run_file)

archive_run_file('predictions.json')

# Provide the path to your team name and the zip file containing the predictions.json file
team_name = "Hat Tricks"
zip_file_path = "run.zip"

In [10]:
from sacrebleu import corpus_bleu
def read_sequences_from_json(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [entry['id'] for entry in data]

def calculate_bleu_score(ref_file, hyp_file):
    references = read_sequences_from_json(ref_file)
    hypotheses = read_sequences_from_json(hyp_file)
    bleu = corpus_bleu(hypotheses, [references])
    return bleu.score

In [11]:
# Example usage
ref_file = 'test.json'  # Path to reference JSON file
hyp_file = 'run.json'  # Path to hypothesis JSON file
bleu_score = calculate_bleu_score(ref_file, hyp_file)
print("BLEU score:", bleu_score)

BLEU score: 2.8653498031922857
