In [17]:
import os
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [18]:
# Seed NumPy's and TensorFlow's global random generators with one shared
# value; the decoding code draws characters with np.random.choice, so this
# keeps sampled translations repeatable between runs.
SEED = 42
tf.random.set_seed(SEED)
np.random.seed(SEED)



In [19]:
# Saved model weights to restore, and the English/dinosaur corpus from which
# the character vocabularies are rebuilt. Both paths are relative to the
# notebook's working directory.
weights_file = 'Model_save/dino_translator_bidirectional_emb.h5'
file_path = 'Datasets/dinosaur_dataset.csv'

In [20]:
# Width of the character embeddings and number of units in every LSTM layer.
embedding_dim, latent_dim = 128, 512

# Markers wrapped around each target sentence: a tab opens it, a newline
# closes it.
start_token, end_token = '\t', '\n'

In [21]:
# Read the parallel corpus and keep only the rows in which both the English
# and the dinosaur sentence are present. The trailing .copy() yields an
# independent frame, so the column assignments made further down act on the
# frame itself rather than on a filtered selection.
df = pd.read_csv(file_path).dropna(subset=['english', 'dinosaur']).copy()

def normalize_text(text):
    """Canonicalise a sentence for the character-level vocabulary.

    The value is converted with ``str()``, trimmed and lower-cased. Every
    character other than ``a-z``, ``0-9``, whitespace and ``? . ! , ; : ' -``
    is removed, each of ``? . ! , ; :`` is set apart by spaces, and runs of
    whitespace are collapsed into a single space.

    Args:
        text: Value to normalise; non-strings are converted with ``str()``.

    Returns:
        The normalised string, with no leading or trailing whitespace.
    """
    def squeeze(fragment):
        # Collapse every whitespace run to one space and trim both ends.
        return re.sub(r'\s+', ' ', fragment).strip()

    lowered = str(text).strip().lower()
    kept = re.sub(r'[^a-z0-9\?\.\!,;:\'\-\s]', '', lowered)
    spaced = re.sub(r'([?.!,;:])', r' \1 ', squeeze(kept))
    return squeeze(spaced)

# Normalise both sides of the corpus, overwriting the original columns.
for _column in ('english', 'dinosaur'):
    df[_column] = df[_column].apply(normalize_text)

# Source sentences are used as they are; every target sentence is wrapped in
# the start and end markers.
input_texts = df['english'].tolist()
target_texts = [f'{start_token}{sentence}{end_token}' for sentence in df['dinosaur'].tolist()]

# Character inventories, sorted so that the id assignment is deterministic.
input_chars = sorted(set(''.join(input_texts)))
target_chars = sorted(set(''.join(target_texts)))

# Ids start at 1, leaving 0 unused by any character (the embedding layers are
# created with mask_zero=True).
input_token_index = {char: idx for idx, char in enumerate(input_chars, start=1)}
target_token_index = {char: idx for idx, char in enumerate(target_chars, start=1)}

# Give each marker an id of its own if it did not show up among the targets.
for _marker in (start_token, end_token):
    if _marker not in target_token_index:
        target_token_index[_marker] = max(target_token_index.values()) + 1

# Id -> character lookups.
reverse_target_char_index = {idx: char for char, idx in target_token_index.items()}
reverse_input_char_index = {idx: char for char, idx in input_token_index.items()}

# Vocabulary sizes count the extra id 0; sequence limits are the longest
# sentence on each side (targets include their two markers).
num_encoder_tokens = 1 + len(input_token_index)
num_decoder_tokens = 1 + len(target_token_index)
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

def texts_to_sequences(texts, token_index):
    """Encode each text as the list of ids of its characters.

    Args:
        texts: Iterable of strings.
        token_index: Mapping from character to integer id.

    Returns:
        One list of ids per input text, in order. Characters that are not
        keys of ``token_index`` are skipped.
    """
    encoded = []
    for sentence in texts:
        ids = []
        for symbol in sentence:
            if symbol in token_index:
                ids.append(token_index[symbol])
        encoded.append(ids)
    return encoded


# Build the full encoder-decoder graph, load the saved weights into it, then
# derive two inference models (an encoder and a single-step decoder) that
# reuse the same layer objects and therefore the same loaded weights.

# Variable-length sequences of character ids.
encoder_inputs = Input(shape=(None,), name='encoder_inputs')
decoder_inputs = Input(shape=(None,), name='decoder_inputs')

# mask_zero=True: id 0 is treated as padding; the vocabularies assign ids starting at 1.
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')

enc_embedded = encoder_embedding(encoder_inputs)
dec_embedded = decoder_embedding(decoder_inputs)

# Bidirectional encoder. The call returns the per-step outputs followed by
# four state tensors, unpacked below as forward h, forward c, backward h, backward c.
encoder_bi = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.1), name='encoder_bidirectional')
enc_outputs_and_states = encoder_bi(enc_embedded)
encoder_outputs = enc_outputs_and_states[0]
state_f_h, state_f_c, state_b_h, state_b_c = enc_outputs_and_states[1:5]
# Join the two directions, then project back to latent_dim units so the result
# can be used as the initial state of decoder_lstm_1.
state_h = Concatenate()([state_f_h, state_b_h])
state_c = Concatenate()([state_f_c, state_b_c])
state_h = Dense(latent_dim, activation='tanh', name='state_h_projection')(state_h)
state_c = Dense(latent_dim, activation='tanh', name='state_c_projection')(state_c)
encoder_states_for_decoder = [state_h, state_c]

# First decoder LSTM, initialised from the projected encoder states.
decoder_lstm_1 = LSTM(latent_dim, return_sequences=True, return_state=True,
                      dropout=0.2, recurrent_dropout=0.1, name='decoder_lstm_1')
decoder_out_1, dec_h1, dec_c1 = decoder_lstm_1(dec_embedded, initial_state=encoder_states_for_decoder)
decoder_dropout = Dropout(0.3)(decoder_out_1)

# Second decoder LSTM. No initial_state is passed here, so it starts from the
# layer's default state (presumably zeros, which is what the decoding loop
# feeds it on the first step - confirm).
decoder_lstm_2 = LSTM(latent_dim, return_sequences=True, return_state=True,
                      dropout=0.2, recurrent_dropout=0.1, name='decoder_lstm_2')
decoder_out_2, dec_h2, dec_c2 = decoder_lstm_2(decoder_dropout)

# Per-step probability distribution over the target vocabulary.
decoder_dense = Dense(num_decoder_tokens, activation='softmax', name='decoder_output')
decoder_outputs = decoder_dense(decoder_out_2)

# NOTE(review): the layers above have to match the architecture the weights
# file was saved from, and the vocabulary sizes depend on the dataset read
# earlier - confirm against the training code before changing anything here.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.load_weights(weights_file)
# User-facing message (Romanian): "The model weights have been loaded."
print(" Greutățile modelului au fost încărcate.")


# Inference encoder: exposes the per-step outputs and the two projected states.
# The decoding code visible in this notebook only uses the two states.
encoder_model_inf = Model(encoder_inputs, [encoder_outputs, state_h, state_c])

# Inference decoder: consumes one token per call and takes the states of both
# LSTM layers as explicit inputs so the caller can carry them between steps.
decoder_single_input = Input(shape=(1,), name='decoder_single_input')
dec_emb_single = decoder_embedding(decoder_single_input)
dec_state_h1 = Input(shape=(latent_dim,), name='dec_state_h1')
dec_state_c1 = Input(shape=(latent_dim,), name='dec_state_c1')
dec_state_h2 = Input(shape=(latent_dim,), name='dec_state_h2')
dec_state_c2 = Input(shape=(latent_dim,), name='dec_state_c2')

dec_out_1_step, new_h1, new_c1 = decoder_lstm_1(dec_emb_single, initial_state=[dec_state_h1, dec_state_c1])
# A new Dropout layer rather than the one used in the training graph.
# NOTE(review): expected to be a no-op under predict() - confirm.
dec_out_1_step = Dropout(0.3)(dec_out_1_step)
dec_out_2_step, new_h2, new_c2 = decoder_lstm_2(dec_out_1_step, initial_state=[dec_state_h2, dec_state_c2])
dec_pred_step = decoder_dense(dec_out_2_step)

# Inputs: current token + four states. Outputs: token probabilities + updated states.
decoder_model_inf = Model(
    [decoder_single_input, dec_state_h1, dec_state_c1, dec_state_h2, dec_state_c2],
    [dec_pred_step, new_h1, new_c1, new_h2, new_c2]
)

 Greutățile modelului au fost încărcate.


In [22]:
def sample_with_temperature(preds, temperature=1.0):
    """Draw an index from a probability vector rescaled by a temperature.

    The probabilities are turned into log-probabilities, divided by
    ``temperature`` and re-normalised with a softmax; an index is then drawn
    from the resulting distribution with ``np.random.choice``. Values below 1
    sharpen the distribution, values above 1 flatten it.

    Args:
        preds: 1-D sequence of probabilities (e.g. a softmax output).
        temperature: Rescaling factor. A value of 0 or below selects the most
            probable index deterministically instead of sampling.

    Returns:
        The selected index as a NumPy integer.

    Raises:
        ValueError: If ``preds`` is empty.
    """
    probs = np.asarray(preds).astype('float64')

    # Greedy decoding: dividing by a zero temperature would otherwise produce
    # NaN probabilities and make np.random.choice fail.
    if temperature <= 0:
        return np.argmax(probs)

    # Clamp first so that log() never sees zero or negative values.
    logits = np.log(np.maximum(probs, 1e-8)) / temperature
    # Shift so the largest logit is 0: the softmax is unchanged, but exp()
    # can no longer underflow every entry to 0 at very small temperatures.
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probs = weights / np.sum(weights)
    return np.random.choice(len(probs), p=probs)

In [26]:
def translate_english_to_dinosaur(english_text, temperature=0.1, max_output_length=None):
    """Translate one English sentence, decoding one character at a time.

    Relies on notebook-level state: ``normalize_text``, the vocabulary
    dictionaries, ``encoder_model_inf`` / ``decoder_model_inf`` and
    ``sample_with_temperature``.

    Args:
        english_text: Sentence to translate. It is passed through
            ``normalize_text``; characters missing from the input vocabulary
            are dropped and the rest is cut to ``max_encoder_seq_length``.
        temperature: Forwarded to ``sample_with_temperature`` at every step.
        max_output_length: Nominal output length; defaults to
            ``max_decoder_seq_length``. The decoding loop allows 10 further
            steps beyond this value.

    Returns:
        The decoded string with surrounding whitespace stripped. An empty
        string is returned when no character of the input is in the
        vocabulary.
    """
    if max_output_length is None:
        max_output_length = max_decoder_seq_length

    normalized = normalize_text(english_text)
    seq = [input_token_index[c] for c in normalized if c in input_token_index]
    if not seq:
        # Nothing the encoder can read: an all-padding sequence would only
        # yield an arbitrary translation, so report "no output" instead.
        return ''
    seq = seq[:max_encoder_seq_length]
    encoder_seq = pad_sequences([seq], maxlen=max_encoder_seq_length, padding='post')

    # Only the projected states are needed; the per-step encoder outputs are ignored.
    _, enc_h_proj, enc_c_proj = encoder_model_inf.predict(encoder_seq, verbose=0)
    h1, c1 = enc_h_proj, enc_c_proj
    # The second decoder LSTM starts from zeros; in the full model it is
    # called without an explicit initial state.
    h2, c2 = np.zeros((1, latent_dim)), np.zeros((1, latent_dim))
    start_idx = target_token_index[start_token]
    target_seq = np.array([[start_idx]], dtype='int32')

    decoded_chars = []
    for _ in range(max_output_length + 10):
        preds, new_h1, new_c1, new_h2, new_c2 = decoder_model_inf.predict(
            [target_seq, h1, c1, h2, c2], verbose=0
        )
        # Batch of one, one time step: keep just the vocabulary distribution.
        preds = preds[0, 0]
        sampled_idx = sample_with_temperature(preds, temperature=temperature)
        # Id 0 is the padding id and maps to no character: stop.
        if sampled_idx == 0:
            break
        sampled_char = reverse_target_char_index.get(sampled_idx, '')
        # Stop on the end marker or on an id with no character.
        if sampled_char == end_token or sampled_char == '':
            break
        decoded_chars.append(sampled_char)
        # Feed the sampled character and the updated states into the next step.
        target_seq = np.array([[sampled_idx]], dtype='int32')
        h1, c1, h2, c2 = new_h1, new_c1, new_h2, new_c2

    return ''.join(decoded_chars).strip()

In [None]:

# Translate the first rows of the test file and save the results as a CSV.
output_folder = "translations"
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "output.csv")

test_input_path = 'Datasets/test-input.csv'
max_test_phrases = 28  # only this many leading rows are translated

try:
    test_df = pd.read_csv(test_input_path)
except FileNotFoundError:
    # Silently ignoring the error would leave `test_phrases` undefined on a
    # fresh kernel (or stale from an earlier run), so say what happened.
    # Message (Romanian): "Test file not found; nothing was translated."
    test_df = None
    print(f"⚠️ Fișierul de test '{test_input_path}' nu a fost găsit; nu s-a tradus nimic.")

if test_df is None:
    # Keep the names defined so that later cells do not fail with NameError.
    test_phrases = []
    translated_sentences = []
else:
    # Prefer a conventionally named column; otherwise use the first column.
    english_column = next((c for c in ['english', 'text', 'sentence', 'input', 'phrase']
                           if c in test_df.columns), test_df.columns[0])
    test_phrases = test_df[english_column].head(max_test_phrases).tolist()

    print("\nTESTARE TRADUCERI")
    translated_sentences = []

    for i, phrase in enumerate(test_phrases, 1):
        pred = translate_english_to_dinosaur(phrase, temperature=0.1)
        translated_sentences.append(pred)
        print(f"[{i}] EN: '{phrase}' -> DN: '{pred}'")

    pd.DataFrame({"dinosaur_translation": translated_sentences}).to_csv(output_file, index=False)
    print(f"\n✅ Traducerile au fost salvate în '{output_file}'")


TESTARE TRADUCERI
[1] EN: 'Sigmoid organized jurassic AI conferences.' -> DN: 'sriigmrooriid roorgraanriizrooriid riintreellriigreencree .'
[2] EN: 'When did Sigmoid teach jurassic AI?' -> DN: 'whreen driid scriireentriistsraaraar traaruught nreeruurraal ?'
[3] EN: 'The jurassic era democratized AI education.' -> DN: 'zraa jruurraassriic preerriirood hraad draatraa scriireencree krraallreengreesraaraar .'
[4] EN: 'The jurassic period had high sea levels.' -> DN: 'zraa jruurraassriic preerriirood hraad draatraa scriireencree krraallreengreesraaraar .'
[5] EN: 'Dinosaurs learned about neural networks.' -> DN: 'driinroosraaruursraaraar lreeraarnrooriid raabrooruut nreetwroorksraaraar .'
[6] EN: 'Who studied the extinction event?' -> DN: 'whreen driid zraa jruurraassriic preerriirood hraad draatraa scriireencree krraallreengreesraaraar .'
[7] EN: 'Sigmoid taught jurassic computer vision.' -> DN: 'sriigmrooriid traaruught riintreellriigreencree croompruutreer .'
[8] EN: 'The triassic perio

In [28]:
# Translate every test phrase again and store the results in a second CSV.
# Requires `test_phrases` from the cell that reads the test file.
output_folder = "translations"
os.makedirs(output_folder, exist_ok=True)
output_file = os.path.join(output_folder, "translated_sentences.csv")

translated_sentences = [
    translate_english_to_dinosaur(phrase, temperature=0.1)
    for phrase in test_phrases
]

# Write the file once, after all phrases are translated, instead of
# rewriting the whole CSV on every loop iteration.
pd.DataFrame({"dinosaur_translation": translated_sentences}).to_csv(output_file, index=False)
