In [None]:
import os 
import re
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Bidirectional, Concatenate, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# CSV file holding the parallel corpus; it is read with pd.read_csv.
file_path = 'dinosaur_dataset.csv'
# Output width of the encoder and decoder Embedding layers.
embedding_dim = 128
# Unit count of every LSTM and of the Dense layers that project the encoder state.
latent_dim = 512
# Training-loop settings handed to model.fit.
batch_size = 128
epochs = 40
validation_split = 0.2
# Sequence markers: start_token is prepended and end_token appended to every
# target string when target_texts is built.
start_token = '\t'
end_token = '\n'


In [None]:
# Fail fast with an explicit error when the dataset is missing. Printing a
# message and carrying on only postponed the failure to the pd.read_csv call.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Dataset file not found: {file_path!r}")
	

In [None]:
# Load the parallel corpus and drop rows where either side of a pair is missing.
# The previous call passed `ibplace=True`, a misspelling of `inplace` that
# DataFrame.dropna rejects with a TypeError; reassigning the result removes
# the need for the in-place flag altogether.
df = pd.read_csv(file_path)
df = df.dropna(subset=['dinosaur', 'english'])


In [None]:
def normalize_text(text):
    """Return a lower-cased, cleaned-up version of ``text``.

    The value is converted with ``str()``, trimmed and lower-cased. Every
    character other than a-z, 0-9, whitespace and ``? . ! , ; : ' -`` is
    removed. The marks ``? . ! , ; :`` are then surrounded by spaces so each
    one stands alone, and runs of whitespace are collapsed to a single space.
    """
    def _squeeze(value):
        # Collapse whitespace runs and trim both ends.
        return re.sub(r'\s+', ' ', value).strip()

    lowered = str(text).strip().lower()
    # Keep letters, plain digits, punctuation marks and whitespace only.
    kept = re.sub(r'[^a-z0-9\?\.\!,;:\'\-\s]', '', lowered)
    spaced = re.sub(r'([?.!,;:])', r' \1 ', _squeeze(kept))
    return _squeeze(spaced)

In [None]:
# Run both sides of the corpus through the same cleaning routine.
for column in ('english', 'dinosaur'):
    df[column] = df[column].apply(normalize_text)

In [None]:
# Source strings are taken as they are; each target string is wrapped in the
# start and end markers.
input_texts = df['english'].to_list()
target_texts = [
    ''.join((start_token, text, end_token))
    for text in df['dinosaur'].to_list()
]

In [None]:
# Character vocabularies: every distinct character seen on each side, sorted.
input_chars = sorted({ch for text in input_texts for ch in text})
target_chars = sorted({ch for text in target_texts for ch in text})

In [None]:
# Map each character to an integer id; ids start at 1, so no character gets id 0.
input_token_index = {char: idx for idx, char in enumerate(input_chars, start=1)}
target_token_index = {char: idx for idx, char in enumerate(target_chars, start=1)}

In [None]:
# Make sure both sequence markers own an id, giving a missing one the next
# id after the current maximum (1 when the index is empty).
for marker in (start_token, end_token):
    if marker not in target_token_index:
        target_token_index[marker] = max(target_token_index.values(), default=0) + 1

In [None]:
# Inverse lookups: integer id -> character.
reverse_target_char_index = dict(zip(target_token_index.values(), target_token_index.keys()))
reverse_input_char_index = dict(zip(input_token_index.values(), input_token_index.keys()))

# Vocabulary sizes; the extra slot accounts for the padding id 0.
num_encoder_tokens = 1 + len(input_token_index)
num_decoder_tokens = 1 + len(target_token_index)

# Longest sequence on each side, measured in characters.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

print(f"Exemple: {len(input_texts)}")
print(f"Vocab encoder (incl pad): {num_encoder_tokens}")
print(f"Vocab decoder (incl pad): {num_decoder_tokens}")
print(f"Max encoder len: {max_encoder_seq_length}")
print(f"Max decoder len: {max_decoder_seq_length}")

In [None]:
def texts_to_sequences(texts, token_index):
    """Encode every text as a list of integer ids.

    Args:
        texts: iterable of strings to encode.
        token_index: mapping from a character to its integer id.

    Returns:
        A list with one list of ids per input text, in the same order.
        Characters that are not keys of ``token_index`` are skipped.
    """
    return [
        [token_index[ch] for ch in text if ch in token_index]
        for text in texts
    ]

# Encoder side: characters -> ids, then right-pad to the longest source.
encoder_seq = texts_to_sequences(input_texts, input_token_index)
encoder_input_data = pad_sequences(
    encoder_seq,
    maxlen=max_encoder_seq_length,
    padding='post',
)

# Decoder side: same treatment for the marker-wrapped target strings.
# NOTE(review): the decoder input keeps the trailing end marker, so the step
# that reads it is paired with padding id 0 in the shifted targets - confirm
# this is intended.
decoder_seq = texts_to_sequences(target_texts, target_token_index)
decoder_input_data = pad_sequences(
    decoder_seq,
    maxlen=max_decoder_seq_length,
    padding='post',
)

In [None]:
# Targets are the decoder sequences shifted left by one step (first id
# dropped) and right-padded with 0 up to max_decoder_seq_length.
decoder_target_seq = [
    seq[1:] + [0] * (max_decoder_seq_length - len(seq[1:]))
    for seq in decoder_seq
]
decoder_target_data = np.array(decoder_target_seq, dtype='int32')


In [None]:
# Symbolic model inputs: variable-length sequences of token ids.
encoder_inputs = Input(shape=(None,), name='encoder_inputs')   # (batch, src_len)
decoder_inputs = Input(shape=(None,), name='decoder_inputs')   # (batch, targ_len)


# One Embedding layer per side, each sized to its own vocabulary.
# mask_zero=True marks id 0 (the padding value) as masked.
encoder_embedding = Embedding(input_dim=num_encoder_tokens, output_dim=embedding_dim, mask_zero=True, name='encoder_embedding')
decoder_embedding = Embedding(input_dim=num_decoder_tokens, output_dim=embedding_dim, mask_zero=True, name='decoder_embedding')

enc_embedded = encoder_embedding(encoder_inputs)  # (batch, src_len, emb)
dec_embedded = decoder_embedding(decoder_inputs)  # (batch, targ_len, emb)


# Bidirectional LSTM encoder. return_state=True makes the layer also return
# the final states, which are unpacked where the layer is applied.
encoder_bi = Bidirectional(
    LSTM(latent_dim, return_sequences=True, return_state=True, dropout=0.2, recurrent_dropout=0.1),
    name='encoder_bidirectional'
)

In [None]:

# Apply the encoder: element 0 is the output sequence, elements 1-4 are
# unpacked as the forward h/c and backward h/c states.
enc_outputs_and_states = encoder_bi(enc_embedded)
# encoder_outputs is not referenced again in the visible cells.
encoder_outputs = enc_outputs_and_states[0]
state_f_h, state_f_c, state_b_h, state_b_c = enc_outputs_and_states[1:5]


# Join the forward and backward states of each kind.
state_h_concat = Concatenate()([state_f_h, state_b_h])
state_c_concat = Concatenate()([state_f_c, state_b_c])

# Project each joined state through a tanh Dense layer with latent_dim units;
# the results are used as the initial state of the first decoder LSTM.
state_h = Dense(latent_dim, activation='tanh', name='state_h_projection')(state_h_concat)
state_c = Dense(latent_dim, activation='tanh', name='state_c_projection')(state_c_concat)
encoder_states_for_decoder = [state_h, state_c]


# First decoder LSTM, initialised with the projected encoder states.
decoder_lstm_1 = LSTM(latent_dim, return_sequences=True, return_state=True,
                      dropout=0.2, recurrent_dropout=0.1, name='decoder_lstm_1')
decoder_out_1, dec_h1, dec_c1 = decoder_lstm_1(dec_embedded, initial_state=encoder_states_for_decoder)
# Dropout between the two decoder LSTM layers.
decoder_dropout = Dropout(0.3, name='decoder_dropout')(decoder_out_1)

# Second decoder LSTM; it is called without an explicit initial_state.
decoder_lstm_2 = LSTM(latent_dim, return_sequences=True, return_state=True,
                      dropout=0.2, recurrent_dropout=0.1, name='decoder_lstm_2')
decoder_out_2, dec_h2, dec_c2 = decoder_lstm_2(decoder_dropout)


# Softmax over the decoder vocabulary, applied at every timestep.
# TimeDistributed is not among the names imported at the top of the notebook,
# so the bare name raised NameError; it is reached through the already
# imported `tf` namespace instead.
decoder_dense = tf.keras.layers.TimeDistributed(
    Dense(num_decoder_tokens, activation='softmax'),
    name='decoder_output',
)
decoder_outputs = decoder_dense(decoder_out_2)  # (batch, targ_len, vocab_size)


# Training model: takes the encoder and decoder id sequences and returns the
# per-timestep distribution over decoder tokens.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:

# Prefer AdamW, looking first at its stable location and then at the
# experimental one; if neither can be built, fall back to plain Adam.
for locate_adamw in (
    lambda: tf.keras.optimizers.AdamW,
    lambda: tf.keras.optimizers.experimental.AdamW,
):
    try:
        optimizer = locate_adamw()(learning_rate=1e-3, weight_decay=1e-5)
    except Exception:
        continue
    print("Folosesc AdamW optimizer.")
    break
else:
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    print("AdamW indisponibil, folosesc Adam optimizer (fallback).")


# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
print("\nðŸ“Š Arhitectura (training):")
model.summary()

In [None]:

# Training callbacks, both watching val_loss:
#   - early_stop ends training after 15 epochs without improvement and
#     restores the best weights seen;
#   - reduce_lr halves the learning rate after 7 stagnant epochs, with a
#     floor of 1e-6.
# Neither class is among the names imported at the top of the notebook, so
# the bare names raised NameError; they are reached through the already
# imported `tf` namespace instead.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=15, restore_best_weights=True, verbose=1
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=7, min_lr=1e-6, verbose=1
)

# --------------------------

# Gather the fit arguments first so the call itself stays short.
training_inputs = [encoder_input_data, decoder_input_data]
fit_options = dict(
    batch_size=batch_size,
    epochs=epochs,
    validation_split=validation_split,
    callbacks=[early_stop, reduce_lr],
    shuffle=True,
    verbose=1,
)

print(f"\nðŸš€ ÃŽncepe antrenamentul cu {len(encoder_input_data)} exemple...")
history = model.fit(training_inputs, decoder_target_data, **fit_options)


# Persist the trained model to an HDF5 file.
model.save('dino_translator_bidirectional_emb.h5')
print("\nâœ… Model salvat: 'dino_translator_bidirectional_emb.h5'")
