In [1]:
import os
import sys
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)
import numpy as np
import pickle
import os



import re
from collections import Counter

# Define the Tokenize class to match what was used during training
class Tokenize:
    """Word-level tokenizer that maps whitespace-separated words to integer ids.

    Ids 0-3 are reserved for ``<PAD>``, ``<START>``, ``<END>`` and ``<UNK>``.
    Text is normalised with the module-level ``clean_text`` before splitting.

    Attributes:
        max_tokens: Optional cap on vocabulary size (special tokens included).
        is_bengali: Forwarded to ``clean_text`` to select the cleaning rules.
        word_to_idx: Mapping word -> id.
        idx_to_word: Mapping id -> word.
        len_vocab: Number of entries currently in the vocabulary.
        fit_done: True once ``fit`` has completed.
    """

    def __init__(self, max_tokens=None, is_bengali=False):
        self.max_tokens = max_tokens
        self.is_bengali = is_bengali

        # The position of each special token in this tuple is its id.
        specials = ('<PAD>', '<START>', '<END>', '<UNK>')
        self.word_to_idx = {token: idx for idx, token in enumerate(specials)}
        self.idx_to_word = dict(enumerate(specials))
        self.len_vocab = len(specials)
        self.fit_done = False

    def fit(self, texts):
        """Build the vocabulary from an iterable of raw texts.

        Words are added in descending frequency order. When ``max_tokens`` is
        truthy, only the ``max_tokens - 4`` most frequent words are considered
        (4 slots are taken by the special tokens).
        """
        frequencies = Counter()
        for raw in texts:
            frequencies.update(clean_text(raw, self.is_bengali).split())

        # most_common(None) returns every word, same as calling it bare.
        limit = self.max_tokens - 4 if self.max_tokens else None

        for word, _count in frequencies.most_common(limit):
            if word in self.word_to_idx:
                continue
            new_id = self.len_vocab
            self.word_to_idx[word] = new_id
            self.idx_to_word[new_id] = word
            self.len_vocab = new_id + 1

        self.fit_done = True

    def __call__(self, texts):
        """Tokenize texts.

        Args:
            texts: Iterable of raw texts.

        Returns:
            A pair ``(all_tokens, all_ids)``: for each text, the list of tokens
            wrapped in ``<START>``/``<END>``, and the matching list of ids.
            Words missing from the vocabulary map to the ``<UNK>`` id.

        Raises:
            ValueError: If ``fit`` has not been run (``fit_done`` is False).
        """
        if not self.fit_done:
            raise ValueError("You must call fit() before tokenizing texts")

        token_rows = []
        id_rows = []
        for raw in texts:
            words = ['<START>', *clean_text(raw, self.is_bengali).split(), '<END>']
            unk_id = self.word_to_idx['<UNK>']
            token_rows.append(words)
            id_rows.append([self.word_to_idx.get(word, unk_id) for word in words])

        return token_rows, id_rows

# Define clean_text function to match what was used during training
def clean_text(text, is_bengali=False):
    """Normalise text for tokenization.

    Punctuation is replaced by spaces and runs of whitespace are collapsed.
    English text is additionally lower-cased; Bengali text keeps only
    characters in the Unicode range U+0980-U+09FF plus whitespace.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``; ``None`` and NaN are treated as missing.
        is_bengali: Use the Bengali rule set instead of the English one.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    if text is None:
        return ""

    if not isinstance(text, str):
        # NaN is the only scalar that is unequal to itself. Plain floats and
        # numpy scalars have no ``isna`` method, so a self-comparison is the
        # reliable way to detect them. Objects whose comparison cannot be
        # reduced to a single bool are simply stringified.
        try:
            is_nan = bool(text != text)
        except (TypeError, ValueError):
            is_nan = False
        if is_nan:
            return ""
        text = str(text)

    if not is_bengali:
        # English cleaning
        text = text.lower()
        text = re.sub(r"[^\w\s]", " ", text)
    else:
        # Bengali cleaning: remove punctuation, keep Bengali chars
        text = re.sub(r"[^\u0980-\u09FF\s]", " ", text)

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Define masked loss function
def masked_loss(y_true, y_pred):
    """Sparse categorical cross-entropy averaged over non-padding positions.

    Positions where ``y_true`` equals 0 are excluded from the average.
    ``y_pred`` is interpreted as logits (``from_logits=True``).
    """
    per_position = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(y_true, y_pred)

    # 1.0 where the label is a real token, 0.0 where it is padding (id 0).
    keep = tf.cast(y_true != 0, per_position.dtype)
    per_position = per_position * keep

    total = tf.reduce_sum(per_position)
    count = tf.reduce_sum(keep)
    return total / count

# Define masked accuracy function
def masked_acc(y_true, y_pred):
    """Token-level accuracy over non-padding positions.

    Args:
        y_true: Integer label ids; id 0 marks padding.
        y_pred: Per-class scores; the predicted id is the argmax over the
            last axis.

    Returns:
        Number of correctly predicted non-padding positions divided by the
        number of non-padding positions.
    """
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    mask = tf.cast(y_true != 0, tf.float32)
    # Restrict matches to real tokens. Without the mask, positions where both
    # the label and the prediction are 0 (padding) count as hits while being
    # absent from the denominator, so the metric could exceed 1.
    match = tf.cast(y_true == y_pred, tf.float32) * mask

    return tf.reduce_sum(match)/tf.reduce_sum(mask)

# Define prediction function with beam search (similar to original notebook)
def _to_log_probs(scores):
    """Convert one vocabulary-sized score vector to log-probabilities."""
    scores = np.asarray(scores, dtype=np.float64)
    # A row that is already a probability distribution is only logged (the
    # 1e-10 guards against log(0)).
    if scores.min() >= 0 and abs(scores.sum() - 1.0) <= 1e-3:
        return np.log(scores + 1e-10)
    # Otherwise treat the row as raw logits and apply a numerically stable
    # log-softmax. NOTE(review): masked_loss uses from_logits=True, which
    # suggests the model may emit logits - confirm against the saved model.
    shifted = scores - scores.max()
    return shifted - np.log(np.exp(shifted).sum())


def predict_with_beam_search(text, model, tokenize_en, tokenize_bn, max_len, clean_text_func, start_token, end_token, beam_width=3):
    """
    Translate one sentence with beam search.

    Args:
        text: Input English text
        model: Object whose ``predict([encoder_ids, decoder_ids], verbose=0)``
            returns an array indexed as ``[batch][position][token_id]``
        tokenize_en: English tokenizer; called with a list of texts and
            expected to return ``(tokens, ids)``
        tokenize_bn: Bengali tokenizer; only its ``word_to_idx`` and
            ``idx_to_word`` mappings are used
        max_len: Length both id sequences are padded/truncated to
        clean_text_func: Text cleaning function applied to ``text``
        start_token: Start token string
        end_token: End token string
        beam_width: Number of hypotheses kept after every step

    Returns:
        The words of the best-scoring hypothesis joined by spaces, or the
        string "Translation failed." if any exception occurs (the error is
        printed, not raised).
    """
    try:
        # Clean and tokenize input text
        clean_text_en = clean_text_func(text)
        _, sent_id_en = tokenize_en([clean_text_en])
        sent_id_en = sent_id_en[0]

        # Truncate or right-pad the encoder input with 0 to exactly max_len
        encoder_ids = list(sent_id_en[:max_len])
        encoder_ids += [0] * (max_len - len(encoder_ids))
        encoder_input = np.array([encoder_ids], dtype=np.int32)

        start_id = tokenize_bn.word_to_idx[start_token]
        end_id = tokenize_bn.word_to_idx[end_token]

        # Decoder input starts as <start> followed by padding
        initial = np.zeros(max_len, dtype=np.int32)
        initial[0] = start_id

        # Each hypothesis is (sequence, cumulative log-probability, finished).
        # The explicit flag keeps a finished hypothesis frozen for the rest of
        # the search; checking only the token at the current position would
        # let it be expanded again one step after it emitted the end token.
        beams = [(initial, 0.0, False)]

        for i in range(max_len - 1):
            expanded = []

            for seq, score, finished in beams:
                if finished:
                    expanded.append((seq, score, True))
                    continue

                # Scores for the token following position i
                prediction = model.predict(
                    [encoder_input, np.array([seq])],
                    verbose=0)
                step_scores = prediction[0][i]
                log_probs = _to_log_probs(step_scores)

                # Extend the hypothesis with each of the top-k tokens
                for p in np.argsort(step_scores)[-beam_width:]:
                    new_seq = seq.copy()
                    new_seq[i + 1] = p
                    expanded.append(
                        (new_seq, score + float(log_probs[p]), int(p) == end_id))

            # Keep the best beam_width hypotheses
            beams = sorted(expanded, key=lambda b: b[1], reverse=True)[:beam_width]

            if all(finished for _, _, finished in beams):
                break

        best_seq = beams[0][0]

        # Skip the start token; stop at the end token or at padding
        result = []
        for token_id in best_seq[1:]:
            token_id = int(token_id)
            if token_id == end_id or token_id == 0:
                break
            result.append(tokenize_bn.idx_to_word[token_id])

        return ' '.join(result)

    except Exception as e:
        # Best-effort contract: callers receive a sentinel string, not an error
        print(f"Error during beam search: {e}")
        return "Translation failed."

# Constants
START_TOKEN = '<START>'
END_TOKEN = '<END>'
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'
MAX_LENGTH = 40  # Default, will be overridden if model_params is loaded
BEAM_WIDTH = 3   # Default beam width for search

# Check if required files exist
print("Checking for required files...")
required_files = ['seq2seq_model.keras']
missing_files = [f for f in required_files if not os.path.exists(f)]

if missing_files:
    print(f"Error: Missing required files: {missing_files}")
    # sys.exit rather than exit(): the latter is injected by the `site`
    # module for interactive use and is not guaranteed to exist in programs.
    sys.exit(1)

# Try to load model parameters if available
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load .pkl artifacts that were produced by a trusted training run.
try:
    if os.path.exists('model_params.pkl'):
        with open('model_params.pkl', 'rb') as f:
            model_params = pickle.load(f)
        # Assumes model_params is a dict-like object with an optional
        # 'max_length' entry - TODO confirm against the training notebook.
        MAX_LENGTH = model_params.get('max_length', MAX_LENGTH)
        print(f"Loaded model parameters. Using max_length={MAX_LENGTH}")
    else:
        print(f"model_params.pkl not found. Using default max_length={MAX_LENGTH}")
except Exception as e:
    # A broken parameter file is not fatal: keep the default length.
    print(f"Warning: Could not load model parameters: {e}")
    print(f"Using default max_length={MAX_LENGTH}")

# Load tokenizers if available, otherwise create new ones
try:
    tokenize_en = None
    tokenize_bn = None

    if os.path.exists('tokenize_en.pkl') and os.path.exists('tokenize_bn.pkl'):
        try:
            # Same pickle caveat as above: trusted files only.
            with open('tokenize_en.pkl', 'rb') as f:
                tokenize_en = pickle.load(f)
            with open('tokenize_bn.pkl', 'rb') as f:
                tokenize_bn = pickle.load(f)
            print("Loaded tokenizers successfully.")
        except Exception as e:
            print(f"Error loading tokenizers: {e}")
            print("Will create tokenizers from scratch.")

    # If either tokenizer is unavailable, both are replaced so the pair is
    # always built the same way.
    if tokenize_en is None or tokenize_bn is None:
        print("Creating new tokenizers (Note: these won't match the trained model's vocabulary)")
        # Create minimal tokenizers for testing only
        tokenize_en = Tokenize(is_bengali=False)
        tokenize_bn = Tokenize(is_bengali=True)

        # Add minimal vocabulary
        for tokenizer in [tokenize_en, tokenize_bn]:
            tokenizer.fit_done = True  # Mark as fitted even though we're not really fitting

        print("Created test tokenizers with minimal vocabulary.")
        print("WARNING: Translations will be very limited without proper tokenizers!")
except Exception as e:
    print(f"Error setting up tokenizers: {e}")
    sys.exit(1)

# Load the model
print("Loading model...")
try:
    # Custom objects dictionary for loading model with custom functions
    custom_objects = {
        'masked_loss': masked_loss,
        'masked_acc': masked_acc
    }

    model = tf.keras.models.load_model('seq2seq_model.keras', custom_objects=custom_objects)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    sys.exit(1)

# Define a simple translation function using beam search
def translate(english_text):
    """Translate one English sentence to Bengali.

    Delegates to ``predict_with_beam_search`` using the module-level model,
    tokenizers, ``MAX_LENGTH`` and the current ``BEAM_WIDTH``.
    """
    bengali_text = predict_with_beam_search(
        text=english_text,
        model=model,
        tokenize_en=tokenize_en,
        tokenize_bn=tokenize_bn,
        max_len=MAX_LENGTH,
        clean_text_func=clean_text,
        start_token=START_TOKEN,
        end_token=END_TOKEN,
        beam_width=BEAM_WIDTH,
    )
    return bengali_text

# Improved translation interface
def run_translation_interface():
    """Run the interactive English-to-Bengali translation menu.

    The menu is printed once, then choices are read from standard input until
    the user picks "6" or the input stream is closed. Option 4 rebinds the
    module-level ``BEAM_WIDTH``. Ctrl+C cancels the current operation and
    returns to the prompt. Other errors are printed and the loop continues.
    """
    global BEAM_WIDTH

    print("\n===== English to Bengali Translation Interface =====")
    print("Options:")
    print("1. Single sentence translation")
    print("2. Batch translation from file")
    print("3. Save translations to file")
    print(f"4. Adjust beam width (currently: {BEAM_WIDTH})")
    print("5. Toggle verbose mode (currently OFF)")
    print("6. Exit")

    verbose_mode = False

    while True:
        try:
            # Surrounding whitespace (e.g. "1 ") should not invalidate a choice
            choice = input("\nEnter your choice (1-6): ").strip()

            if choice == '1':
                # Single sentence translation
                text = input("\nEnter English text to translate: ")
                print(f"\nOriginal: \"{text}\"")

                if verbose_mode:
                    print(f"Processing translation with beam width {BEAM_WIDTH}...")
                    # Show cleaned text in verbose mode
                    cleaned = clean_text(text)
                    print(f"Cleaned text: \"{cleaned}\"")

                translation = translate(text)
                print(f"Bengali translation: \"{translation}\"")

            elif choice == '2':
                # Batch translation from file
                filename = input("\nEnter input file path: ")

                # Read the input on its own so "File not found" can only
                # refer to the input file, never to the output path below.
                try:
                    with open(filename, 'r', encoding='utf-8') as f:
                        lines = f.readlines()
                except FileNotFoundError:
                    print(f"File not found: {filename}")
                    continue
                except Exception as e:
                    print(f"Error processing file: {e}")
                    continue

                if not lines:
                    print("File is empty.")
                    continue

                try:
                    print(f"\nTranslating {len(lines)} lines from {filename} with beam width {BEAM_WIDTH}...")
                    translations = []

                    for i, line in enumerate(lines):
                        line = line.strip()
                        if not line:
                            continue

                        if verbose_mode:
                            print(f"\nLine {i+1}: \"{line}\"")

                        translation = translate(line)
                        translations.append((line, translation))

                        if verbose_mode:
                            print(f"Translation: \"{translation}\"")

                    print(f"\nCompleted {len(translations)} translations.")

                    # Display first 5 translations
                    print("\nSample translations:")
                    for i, (original, translated) in enumerate(translations[:5]):
                        print(f"{i+1}. \"{original}\" → \"{translated}\"")

                    # Ask if user wants to save results
                    save = input("\nSave translations to file? (y/n): ")
                    if save.lower() == 'y':
                        output_file = input("Enter output file path: ")
                        with open(output_file, 'w', encoding='utf-8') as f:
                            for original, translated in translations:
                                f.write(f"{original}\t{translated}\n")
                        print(f"Translations saved to {output_file}")
                except EOFError:
                    # Let the outer handler end the session on closed stdin
                    raise
                except Exception as e:
                    print(f"Error processing file: {e}")

            elif choice == '3':
                # Save translations to file
                output_file = input("\nEnter output file path: ")
                print("Enter English sentences (one per line). Type 'DONE' on a new line when finished.")

                sentences = []
                while True:
                    line = input()
                    if line == 'DONE':
                        break
                    sentences.append(line)

                if not sentences:
                    print("No sentences provided.")
                    continue

                try:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        for sentence in sentences:
                            translation = translate(sentence)
                            f.write(f"{sentence}\t{translation}\n")
                    print(f"\nTranslated {len(sentences)} sentences and saved to {output_file}")
                except Exception as e:
                    print(f"Error saving translations: {e}")

            elif choice == '4':
                # Adjust beam width
                try:
                    new_width = int(input(f"\nCurrent beam width is {BEAM_WIDTH}. Enter new beam width (1-10): "))
                    if 1 <= new_width <= 10:
                        BEAM_WIDTH = new_width
                        print(f"Beam width set to {BEAM_WIDTH}")
                    else:
                        print("Beam width must be between 1 and 10")
                except ValueError:
                    print("Please enter a valid number")

            elif choice == '5':
                # Toggle verbose mode
                verbose_mode = not verbose_mode
                print(f"Verbose mode turned {'ON' if verbose_mode else 'OFF'}")

            elif choice == '6':
                # Exit
                print("\nExiting translation interface. Goodbye!")
                break

            else:
                print("Invalid choice. Please enter a number between 1 and 6.")

        except KeyboardInterrupt:
            print("\nOperation cancelled.")
        except EOFError:
            # input() raises EOFError on every call once stdin is closed.
            # Treating it like any other error would print it forever.
            print("\nInput stream closed. Exiting translation interface.")
            break
        except Exception as e:
            print(f"Error: {e}")
            




# Entry point of the cell/script: announce the session, then hand control to
# the interactive menu. This runs at module level, so it executes whenever the
# file is run or imported.
print("\n----- Testing English to Bengali Translation -----")


# Run the interface (blocks until the user chooses "6. Exit")
run_translation_interface()

2025-04-18 11:57:33.742640: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-18 11:57:33.752190: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744977453.765686  171041 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744977453.769182  171041 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744977453.778910  171041 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Checking for required files...
Loaded model parameters. Using max_length=40
Loaded tokenizers successfully.
Loading model...


I0000 00:00:1744977455.962708  171041 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9
  saveable.load_own_variables(weights_store.get(inner_path))


Model loaded successfully.

----- Testing English to Bengali Translation -----

===== English to Bengali Translation Interface =====
Options:
1. Single sentence translation
2. Batch translation from file
3. Save translations to file
4. Adjust beam width (currently: 3)
5. Toggle verbose mode (currently OFF)
6. Exit

Original: "a little girl climbed up a tree"
Bengali translation: "একটি ছোট মেয়ে গাছে উঠছে"

Original: "i love you"
Bengali translation: "আমি আপনাকে ভালোবাসি"

Original: "you are brave"
Bengali translation: "আপনারা চিৎকার করছেন"

Exiting translation interface. Goodbye!
