In [7]:
from transformers import AutoTokenizer, TFAutoModelForMaskedLM, Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling
from datasets import load_dataset
import tensorflow as tf

# Function to load the appropriate model based on selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    elif language == 'tr':  # Turkish BERT model
        model_name = "dbmdz/bert-base-turkish-uncased"  # Example for Turkish
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Function to preprocess and tokenize the dataset
def tokenize_and_mask(dataset, tokenizer, max_length=512):
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=max_length)
    
    # Tokenizing the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    
    return tokenized_datasets

# Fine-tune the model on the selected dataset
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results'):
    training_args = TrainingArguments(
        output_dir=output_dir,          # output directory
        num_train_epochs=3,             # number of training epochs
        per_device_train_batch_size=8,  # batch size for training
        per_device_eval_batch_size=8,   # batch size for evaluation
        warmup_steps=500,               # number of warmup steps for learning rate scheduler
        weight_decay=0.01,              # strength of weight decay
        logging_dir='./logs',           # directory for storing logs
        logging_steps=10,
        save_steps=10_000,
        eval_steps=500,
        evaluation_strategy="steps",
        save_total_limit=2,
    )

    # Prepare data collator for MLM
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,                         # the model to train
        args=training_args,                  # training arguments
        train_dataset=tokenized_datasets['train'],  # training dataset
        eval_dataset=tokenized_datasets['test'],   # evaluation dataset
        data_collator=data_collator,         # data collator
    )

    # Train the model
    trainer.train()

# Load the dataset for multi-language training
# Load and prepare dataset for the selected language
def load_and_prepare_dataset(language='en'):
    if language == 'tr':
        dataset = load_dataset("adoxalim/Tr-En-Translator")  # Dataset for Turkish to English
        dataset = dataset.map(lambda x: {"text": x["Turkish"]}, remove_columns=["English", "Turkish"])  # Extract Turkish text
    elif language == 'en':
        dataset = load_dataset("adoxalim/Tr-En-Translator")  # English part for fine-tuning in English
        dataset = dataset.map(lambda x: {"text": x["English"]}, remove_columns=["English", "Turkish"])  # Extract English text
    elif language == 'es':
        # Example: You would need a Spanish dataset to fine-tune a Spanish model
        # This is a placeholder. Load an appropriate dataset for Spanish.
        dataset = load_dataset("wmt14", "es-en")  # Example: WMT Spanish-English dataset
        dataset = dataset.map(lambda x: {"text": x["translation"]["es"]}, remove_columns=["translation"])
    else:
        print("Dataset for the selected language is not available.")
        return None

    return dataset


# Function to predict the masked word in the sentence
def predict_masked_word(input_text, language='en'):
    # Load the model and tokenizer based on the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        return "Error loading model for the selected language."
    
    # Replace '___' in the input with [MASK]
    input_text = input_text.replace('___', '[MASK]')  # Replace placeholder ___ with [MASK]
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="tf")
    
    # Get model predictions
    outputs = model(**inputs)
    
    # Get the logits for the masked token
    logits = outputs.logits

    # Find all indices of the [MASK] token(s)
    masked_indices = tf.where(inputs.input_ids == tokenizer.mask_token_id)
    
    # If no [MASK] token found, return an error message
    if len(masked_indices) == 0:
        return "No [MASK] token found in the input sentence."
    
    # Process each masked token (in case there are multiple [MASK] tokens)
    predicted_words = []
    for masked_index in masked_indices.numpy():
        # Convert the index to a token ID and predict the word
        predicted_token_id = tf.argmax(logits[0, masked_index[0]]).numpy()
        predicted_word = tokenizer.decode([predicted_token_id])
        predicted_words.append(predicted_word)

    return predicted_words

# Main function to fine-tune on multiple languages
def main():
    language = input("Enter language code (en for English, es for Spanish, tr for Turkish): ")
    
    # Load model and tokenizer for the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error: Model could not be loaded.")
        return
    
    # Load and prepare dataset for the selected language
    dataset = load_and_prepare_dataset(language)
    
    if dataset is None:
        print("Error: Dataset could not be loaded.")
        return
    
    # Tokenize and preprocess the dataset
    tokenized_datasets = tokenize_and_mask(dataset, tokenizer)

    # Fine-tune the model on the dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    # Get user input for sentence and predict masked word
    input_text = input("Enter a sentence with '___' where the word is missing: ")
    
    # Call the function to predict the masked word
    predicted_word = predict_masked_word(input_text, language)

    # Output the prediction
    print(f"The predicted word(s) for the masked token(s) is/are: {predicted_word}")

if __name__ == "__main__":
    main()


All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`

In [8]:
import os
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments
import torch
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset

# Function to load the appropriate model based on the selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Preprocess dataset for fine-tuning
def preprocess_dataset(dataset, tokenizer):
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, padding=True)

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    return tokenized_datasets

# Function to fine-tune the model on the dataset
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results'):
    training_args = TrainingArguments(
        output_dir=output_dir,          # output directory
        num_train_epochs=3,             # number of training epochs
        per_device_train_batch_size=8,  # batch size for training
        per_device_eval_batch_size=8,   # batch size for evaluation
        warmup_steps=500,               # number of warmup steps for learning rate scheduler
        weight_decay=0.01,              # strength of weight decay
        logging_dir='./logs',           # directory for storing logs
        logging_steps=10,
        evaluation_strategy="epoch",    # Evaluate at the end of each epoch
        save_strategy="epoch"           # Save checkpoint after each epoch
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    trainer = Trainer(
        model=model,                         # the model to be trained
        args=training_args,                  # training arguments
        data_collator=data_collator,         # data collator
        train_dataset=tokenized_datasets['train'],    # training dataset
        eval_dataset=tokenized_datasets['test']      # evaluation dataset
    )

    trainer.train()

# Main code execution
def main():
    # Load the dataset (for multilingual translation, use appropriate dataset)
    dataset_name = "adoxalim/Tr-En-Translator"
    dataset = load_dataset(dataset_name)

    # Select the language you want to fine-tune on
    language = 'en'  # Change to 'es' for Spanish or other supported languages
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error loading model!")
        return
    
    # Preprocess the dataset
    tokenized_datasets = preprocess_dataset(dataset, tokenizer)

    # Fine-tune the model on the tokenized dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    print(f"Model fine-tuned and saved in {os.path.join(os.getcwd(), 'results')}.")

# Call the main function
if __name__ == "__main__":
    main()


BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another archite

KeyError: 'text'

In [20]:
import os
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam

# Function to load the appropriate model based on selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'tr':
        model_name = "dbmdz/bert-base-turkish-cased"  # Turkish BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Preprocess dataset for fine-tuning
def preprocess_dataset(dataset, tokenizer, language='tr'):
    column_name = "Turkish" if language == 'tr' else "English"  # Choose appropriate column for language

    def tokenize_function(examples):
        return tokenizer(examples[column_name], truncation=True, padding=True)

    # Tokenize the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Masking some tokens in the input for MLM task (randomly replace words with [MASK])
    def mask_tokens(examples):
        # Mask some of the input tokens for the MLM task
        input_ids = examples['input_ids']
        labels = input_ids.copy()

        # Mask tokens (randomly) in the input for MLM
        # We'll mask 15% of the tokens in each sentence
        mask_prob = 0.15
        masked_indices = [i for i, token in enumerate(input_ids) if token != tokenizer.pad_token_id]
        num_to_mask = int(len(masked_indices) * mask_prob)

        # Randomly select tokens to mask
        mask_indices = tf.random.shuffle(masked_indices)[:num_to_mask]

        for idx in mask_indices:
            input_ids[idx] = tokenizer.mask_token_id
            labels[idx] = input_ids[idx]  # Set the masked tokens as labels

        # Set non-masked tokens to -100, which means they won't contribute to the loss
        labels = [-100 if token == tokenizer.pad_token_id or token == tokenizer.mask_token_id else token for token in labels]

        return {'input_ids': input_ids, 'labels': labels, 'attention_mask': examples['attention_mask']}

    tokenized_datasets = tokenized_datasets.map(mask_tokens, batched=True)

    return tokenized_datasets

# Fine-tune the model using TensorFlow
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results', num_epochs=3, batch_size=8):
    # Prepare TensorFlow dataset for training
    train_dataset = tokenized_datasets['train'].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['labels'],  # Use 'labels' for MLM task
        batch_size=batch_size,
    )

    # Check if 'test' split exists
    eval_dataset = tokenized_datasets.get('test', None)
    
    # If 'test' split doesn't exist, use 'validation' if available
    if eval_dataset is None:
        eval_dataset = tokenized_datasets.get('validation', None)
    
    # If no evaluation dataset, print warning and skip evaluation
    if eval_dataset is None:
        print("Warning: No 'test' or 'validation' split found. Skipping evaluation.")

    # If an eval dataset exists, prepare it
    if eval_dataset:
        eval_dataset = eval_dataset.to_tf_dataset(
            columns=['input_ids', 'attention_mask'],
            label_cols=['labels'],
            batch_size=batch_size,
        )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=5e-5), loss=model.compute_loss)

    # Train the model
    model.fit(train_dataset, epochs=num_epochs, validation_data=eval_dataset)

    # Save the fine-tuned model
    model.save_pretrained(output_dir)

# Function to predict the masked word in the sentence
def predict_masked_word(input_text, language='en'):
    # Load the model and tokenizer based on the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        return "Error loading model for the selected language."
    
    # Replace '___' in the input with [MASK]
    input_text = input_text.replace('___', '[MASK]')  # Replace placeholder ___ with [MASK]
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="tf")
    
    # Get model predictions
    outputs = model(**inputs)
    
    # Get the logits for the masked token
    logits = outputs.logits

    # Find all indices of the [MASK] token(s)
    masked_indices = tf.where(inputs.input_ids == tokenizer.mask_token_id)
    
    # If no [MASK] token found, return an error message
    if len(masked_indices) == 0:
        return "No [MASK] token found in the input sentence."
    
    # Process each masked token (in case there are multiple [MASK] tokens)
    predicted_words = []
    for masked_index in masked_indices.numpy():
        # Convert the index to a token ID and predict the word
        predicted_token_id = tf.argmax(logits[0, masked_index[0]]).numpy()
        predicted_word = tokenizer.decode([predicted_token_id])
        predicted_words.append(predicted_word)

    return predicted_words

# Main code execution
def main():
    # Load the dataset (for multilingual translation, use appropriate dataset)
    dataset_name = "adoxalim/Tr-En-Translator"
    dataset = load_dataset(dataset_name)

    # Select the language you want to fine-tune on
    language = 'tr'  # Change to 'en' for English or other supported languages
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error loading model!")
        return
    
    # Preprocess the dataset
    tokenized_datasets = preprocess_dataset(dataset, tokenizer, language)

    # Fine-tune the model on the tokenized dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    print(f"Model fine-tuned and saved in {os.path.join(os.getcwd(), 'results')}.")
    
    # Get user input for sentence and language selection
    input_text = input("Enter a sentence with '___' where the word is missing: ")
    language = input("Enter language code (en for English, tr for Turkish, es for Spanish): ")

    # Call the function to predict the masked word
    predicted_word = predict_masked_word(input_text, language)

    # Output the prediction
    print(f"The predicted word(s) for the masked token(s) is/are: {predicted_word}")

# Call the main function
if __name__ == "__main__":
    main()


All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
Map:   0%|          | 0/473035 [00:00<?, ? examples/s]


ArrowInvalid: cannot mix list and non-list, non-null values

In [21]:
import os
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam

# Function to load the appropriate model based on selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'tr':
        model_name = "dbmdz/bert-base-turkish-cased"  # Turkish BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Preprocess dataset for fine-tuning
def preprocess_dataset(dataset, tokenizer, language='tr'):
    column_name = "Turkish" if language == 'tr' else "English"  # Choose appropriate column for language

    def tokenize_function(examples):
        return tokenizer(examples[column_name], truncation=True, padding=True)

    # Tokenize the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Masking some tokens in the input for MLM task (randomly replace words with [MASK])
    def mask_tokens(examples):
        input_ids = examples['input_ids']
        labels = input_ids.copy()

        # Mask tokens (randomly) in the input for MLM
        mask_prob = 0.15
        masked_indices = [i for i, token in enumerate(input_ids) if token != tokenizer.pad_token_id]
        num_to_mask = int(len(masked_indices) * mask_prob)

        # Randomly select tokens to mask
        mask_indices = tf.random.shuffle(masked_indices)[:num_to_mask]

        for idx in mask_indices:
            input_ids[idx] = tokenizer.mask_token_id
            labels[idx] = input_ids[idx]  # Set the masked tokens as labels

        # Set non-masked tokens to -100, which means they won't contribute to the loss
        labels = [-100 if token == tokenizer.pad_token_id or token == tokenizer.mask_token_id else token for token in labels]

        return {'input_ids': input_ids, 'labels': labels, 'attention_mask': examples['attention_mask']}

    tokenized_datasets = tokenized_datasets.map(mask_tokens, batched=True)

    return tokenized_datasets

# Fine-tune the model using TensorFlow
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results', num_epochs=3, batch_size=8):
    # Prepare TensorFlow dataset for training
    train_dataset = tokenized_datasets['train'].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['labels'],  # Use 'labels' for MLM task
        batch_size=batch_size,
    )

    # Prepare evaluation dataset if available
    eval_dataset = tokenized_datasets.get('test', None)
    if eval_dataset is None:
        eval_dataset = tokenized_datasets.get('validation', None)
    
    if eval_dataset:
        eval_dataset = eval_dataset.to_tf_dataset(
            columns=['input_ids', 'attention_mask'],
            label_cols=['labels'],
            batch_size=batch_size,
        )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=5e-5), loss=model.compute_loss)

    # Train the model
    model.fit(train_dataset, epochs=num_epochs, validation_data=eval_dataset)

    # Save the fine-tuned model
    model.save_pretrained(output_dir)

# Function to predict the masked word in the sentence
def predict_masked_word(input_text, language='en'):
    # Load the model and tokenizer based on the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        return "Error loading model for the selected language."
    
    # Replace '___' in the input with [MASK]
    input_text = input_text.replace('___', '[MASK]')  # Replace placeholder ___ with [MASK]
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="tf")
    
    # Get model predictions
    outputs = model(**inputs)
    
    # Get the logits for the masked token
    logits = outputs.logits

    # Find all indices of the [MASK] token(s)
    masked_indices = tf.where(inputs.input_ids == tokenizer.mask_token_id)
    
    # If no [MASK] token found, return an error message
    if len(masked_indices) == 0:
        return "No [MASK] token found in the input sentence."
    
    # Process each masked token (in case there are multiple [MASK] tokens)
    predicted_words = []
    for masked_index in masked_indices.numpy():
        # Convert the index to a token ID and predict the word
        predicted_token_id = tf.argmax(logits[0, masked_index[0]]).numpy()
        predicted_word = tokenizer.decode([predicted_token_id])
        predicted_words.append(predicted_word)

    return predicted_words

# Main code execution
def main():
    # Load the dataset (for multilingual translation, use appropriate dataset)
    dataset_name = "adoxalim/Tr-En-Translator"  # Example dataset (change to your dataset)
    dataset = load_dataset(dataset_name)

    # Select the language you want to fine-tune on
    language = 'tr'  # Change to 'en' for English or other supported languages
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error loading model!")
        return
    
    # Preprocess the dataset
    tokenized_datasets = preprocess_dataset(dataset, tokenizer, language)

    # Fine-tune the model on the tokenized dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    print(f"Model fine-tuned and saved in {os.path.join(os.getcwd(), 'results')}.")
    
    # Get user input for sentence and language selection
    input_text = input("Enter a sentence with '___' where the word is missing: ")
    language = input("Enter language code (en for English, tr for Turkish, es for Spanish): ")

    # Call the function to predict the masked word
    predicted_word = predict_masked_word(input_text, language)

    # Output the prediction
    print(f"The predicted word(s) for the masked token(s) is/are: {predicted_word}")

# Call the main function
if __name__ == "__main__":
    main()


All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
Map:   0%|          | 0/473035 [00:00<?, ? examples/s]


ArrowInvalid: cannot mix list and non-list, non-null values

In [22]:
import os
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam

# Function to load the appropriate model based on selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'tr':
        model_name = "dbmdz/bert-base-turkish-cased"  # Turkish BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Preprocess dataset for fine-tuning
def preprocess_dataset(dataset, tokenizer, language='tr'):
    column_name = "Turkish" if language == 'tr' else "English"  # Choose appropriate column for language

    def tokenize_function(examples):
        # Ensure that the column is correctly handled as a list of strings
        if isinstance(examples[column_name], str):
            examples[column_name] = [examples[column_name]]  # Ensure it is a list
        return tokenizer(examples[column_name], truncation=True, padding=True)

    # Tokenize the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Masking some tokens in the input for MLM task (randomly replace words with [MASK])
    def mask_tokens(examples):
        input_ids = examples['input_ids']
        labels = input_ids.copy()

        # Mask tokens (randomly) in the input for MLM
        mask_prob = 0.15
        masked_indices = [i for i, token in enumerate(input_ids) if token != tokenizer.pad_token_id]
        num_to_mask = int(len(masked_indices) * mask_prob)

        # Randomly select tokens to mask
        mask_indices = tf.random.shuffle(masked_indices)[:num_to_mask]

        for idx in mask_indices:
            input_ids[idx] = tokenizer.mask_token_id
            labels[idx] = input_ids[idx]  # Set the masked tokens as labels

        # Set non-masked tokens to -100, which means they won't contribute to the loss
        labels = [-100 if token == tokenizer.pad_token_id or token == tokenizer.mask_token_id else token for token in labels]

        return {'input_ids': input_ids, 'labels': labels, 'attention_mask': examples['attention_mask']}

    tokenized_datasets = tokenized_datasets.map(mask_tokens, batched=True)

    return tokenized_datasets

# Fine-tune the model using TensorFlow
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results', num_epochs=3, batch_size=8):
    # Prepare TensorFlow dataset for training
    train_dataset = tokenized_datasets['train'].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['labels'],  # Use 'labels' for MLM task
        batch_size=batch_size,
    )

    # Check if 'test' split exists
    eval_dataset = tokenized_datasets.get('test', None)
    
    # If 'test' split doesn't exist, use 'validation' if available
    if eval_dataset is None:
        eval_dataset = tokenized_datasets.get('validation', None)
    
    # If no evaluation dataset, print warning and skip evaluation
    if eval_dataset is None:
        print("Warning: No 'test' or 'validation' split found. Skipping evaluation.")

    # If an eval dataset exists, prepare it
    if eval_dataset:
        eval_dataset = eval_dataset.to_tf_dataset(
            columns=['input_ids', 'attention_mask'],
            label_cols=['labels'],
            batch_size=batch_size,
        )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=5e-5), loss=model.compute_loss)

    # Train the model
    model.fit(train_dataset, epochs=num_epochs, validation_data=eval_dataset)

    # Save the fine-tuned model
    model.save_pretrained(output_dir)

# Function to predict the masked word in the sentence
def predict_masked_word(input_text, language='en'):
    # Load the model and tokenizer based on the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        return "Error loading model for the selected language."
    
    # Replace '___' in the input with [MASK]
    input_text = input_text.replace('___', '[MASK]')  # Replace placeholder ___ with [MASK]
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="tf")
    
    # Get model predictions
    outputs = model(**inputs)
    
    # Get the logits for the masked token
    logits = outputs.logits

    # Find all indices of the [MASK] token(s)
    masked_indices = tf.where(inputs.input_ids == tokenizer.mask_token_id)
    
    # If no [MASK] token found, return an error message
    if len(masked_indices) == 0:
        return "No [MASK] token found in the input sentence."
    
    # Process each masked token (in case there are multiple [MASK] tokens)
    predicted_words = []
    for masked_index in masked_indices.numpy():
        # Convert the index to a token ID and predict the word
        predicted_token_id = tf.argmax(logits[0, masked_index[0]]).numpy()
        predicted_word = tokenizer.decode([predicted_token_id])
        predicted_words.append(predicted_word)

    return predicted_words

# Main code execution
def main():
    # Load the dataset (for multilingual translation, use appropriate dataset)
    dataset_name = "adoxalim/Tr-En-Translator"
    dataset = load_dataset(dataset_name)

    # Select the language you want to fine-tune on
    language = 'tr'  # Change to 'en' for English or other supported languages
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error loading model!")
        return
    
    # Preprocess the dataset
    tokenized_datasets = preprocess_dataset(dataset, tokenizer, language)

    # Fine-tune the model on the tokenized dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    print(f"Model fine-tuned and saved in {os.path.join(os.getcwd(), 'results')}.")
    
    # Get user input for sentence and language selection
    input_text = input("Enter a sentence with '___' where the word is missing: ")
    language = input("Enter language code (en for English, tr for Turkish, es for Spanish): ")

    # Call the function to predict the masked word
    predicted_word = predict_masked_word(input_text, language)

    # Output the prediction
    print(f"The predicted word(s) for the masked token(s) is/are: {predicted_word}")

# Call the main function
if __name__ == "__main__":
    main()


All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
Map: 100%|██████████| 473035/473035 [00:09<00:00, 48168.33 examples/s]
Map:   0%|          | 0/473035 [00:00<?, ? examples/s]


ArrowInvalid: cannot mix list and non-list, non-null values

In [23]:
import os
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam

# Function to load the appropriate model based on selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'tr':
        model_name = "dbmdz/bert-base-turkish-cased"  # Turkish BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Preprocess dataset for fine-tuning
def preprocess_dataset(dataset, tokenizer, language='tr'):
    column_name = "Turkish" if language == 'tr' else "English"  # Choose appropriate column for language

    def tokenize_function(examples):
        # Ensure that the column is correctly handled as a list of strings
        if isinstance(examples[column_name], str):
            examples[column_name] = [examples[column_name]]  # Ensure it is a list of strings
        return tokenizer(examples[column_name], truncation=True, padding=True)

    # Tokenize the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Masking some tokens in the input for MLM task (randomly replace words with [MASK])
    def mask_tokens(examples):
        input_ids = examples['input_ids']
        labels = input_ids.copy()

        # Mask tokens (randomly) in the input for MLM
        mask_prob = 0.15
        masked_indices = [i for i, token in enumerate(input_ids) if token != tokenizer.pad_token_id]
        num_to_mask = int(len(masked_indices) * mask_prob)

        # Randomly select tokens to mask
        mask_indices = tf.random.shuffle(masked_indices)[:num_to_mask]

        for idx in mask_indices:
            input_ids[idx] = tokenizer.mask_token_id
            labels[idx] = input_ids[idx]  # Set the masked tokens as labels

        # Set non-masked tokens to -100, which means they won't contribute to the loss
        labels = [-100 if token == tokenizer.pad_token_id or token == tokenizer.mask_token_id else token for token in labels]

        return {'input_ids': input_ids, 'labels': labels, 'attention_mask': examples['attention_mask']}

    tokenized_datasets = tokenized_datasets.map(mask_tokens, batched=True)

    return tokenized_datasets

# Fine-tune the model using TensorFlow
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results', num_epochs=3, batch_size=8):
    # Prepare TensorFlow dataset for training
    train_dataset = tokenized_datasets['train'].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['labels'],  # Use 'labels' for MLM task
        batch_size=batch_size,
    )

    # Check if 'test' split exists
    eval_dataset = tokenized_datasets.get('test', None)
    
    # If 'test' split doesn't exist, use 'validation' if available
    if eval_dataset is None:
        eval_dataset = tokenized_datasets.get('validation', None)
    
    # If no evaluation dataset, print warning and skip evaluation
    if eval_dataset is None:
        print("Warning: No 'test' or 'validation' split found. Skipping evaluation.")

    # If an eval dataset exists, prepare it
    if eval_dataset:
        eval_dataset = eval_dataset.to_tf_dataset(
            columns=['input_ids', 'attention_mask'],
            label_cols=['labels'],
            batch_size=batch_size,
        )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=5e-5), loss=model.compute_loss)

    # Train the model
    model.fit(train_dataset, epochs=num_epochs, validation_data=eval_dataset)

    # Save the fine-tuned model
    model.save_pretrained(output_dir)

# Function to predict the masked word in the sentence
def predict_masked_word(input_text, language='en'):
    # Load the model and tokenizer based on the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        return "Error loading model for the selected language."
    
    # Replace '___' in the input with [MASK]
    input_text = input_text.replace('___', '[MASK]')  # Replace placeholder ___ with [MASK]
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="tf")
    
    # Get model predictions
    outputs = model(**inputs)
    
    # Get the logits for the masked token
    logits = outputs.logits

    # Find all indices of the [MASK] token(s)
    masked_indices = tf.where(inputs.input_ids == tokenizer.mask_token_id)
    
    # If no [MASK] token found, return an error message
    if len(masked_indices) == 0:
        return "No [MASK] token found in the input sentence."
    
    # Process each masked token (in case there are multiple [MASK] tokens)
    predicted_words = []
    for masked_index in masked_indices.numpy():
        # Convert the index to a token ID and predict the word
        predicted_token_id = tf.argmax(logits[0, masked_index[0]]).numpy()
        predicted_word = tokenizer.decode([predicted_token_id])
        predicted_words.append(predicted_word)

    return predicted_words

# Main code execution
def main():
    # Load the dataset (for multilingual translation, use appropriate dataset)
    dataset_name = "adoxalim/Tr-En-Translator"
    dataset = load_dataset(dataset_name)

    # Select the language you want to fine-tune on
    language = 'tr'  # Change to 'en' for English or other supported languages
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error loading model!")
        return
    
    # Preprocess the dataset
    tokenized_datasets = preprocess_dataset(dataset, tokenizer, language)

    # Fine-tune the model on the tokenized dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    print(f"Model fine-tuned and saved in {os.path.join(os.getcwd(), 'results')}.")
    
    # Get user input for sentence and language selection
    input_text = input("Enter a sentence with '___' where the word is missing: ")
    language = input("Enter language code (en for English, tr for Turkish, es for Spanish): ")

    # Call the function to predict the masked word
    predicted_word = predict_masked_word(input_text, language)

    # Output the prediction
    print(f"The predicted word(s) for the masked token(s) is/are: {predicted_word}")

# Call the main function
if __name__ == "__main__":
    main()


All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
Map:   0%|          | 0/473035 [00:00<?, ? examples/s]


ArrowInvalid: cannot mix list and non-list, non-null values

In [25]:
import os
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForMaskedLM
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam

# Function to load the appropriate model based on selected language
def load_model(language='en'):
    if language == 'en':
        model_name = "bert-base-uncased"  # English BERT model
    elif language == 'tr':
        model_name = "dbmdz/bert-base-turkish-cased"  # Turkish BERT model
    elif language == 'es':
        model_name = "dccuchile/bert-base-spanish-wwm-uncased"  # Spanish BERT model
    else:
        print("Language not supported!")
        return None, None
    
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForMaskedLM.from_pretrained(model_name)
    return tokenizer, model

# Preprocess dataset for fine-tuning
def preprocess_dataset(dataset, tokenizer, language='tr'):
    column_name = "Turkish" if language == 'tr' else "English"  # Choose appropriate column for language

    def tokenize_function(examples):
        # Ensure that the column is correctly handled as a list of strings
        if isinstance(examples[column_name], str):
            examples[column_name] = [examples[column_name]]  # Ensure it is a list of strings
        return tokenizer(examples[column_name], truncation=True, padding=True)

    # Tokenize the dataset
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Masking some tokens in the input for MLM task (randomly replace words with [MASK])
    def mask_tokens(examples):
        input_ids = examples['input_ids']
        labels = input_ids.copy()

        # Mask tokens (randomly) in the input for MLM
        mask_prob = 0.15
        masked_indices = [i for i, token in enumerate(input_ids) if token != tokenizer.pad_token_id]
        num_to_mask = int(len(masked_indices) * mask_prob)

        # Randomly select tokens to mask
        mask_indices = tf.random.shuffle(masked_indices)[:num_to_mask]

        for idx in mask_indices:
            input_ids[idx] = tokenizer.mask_token_id
            labels[idx] = input_ids[idx]  # Set the masked tokens as labels

        # Set non-masked tokens to -100, which means they won't contribute to the loss
        labels = [-100 if token == tokenizer.pad_token_id or token == tokenizer.mask_token_id else token for token in labels]

        return {'input_ids': input_ids, 'labels': labels, 'attention_mask': examples['attention_mask']}

    tokenized_datasets = tokenized_datasets.map(mask_tokens, batched=True)

    return tokenized_datasets

# Fine-tune the model using TensorFlow
def fine_tune_model(model, tokenized_datasets, tokenizer, output_dir='./results', num_epochs=3, batch_size=8):
    # Prepare TensorFlow dataset for training
    train_dataset = tokenized_datasets['train'].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['labels'],  # Use 'labels' for MLM task
        batch_size=batch_size,
    )

    # Check if 'test' split exists
    eval_dataset = tokenized_datasets.get('test', None)
    
    # If 'test' split doesn't exist, use 'validation' if available
    if eval_dataset is None:
        eval_dataset = tokenized_datasets.get('validation', None)
    
    # If no evaluation dataset, print warning and skip evaluation
    if eval_dataset is None:
        print("Warning: No 'test' or 'validation' split found. Skipping evaluation.")

    # If an eval dataset exists, prepare it
    if eval_dataset:
        eval_dataset = eval_dataset.to_tf_dataset(
            columns=['input_ids', 'attention_mask'],
            label_cols=['labels'],
            batch_size=batch_size,
        )

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=5e-5), loss=model.compute_loss)

    # Train the model
    model.fit(train_dataset, epochs=num_epochs, validation_data=eval_dataset)

    # Save the fine-tuned model
    model.save_pretrained(output_dir)

# Function to predict the masked word in the sentence
def predict_masked_word(input_text, language='en'):
    # Load the model and tokenizer based on the selected language
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        return "Error loading model for the selected language."
    
    # Replace '___' in the input with [MASK]
    input_text = input_text.replace('___', '[MASK]')  # Replace placeholder ___ with [MASK]
    
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="tf")
    
    # Get model predictions
    outputs = model(**inputs)
    
    # Get the logits for the masked token
    logits = outputs.logits

    # Find all indices of the [MASK] token(s)
    masked_indices = tf.where(inputs.input_ids == tokenizer.mask_token_id)
    
    # If no [MASK] token found, return an error message
    if len(masked_indices) == 0:
        return "No [MASK] token found in the input sentence."
    
    # Process each masked token (in case there are multiple [MASK] tokens)
    predicted_words = []
    for masked_index in masked_indices.numpy():
        # Convert the index to a token ID and predict the word
        predicted_token_id = tf.argmax(logits[0, masked_index[0]]).numpy()
        predicted_word = tokenizer.decode([predicted_token_id])
        predicted_words.append(predicted_word)

    return predicted_words

# Main code execution
def main():
    # Load the dataset (for multilingual translation, use appropriate dataset)
    dataset_name = "adoxalim/Tr-En-Translator"
    dataset = load_dataset(dataset_name)

    # Select the language you want to fine-tune on
    language = 'tr'  # Change to 'en' for English or other supported languages
    tokenizer, model = load_model(language)
    
    if not tokenizer or not model:
        print("Error loading model!")
        return
    
    # Preprocess the dataset
    tokenized_datasets = preprocess_dataset(dataset, tokenizer, language)

    # Fine-tune the model on the tokenized dataset
    fine_tune_model(model, tokenized_datasets, tokenizer)

    print(f"Model fine-tuned and saved in {os.path.join(os.getcwd(), 'results')}.")
    
    # Get user input for sentence and language selection
    input_text = input("Enter a sentence with '___' where the word is missing: ")
    language = input("Enter language code (en for English, tr for Turkish, es for Spanish): ")

    # Call the function to predict the masked word
    predicted_word = predict_masked_word(input_text, language)

    # Output the prediction
    print(f"The predicted word(s) for the masked token(s) is/are: {predicted_word}")

# Call the main function
if __name__ == "__main__":
    main()


All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
Map:   0%|          | 0/473035 [00:00<?, ? examples/s]


ArrowInvalid: cannot mix list and non-list, non-null values