In [1]:
# !pip install unbabel-comet

In [2]:
# !pip install matplotlib 

In [3]:
# !pip install seaborn

In [4]:
# !pip install mlflow --ignore-installed embedchain

In [5]:
# !pip install optuna-integration lightning 

In [6]:
# !pip install accelerate 

In [7]:
# !pip install torch transformers datasets peft 

In [8]:
# !pip install nltk 

In [9]:
# !pip install plotly

In [10]:
# !pip install -U kaleido

In [127]:
# Import necessary libraries and modules
import os
os.environ['GIT_PYTHON_REFRESH'] = 'quiet'  # Suppress Git warnings
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # Enable CUDA launch blocking for debugging
os.environ['TORCH_USE_CUDA_DSA'] = "1"  # Enable CUDA device-side assertions
os.environ['MLFLOW_FLATTEN_PARAMS'] = 'true' # Flatten parameters parameters for logging
os.environ['WANDB_DISABLED'] = 'true'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress info messages
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # Disable oneDNN custom operations

import sys
sys.path.append('../py')  # Add the parent directory to the Python path
import gc
import torch
print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"Current GPU memory usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import mlflow
import optuna
import json
import time
import traceback
from IPython.display import Image, display
from accelerate import Accelerator, DistributedDataParallelKwargs
from packaging import version
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from dataset_loader import DatasetLoader
from utils import get_model, set_seed, load_config, get_device, CustomDataset
from models.llama2_decoder import Llama2Decoder  # Import Llama2Decoder model
from models.ernie_m import ErnieM  # Import ErnieM model
from evaluators.africomet_evaluator import AfriCOMETEvaluator
from classifiers.zeroshot_classifier import ZeroShotClassifier
from classifiers.codeswitch_classifier import CodeSwitchClassifier
from trainers.encoder_decoder_trainer import EncoderDecoderTrainer
from trainers.combined_encoder_decoder_trainer import CombinedEncoderDecoderTrainer
from hyperparameter_analysis import (plot_hyperparameter_importance, plot_study_optimization_history,
                                     plot_parallel_coordinate, analyze_hyperparameter_sensitivity,
                                     plot_sensitivity_analysis)

Available GPU memory: 85.06 GB
Current GPU memory usage: 59.00 GB


In [12]:
def setup_logging(config):
    """
    Set up logging configuration based on the provided config.
    
    This function initializes the logging system with the specified log level,
    format, and output file from the configuration.

    Args:
        config (dict): A dictionary containing logging configuration.

    Returns:
        logging.Logger: Configured logger object.
    """
    logging.basicConfig(
        level=getattr(logging, config['logging']['log_level']),
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        filename=config['logging']['log_file']
    )
    return logging.getLogger(__name__)

In [13]:
def clear_memory():
    """
    Clear unused memory to prevent out-of-memory errors.
    
    This function uses Python's garbage collector and PyTorch's CUDA memory 
    cache clearing (if available) to free up memory.
    """
    gc.collect()
    torch.cuda.empty_cache()
    logging.info(f"Cleared GPU memory. Current allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

In [15]:
def load_datasets(data_loader):
    """
    Load datasets using the provided DatasetLoader object.
    
    This function attempts to load all datasets specified in the configuration
    using the DatasetLoader. It includes error handling for common issues.

    Args:
        data_loader (DatasetLoader): An instance of the DatasetLoader class.

    Returns:
        dict or None: A dictionary of loaded datasets, or None if loading fails.
    """
    try:
        logging.info("Loading and preparing datasets...")
        return data_loader.load_datasets()
    except Exception as e:
        logging.error(f"Error loading datasets: {str(e)}")
        return None

In [16]:
def objective_combined(trial, encoder, decoder, encoder_tokenizer, decoder_tokenizer, train_dataset, eval_dataset, evaluator):
    """
    Objective function for hyperparameter optimization of the combined Afro-XLMR and LLaMA model, including intent recognition and slot filling tasks.

    Args:
        trial (optuna.trial.Trial): The Optuna trial object used for hyperparameter suggestions.
        encoder (torch.nn.Module): The Afro-XLMR encoder model to be trained and evaluated.
        decoder (torch.nn.Module): The LLaMA decoder model to be trained and evaluated.
        encoder_tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for the encoder.
        decoder_tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for the decoder.
        train_dataset (Dataset): The dataset used for training the model.
        eval_dataset (Dataset): The dataset used for evaluating the model.
        evaluator (Evaluator): The evaluator object used to compute evaluation metrics.

    Returns:
        float: The evaluation metric to be minimized (lower is better).
    """
    # Initialize Accelerator
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(mixed_precision='no', kwargs_handlers=[ddp_kwargs])
    
    # Hyperparameter suggestions based on config
    try:
        hyperparams = config.get('hyperparameters', {})
        
        # Learning rates
        lr_min = float(hyperparams.get('learning_rate_min', 1e-6))
        lr_max = float(hyperparams.get('learning_rate_max', 1e-4))
        
        if lr_min >= lr_max:
            logging.error(f"Invalid learning rate range: min ({lr_min}) must be less than max ({lr_max})")
            return float('inf')
        
        encoder_lr = trial.suggest_float('encoder_lr', lr_min, lr_max, log=True)
        decoder_lr = trial.suggest_float('decoder_lr', lr_min, lr_max, log=True)
        
        # Number of training epochs
        num_train_epochs = trial.suggest_int('num_train_epochs', 
                                             int(hyperparams.get('num_train_epochs_min', 1)), 
                                             int(hyperparams.get('num_train_epochs_max', 3)))
        
        per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', 
                                                                hyperparams.get('batch_sizes', [1, 2, 4, 8]))
        
        weight_decay = trial.suggest_float('weight_decay', 
                                           float(hyperparams.get('weight_decay_min', 0.01)), 
                                           float(hyperparams.get('weight_decay_max', 0.1)), 
                                           log=True)
        
        warmup_steps = trial.suggest_int('warmup_steps', 
                                         int(hyperparams.get('warmup_steps_min', 0)), 
                                         int(hyperparams.get('warmup_steps_max', 1000)))
        
        gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps', 
                                                                hyperparams.get('gradient_accumulation_steps', [1, 2, 4, 8]))
        
        # Add hyperparameters for intent and slot tasks
        intent_loss_weight = trial.suggest_float('intent_loss_weight', 0.1, 1.0)
        slot_loss_weight = trial.suggest_float('slot_loss_weight', 0.1, 1.0)

        # Create trial config
        trial_config = {
            "encoder_lr": encoder_lr,
            "decoder_lr": decoder_lr,
            "num_train_epochs": num_train_epochs,
            "per_device_train_batch_size": per_device_train_batch_size,
            "per_device_eval_batch_size": per_device_train_batch_size,
            "weight_decay": weight_decay,
            "warmup_steps": warmup_steps,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "fp16": False,
            "evaluation_strategy": "steps",
            "eval_steps": 200,
            "save_steps": 200,
            "logging_steps": 50,
            "max_grad_norm": 1.0,
            "output_dir": config['model']['output_dir'],
            "seed": config['seed'],
            "device": config['device'],
            "cache_dir": config['cache']['dir'],
            "gradient_checkpointing": True,  
            "intent_loss_weight": intent_loss_weight, 
            "slot_loss_weight": slot_loss_weight,  
            "num_intent_classes": 50,  
            "num_slot_classes": 100  
        }
    
        logging.info(f"Trial {trial.number}: Starting with hyperparameters: {trial_config}")
        
    except ValueError as e:
        logging.error(f"Error converting config values to numeric types: {str(e)}")
        return float('inf')
    except Exception as e:
        logging.error(f"Error in hyperparameter suggestion: {str(e)}")
        return float('inf')

    # Enable gradient checkpointing for both encoder and decoder
    encoder.gradient_checkpointing_enable()
    decoder.gradient_checkpointing_enable()
    
    # Initialize the trainer
    trainer = CombinedEncoderDecoderTrainer(encoder, decoder, encoder_tokenizer, decoder_tokenizer, config=trial_config, accelerator=accelerator, batch_size=64)

    # Prepare models and optimizers
    trainer.encoder, trainer.decoder, trainer.projection, trainer.encoder_optimizer, trainer.decoder_optimizer = accelerator.prepare(
        trainer.encoder, trainer.decoder, trainer.projection, trainer.encoder_optimizer, trainer.decoder_optimizer
    )

    # Log dataset sizes
    logging.info(f"Train dataset size: {len(train_dataset)}")
    logging.info(f"Eval dataset size: {len(eval_dataset)}")

    # Validate datasets
    trainer.validate_dataset(train_dataset, "Training")
    trainer.validate_dataset(eval_dataset, "Evaluation")

    best_metric = float('inf')  # Initialize for early stopping
    patience_counter = 0
    patience_threshold = trainer.patience  # Use the patience from the trainer configuration

    # Train the combined model
    for epoch in range(num_train_epochs):
        logging.info(f"Starting epoch {epoch + 1}/{num_train_epochs}")
        train_result = trainer.train(train_dataset, eval_dataset)

        # Clear cache after training
        torch.cuda.empty_cache()
        
        # Evaluate the model after each epoch
        try:
            # Use the evaluate() method here
            eval_metrics = trainer.evaluate(eval_dataset)
            
            # Generate translations for FLORES evaluation
            generated_results = trainer.generate_batch(eval_dataset)
            translated_texts, _, _ = zip(*generated_results)  # We only need translated texts here

            # Extract source texts and reference texts for FLORES evaluation
            source_texts = [encoder_tokenizer.decode(eval_dataset[i]['input_ids'], skip_special_tokens=True) for i in range(len(eval_dataset))]
            reference_texts = [encoder_tokenizer.decode(eval_dataset[i]['labels'], skip_special_tokens=True) for i in range(len(eval_dataset))]

            # Evaluate translation with FLORES
            print("Evaluating translation results with Africomet")
            translation_results = evaluator.evaluate(source_texts, translated_texts, reference_texts)
            print("Complete evaluating translation results")

            # Combine all evaluation results
            eval_results = {
                'translation_score': translation_results.get('average_score', 0),
                'intent_accuracy': eval_metrics['intent_accuracy'],
                'slot_f1': eval_metrics['slot_f1'],
                'eval_loss': eval_metrics['eval_loss']
            }

        except Exception as e:
            logging.error(f"Error during evaluation: {str(e)}")
            logging.error(f"Traceback: {traceback.format_exc()}")
            return float('inf')
        
        logging.info(f"Epoch {epoch + 1}/{num_train_epochs} Evaluation results: {eval_results}")

        # Early stopping logic
        current_metric = eval_results['eval_loss']  # Use eval_loss for early stopping
        if current_metric < best_metric:
            best_metric = current_metric
            patience_counter = 0
            # Save the best model state
            best_model_state = {
                'encoder': trainer.encoder.state_dict(),
                'decoder': trainer.decoder.state_dict(),
                'projection': trainer.projection.state_dict()
            }
        else:
            patience_counter += 1
            logging.info(f"Early stopping patience: {patience_counter}/{patience_threshold}")

        if patience_counter >= patience_threshold:
            logging.info("Early stopping triggered.")
            break

    # Load the best model state if it exists
    if 'best_model_state' in locals():
        trainer.encoder.load_state_dict(best_model_state['encoder'])
        trainer.decoder.load_state_dict(best_model_state['decoder'])
        trainer.projection.load_state_dict(best_model_state['projection'])
        logging.info("Loaded best model state")

    # Set trial user attributes for logging
    trial.set_user_attr('intent_accuracy', eval_results['intent_accuracy'])
    trial.set_user_attr('slot_f1', eval_results['slot_f1'])

    # Return the combined metric for optimization (lower is better)
    return eval_results['translation_score'] - (eval_results['intent_accuracy'] + eval_results['slot_f1']) / 2

In [17]:
def run_combined_optimization(encoder, decoder, encoder_tokenizer, decoder_tokenizer, datasets, config, evaluator):
    """
    Run hyperparameter optimization for the combined Afro-XLMR and LLaMA model with improved efficiency, including intent recognition and slot filling tasks.
    """
    mlflow.end_run()  # End any existing runs
    
    logging.info("Starting hyperparameter optimization for combined Afro-XLMR and LLaMA (including intent and slot tasks)")
    log_gpu_memory("Before optimization")

    # Enable gradient checkpointing for both models
    encoder.gradient_checkpointing_enable()
    decoder.gradient_checkpointing_enable()

    # Check PyTorch version
    pytorch_version = version.parse(torch.__version__)
    logger.info(f"PyTorch version: {pytorch_version}")
    
    # Prepare the objective function with fixed arguments and memory management
    def memory_managed_objective(trial):
        clear_memory()
        result = objective_combined(
            trial, encoder, decoder, encoder_tokenizer, decoder_tokenizer,
            datasets['train'], datasets['eval'], evaluator
        )
        clear_memory()
        return result

    total_delay = 0
    n_trials = config['hyperparameters']['n_trials']
    
    try:
        # Set up MLflow
        experiment_name = "combined_optimization"
        try:
            experiment_id = mlflow.create_experiment(experiment_name)
        except mlflow.exceptions.MlflowException:
            experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
        
        mlflow.set_experiment(experiment_name)
        
        with mlflow.start_run(run_name="optimization_combined_with_intent_slot"):
            study = optuna.create_study(direction='minimize')

            for trial_num in range(n_trials):
                start_time = time.time()
                
                study.optimize(
                    memory_managed_objective,
                    n_trials=n_trials, 
                    timeout=3600,
                    catch=(Exception,),
                    n_jobs=1
                )
                
                end_time = time.time()
                trial_time = end_time - start_time
                
                if trial_num < n_trials - 1:  # Don't measure delay after the last trial
                    delay_start = time.time()
                    clear_memory()
                    delay_end = time.time()
                    delay = delay_end - delay_start
                    total_delay += delay
                    logging.info(f"Delay after trial {trial_num + 1}: {delay:.2f} seconds")
                
                logging.info(f"Trial {trial_num + 1} completed in {trial_time:.2f} seconds")
                log_gpu_memory(f"After trial {trial_num + 1}")

                # Log intermediate results
                if study.trials[-1].state == optuna.trial.TrialState.COMPLETE:
                    last_trial = study.trials[-1]
                    logging.info(f"Trial {trial_num + 1} results:")
                    logging.info(f"  Translation score: {last_trial.value:.4f}")
                    
                    intent_accuracy = last_trial.user_attrs.get('intent_accuracy', 'N/A')
                    slot_f1 = last_trial.user_attrs.get('slot_f1', 'N/A')
                    
                    logging.info(f"  Intent accuracy: {intent_accuracy if isinstance(intent_accuracy, str) else f'{intent_accuracy:.4f}'}")
                    logging.info(f"  Slot F1 score: {slot_f1 if isinstance(slot_f1, str) else f'{slot_f1:.4f}'}")
        
            if study.best_trial:
                log_best_params(study)
                
                # Log best results for all tasks
                logging.info("Best trial results:")
                logging.info(f"  Translation score: {study.best_trial.value:.4f}")
                
                best_intent_accuracy = study.best_trial.user_attrs.get('intent_accuracy', 'N/A')
                best_slot_f1 = study.best_trial.user_attrs.get('slot_f1', 'N/A')
                
                logging.info(f"  Intent accuracy: {best_intent_accuracy if isinstance(best_intent_accuracy, str) else f'{best_intent_accuracy:.4f}'}")
                logging.info(f"  Slot F1 score: {best_slot_f1 if isinstance(best_slot_f1, str) else f'{best_slot_f1:.4f}'}")
            else:
                logging.warning("No completed trials found.")
                
    except optuna.exceptions.OptunaError as e:
        logging.error(f"Optuna error during hyperparameter optimization: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Error during hyperparameter optimization: {str(e)}")
        logging.exception("Exception details:")
        return None
    finally:
        avg_delay = total_delay / (n_trials - 1) if n_trials > 1 else 0
        logging.info(f"Average delay between trials: {avg_delay:.2f} seconds")
        clear_memory()
    
    return study

For testing purposes only

In [18]:
# def run_combined_optimization(encoder, decoder, encoder_tokenizer, decoder_tokenizer, datasets, config, evaluator):
#     """
#     Run a single trial with a small batch for testing purposes.
#     """
#     mlflow.end_run()  # End any existing runs
    
#     logging.info("Starting single trial test for combined Afro-XLMR and LLaMA (including intent and slot tasks)")
#     log_gpu_memory("Before optimization")

#     # Enable gradient checkpointing for both models
#     encoder.gradient_checkpointing_enable()
#     decoder.gradient_checkpointing_enable()

#     # Prepare the objective function with fixed arguments and memory management
#     def memory_managed_objective(trial):
#         clear_memory()
#         result = objective_combined(
#             trial, encoder, decoder, encoder_tokenizer, decoder_tokenizer,
#             datasets['train'], datasets['eval'], evaluator
#         )
#         clear_memory()
#         return result

#     try:
#         # Set up MLflow
#         experiment_name = "combined_optimization_test"
#         try:
#             experiment_id = mlflow.create_experiment(experiment_name)
#         except mlflow.exceptions.MlflowException:
#             experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
        
#         mlflow.set_experiment(experiment_name)
        
#         with mlflow.start_run(run_name="single_trial_test"):
#             study = optuna.create_study(direction='minimize')

#             # Run a single trial
#             study.optimize(
#                 memory_managed_objective,
#                 n_trials=1,
#                 timeout=3600,
#                 catch=(Exception,),
#                 n_jobs=1
#             )

#             if study.trials[-1].state == optuna.trial.TrialState.COMPLETE:
#                 last_trial = study.trials[-1]
#                 logging.info("Trial results:")
#                 logging.info(f"  Translation score: {last_trial.value:.4f}")
#                 intent_accuracy = last_trial.user_attrs.get('intent_accuracy', 'N/A')
#                 slot_f1 = last_trial.user_attrs.get('slot_f1', 'N/A')
#                 logging.info(f"  Intent accuracy: {intent_accuracy if isinstance(intent_accuracy, str) else f'{intent_accuracy:.4f}'}")
#                 logging.info(f"  Slot F1 score: {slot_f1 if isinstance(slot_f1, str) else f'{slot_f1:.4f}'}")
#             else:
#                 logging.warning("Trial was not completed successfully.")

#     except Exception as e:
#         logging.error(f"Error during single trial test: {str(e)}")
#         logging.exception("Exception details:")
#         return None
#     finally:
#         clear_memory()
    
#     return study

In [19]:
def log_gpu_memory(stage):
    """Log GPU memory information."""
    if torch.cuda.is_available():
        current_device = torch.cuda.current_device()
        total_memory = torch.cuda.get_device_properties(current_device).total_memory
        allocated_memory = torch.cuda.memory_allocated(current_device)
        reserved_memory = torch.cuda.memory_reserved(current_device)
        available_memory = total_memory - allocated_memory - reserved_memory
        logging.info(f"{stage} - GPU Memory (GB): "
                    f"Total: {total_memory / 1e9:.2f}, "
                    f"Allocated: {allocated_memory / 1e9:.2f}, "
                    f"Reserved: {reserved_memory / 1e9:.2f}, "
                    f"Available: {available_memory / 1e9:.2f}")
    else:
        logging.info("CUDA is not available. Cannot log GPU memory.")



In [20]:
def log_best_params(study):
    """Log best parameters from the study."""
    best_params = study.best_params
    for param, value in best_params.items():
        mlflow.log_param(f"best_{param}", value)
    mlflow.log_metric("best_score", study.best_value)



In [21]:
def save_results(model_name, study, config):
    """Save the optimization results to disk."""
    output_dir = os.path.join(config['model']['output_dir'], 'optimization_results')
    os.makedirs(output_dir, exist_ok=True)
    result = {
        'best_params': study.best_params,
        'best_value': study.best_value,
        'best_trial': study.best_trial.number
    }
    with open(os.path.join(output_dir, f"{model_name}_optimization_results.json"), 'w') as f:
        json.dump(result, f, indent=2)

In [22]:
def print_best_trial_info(trial):
    """Print detailed information about the best trial."""
    print("\nBest Trial Information:")
    print(f"  Value: {trial.value}")
    print("  Params:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
    print(f"  Trial number: {trial.number}")
    print(f"  DateTime start: {trial.datetime_start}")
    print(f"  DateTime complete: {trial.datetime_complete}")

In [155]:
def evaluate_translation(model, tokenizer, datasets, evaluator):
    """
    Evaluate translation performance using the FLORES-200 dataset structure.
    
    Args:
        model (torch.nn.Module): The translation model to evaluate.
        tokenizer: The tokenizer for the model.
        datasets (Dict[str, Any]): The dictionary containing all datasets.
        evaluator (AfriCOMETEvaluator): The evaluator to use.
    Returns:
        dict: A dictionary containing evaluation results.
    """
    results = {'translations': {}}
    target_languages = ['swh', 'kin', 'lug']  # ISO codes for Swahili, Kinyarwanda, and Luganda
    english_code = 'eng'
    
    flores_dataset = datasets
    
    # Log dataset information
    for split, df in flores_dataset.items():
        logging.info(f"FLORES-200 '{split}' dataset shape: {df.shape}")
        logging.info(f"Source languages in '{split}': {df['src_lang'].unique()}")
        logging.info(f"Target languages in '{split}': {df['tgt_lang'].unique()}")
    
    for split, df in flores_dataset.items():
        results['translations'][split] = {}
        for lang in target_languages:
            try:
                eng_to_lang = df[(df['src_lang'] == english_code) & (df['tgt_lang'] == lang)]
                
                if eng_to_lang.empty:
                    logging.warning(f"No data found for {english_code} to {lang} translation in '{split}'. Skipping.")
                    continue
                
                eng_texts = eng_to_lang['src_text'].tolist()
                lang_texts = eng_to_lang['tgt_text'].tolist()
                
                logging.info(f"Translating {len(eng_texts)} sentences from {english_code} to {lang} in '{split}'")
                
                inputs = tokenizer(eng_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)
                with torch.no_grad():
                    generated = model.generate(**inputs, max_length=128)
                translations = tokenizer.batch_decode(generated, skip_special_tokens=True)
                
                scores = evaluator.evaluate(eng_texts, translations, lang_texts)
                results['translations'][split][f'{english_code}_to_{lang}'] = scores
                
                logging.info(f"Translation scores for {english_code} to {lang} in '{split}': {scores}")
            
            except Exception as e:
                logging.error(f"Error during translation evaluation for language {lang} in '{split}': {str(e)}")
                results['translations'][split][f'{english_code}_to_{lang}'] = {'error': str(e)}
    
    # Calculate average score across all splits and language pairs
    all_scores = [score['average_score'] for split_scores in results['translations'].values()
                  for score in split_scores.values() 
                  if isinstance(score, dict) and 'average_score' in score]
    
    if all_scores:
        results['average_score'] = np.mean(all_scores)
        logging.info(f"Overall average translation score: {results['average_score']}")
    else:
        results['average_score'] = np.nan
        logging.warning("No valid scores for translations")
    
    return results

In [24]:
def evaluate_zero_shot(zero_shot_classifier):
    """
    Evaluate the model's zero-shot classification performance.
    
    Args:
        datasets (Dict[str, Any]): The dictionary of datasets from DatasetLoader.
        zero_shot_classifier (object): The zero-shot classifier object.
    
    Returns:
        dict: A dictionary containing performance metrics.
    """
    if 'experimental' not in datasets or 'zero_shot' not in datasets['experimental']:
        raise ValueError("Zero-shot dataset not found in the provided datasets.")
    
    zero_shot_data = datasets['experimental']['zero_shot']
    
    if 'text' not in zero_shot_data.columns or 'intent_label' not in zero_shot_data.columns or 'slot_labels' not in zero_shot_data.columns:
        raise ValueError("Zero-shot data must contain 'text', 'intent_label', and 'slot_labels' columns.")

    texts = zero_shot_data['text'].tolist()
    true_intents = zero_shot_data['intent_label'].tolist()
    true_slots = zero_shot_data['slot_labels'].tolist()

    # Intent classification
    intent_candidate_labels = list(set(true_intents))
    intent_results = zero_shot_classifier.classify(texts, intent_candidate_labels)
    predicted_intents = [result['labels'][0] for result in intent_results]

    # Slot filling (assuming we have a method for this in the zero_shot_classifier)
    slot_candidate_labels = list(set([slot for slots in true_slots for slot in slots]))
    slot_results = zero_shot_classifier.classify_slots(texts, slot_candidate_labels)
    predicted_slots = [result['labels'] for result in slot_results]

    # Calculate metrics
    intent_accuracy = accuracy_score(true_intents, predicted_intents)
    intent_f1 = f1_score(true_intents, predicted_intents, average='weighted')
    
    # Flatten the slot labels and predictions
    true_slots_flat = [slot for slots in true_slots for slot in slots]
    pred_slots_flat = [slot for slots in predicted_slots for slot in slots]
    slot_f1 = f1_score(true_slots_flat, pred_slots_flat, average='weighted')

    return {
        'intent_accuracy': intent_accuracy,
        'intent_f1': intent_f1,
        'slot_f1': slot_f1
    }

In [25]:
def evaluate_code_switch(code_switch_classifier):
    """
    Evaluate the model's code-switch classification performance.
    
    Args:
        datasets (Dict[str, Any]): The dictionary of datasets from DatasetLoader.
        code_switch_classifier (object): The code-switch classifier object.
    
    Returns:
        dict: A dictionary containing performance metrics.
    """

    datasets
    if 'experimental' not in datasets or 'code_switch' not in datasets['experimental']:
        raise ValueError("Code-switch dataset not found in the provided datasets.")
    
    code_switch_data = datasets['experimental']['code_switch']
    
    if 'text' not in code_switch_data.columns or 'language' not in code_switch_data.columns or 'intent_label' not in code_switch_data.columns or 'slot_labels' not in code_switch_data.columns:
        raise ValueError("Code-switch data must contain 'text', 'language', 'intent_label', and 'slot_labels' columns.")

    texts = code_switch_data['text'].tolist()
    true_languages = code_switch_data['language'].tolist()
    true_intents = code_switch_data['intent_label'].tolist()
    true_slots = code_switch_data['slot_labels'].tolist()

    # Perform code-switch classification
    classification_results = code_switch_classifier.classify(texts)
    predicted_languages = [result['predicted_language'] for result in classification_results]
    predicted_intents = [result['predicted_intent'] for result in classification_results]
    predicted_slots = [result['predicted_slots'] for result in classification_results]

    # Calculate metrics
    language_accuracy = accuracy_score(true_languages, predicted_languages)
    intent_accuracy = accuracy_score(true_intents, predicted_intents)
    intent_f1 = f1_score(true_intents, predicted_intents, average='weighted')

    # Flatten the slot labels and predictions
    true_slots_flat = [slot for slots in true_slots for slot in slots]
    pred_slots_flat = [slot for slots in predicted_slots for slot in slots]
    slot_f1 = f1_score(true_slots_flat, pred_slots_flat, average='weighted')

    # Calculate per-language accuracy
    unique_languages = list(set(true_languages))
    per_language_accuracy = {}
    for lang in unique_languages:
        lang_mask = [tl == lang for tl in true_languages]
        lang_true = [tl for tl, mask in zip(true_languages, lang_mask) if mask]
        lang_pred = [pl for pl, mask in zip(predicted_languages, lang_mask) if mask]
        per_language_accuracy[lang] = accuracy_score(lang_true, lang_pred)

    # Create confusion matrix for languages
    cm = confusion_matrix(true_languages, predicted_languages, labels=unique_languages)

    return {
        'language_accuracy': language_accuracy,
        'intent_accuracy': intent_accuracy,
        'intent_f1': intent_f1,
        'slot_f1': slot_f1,
        'per_language_accuracy': per_language_accuracy,
        'confusion_matrix': cm.tolist()
    }

In [76]:
def summarize_results(results, config):
    """
    Summarize evaluation results.
    
    This function summarizes the results from various evaluation tasks and saves
    them to files.

    Args:
        results (dict): A dictionary containing results from all evaluation tasks.
        config (dict): A dictionary containing configuration information.

    Returns:
        pd.DataFrame: A summary of evaluation results.
    """
    logging.info("Summarizing results...")

    summary = {}
    for model_name in config['model']['names']:
        summary[model_name] = {
            'translation': results['translation'].get(model_name),
            'intent_recognition': results['intent_recognition'].get(model_name),
            'slot_filling': results['slot_filling'].get(model_name),
            'zero_shot': results['zero_shot'].get(model_name, {}).get('accuracy') if 'zero_shot' in results else None,
            'code_switch': results['code_switch'].get(model_name, {}).get('accuracy') if 'code_switch' in results else None,
        }

        # Add FLORES results if available
        if 'flores' in results['translation'].get(model_name, {}):
            summary[model_name]['flores_translation'] = results['translation'][model_name]['flores'].get('average_score')

    summary_df = pd.DataFrame(summary)
    logging.info("Evaluation Results Summary:")
    logging.info(summary_df)

    # Save results
    summary_df.to_csv(f"{config['model']['output_dir']}/evaluation_results_summary.csv")
    
    with open(f"{config['model']['output_dir']}/all_results.txt", 'w') as f:
        f.write(str(results))

    # Add this line to add summary to results
    results['summary'] = summary

    return summary_df

In [27]:
def plot_results(results, config):
    """
    Plot evaluation results.
    
    Args:
        results (dict): A dictionary containing results from all evaluation tasks.
        config (dict): A dictionary containing configuration information.
    """
    for model_name in config['model']['names']:
        plt.figure(figsize=(12, 8))
        
        if model_name in results['translation']:
            plt.bar(results['translation'][model_name].keys(), results['translation'][model_name].values())
            plt.title(f"Translation Scores - {model_name}")
            plt.tight_layout()
            plt.savefig(f"{config['model']['output_dir']}/{model_name}_translation_scores.png")
            plt.close()
        
        if model_name in results['generation']:
            plt.hist(results['generation'][model_name]['perplexities'], bins=20)
            plt.title(f"Perplexity Distribution - {model_name}")
            plt.xlabel("Perplexity")
            plt.ylabel("Frequency")
            plt.tight_layout()
            plt.savefig(f"{config['model']['output_dir']}/{model_name}_perplexity_distribution.png")
            plt.close()
        
        # Plot intent recognition confusion matrix
        if model_name in results['intent_recognition']:
            sns.heatmap(results['intent_recognition'][model_name]['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
            plt.title(f"Intent Recognition Confusion Matrix - {model_name}")
            plt.tight_layout()
            plt.savefig(f"{config['model']['output_dir']}/{model_name}_intent_confusion_matrix.png")
            plt.close()

        # Plot slot filling F1 scores
        if model_name in results['slot_filling']:
            plt.bar(results['slot_filling'][model_name]['f1_scores'].keys(), results['slot_filling'][model_name]['f1_scores'].values())
            plt.title(f"Slot Filling F1 Scores - {model_name}")
            plt.tight_layout()
            plt.savefig(f"{config['model']['output_dir']}/{model_name}_slot_f1_scores.png")
            plt.close()


    # Plot overall performance comparison
    plt.figure(figsize=(12, 6))
    performance_data = {
        model: {
            'Classification': results['classification'].get(model, {}).get('accuracy', 0),
            'Translation': results['translation'].get(model, {}).get('average_score', 0),
            'Generation': 1 / results['generation'].get(model, {}).get('average_perplexity', 1),  # Inverse of perplexity
            'Zero-shot': results['zero_shot'].get(model, {}).get('accuracy', 0),
            'Code-switch': results['code_switch'].get(model, {}).get('accuracy', 0),
            'Intent Recognition': results['intent_recognition'].get(model, {}).get('accuracy', 0),  # New
            'Slot Filling': results['slot_filling'].get(model, {}).get('f1_score', 0)  # New
        } for model in config['model']['names']
    }
    df = pd.DataFrame(performance_data).T
    sns.heatmap(df, annot=True, cmap='YlGnBu')
    plt.title("Model Performance Across Tasks")
    plt.tight_layout()
    plt.savefig(f"{config['model']['output_dir']}/overall_performance_heatmap.png")
    plt.close()

In [78]:
def log_results_to_mlflow(results, config, best_params):
    """
    Log results to MLflow.
    
    This function logs the evaluation results, model parameters, and artifacts to MLflow.

    Args:
        results (dict): A dictionary containing results from all evaluation tasks.
        config (dict): A dictionary containing configuration information.
        best_params (dict): A dictionary of the best hyperparameters found during optimization.
    """
    with mlflow.start_run():
        # Log hyperparameters for each model
        for model_name, params in best_params.items():
            for param, value in params.items():
                mlflow.log_param(f"{model_name}_{param}", value)
        
        # Log model information
        for model_name in config['model']['names']:
            mlflow.log_param(f"{model_name}_model", model_name)

        # Log metrics
        for model_name, metrics in results['summary'].items():
            for metric, value in metrics.items():
                if value is not None:
                    mlflow.log_metric(f"{model_name}_{metric}", value)

        # Log artifacts
        mlflow.log_artifact(f"{config['model']['output_dir']}/evaluation_results_summary.csv")
        mlflow.log_artifact(f"{config['model']['output_dir']}/all_results.txt")
        mlflow.log_artifact(f"{config['model']['output_dir']}/overall_performance_heatmap.png")
        
        # Log hyperparameter optimization plots
        for model_name in config['model']['names']:   
            mlflow.log_artifact(f"{config['model']['output_dir']}/{model_name}_hyperparameter_importance.png")
            mlflow.log_artifact(f"{config['model']['output_dir']}/{model_name}_optimization_history.png")
            mlflow.log_artifact(f"{config['model']['output_dir']}/{model_name}_parallel_coordinate.png")
            mlflow.log_artifact(f"{config['model']['output_dir']}/{model_name}_sensitivity_analysis.png")
            mlflow.log_artifact(f"{config['model']['output_dir']}/{model_name}_intent_confusion_matrix.png")
            mlflow.log_artifact(f"{config['model']['output_dir']}/{model_name}_slot_f1_scores.png")

In [80]:
print(results)

{'classification': {}, 'translation': {}, 'generation': {}, 'zero_shot': {}, 'code_switch': {}, 'intent_recognition': {}, 'slot_filling': {}, 'hyperparameter_studies': {'combined_study': <optuna.study.study.Study object at 0x7f8f262249d0>}, 'summary': {'afro-xlmr-large': {'translation': None, 'intent_recognition': None, 'slot_filling': None, 'zero_shot': None, 'code_switch': None}, 'meta-llama/Llama-2-7b-hf': {'translation': None, 'intent_recognition': None, 'slot_filling': None, 'zero_shot': None, 'code_switch': None}}}


In [29]:
def print_results_summary(results_summary, best_params):
    """
    Print a summary of the evaluation results and best hyperparameters.

    Args:
        results_summary (dict): A dictionary containing summarized results.
        best_params (dict): A dictionary of the best hyperparameters found during optimization.
    """
    print("\n===== EVALUATION RESULTS SUMMARY =====")

    if 'classification' in results_summary:
        print("\nClassification Results:")
        for dataset, metrics in results_summary['classification'].items():
            print(f"\n{dataset}:")
            for metric, value in metrics.items():
                print(f"  {metric}: {value:.4f}")

    if 'translation' in results_summary:
        print("\nTranslation Results:")
        print(f"  FLORES-200 Average AfriCOMET Score (A to B): {results_summary['translation']['a_to_b']['average_score']:.4f}")
        print(f"  FLORES-200 Average AfriCOMET Score (B to A): {results_summary['translation']['b_to_a']['average_score']:.4f}")

    if 'generation' in results_summary:
        print("\nGeneration Results:")
        print(f"  FLORES-200 Average Perplexity: {results_summary['generation']['average_perplexity']:.4f}")

    if 'zero_shot' in results_summary:
        print("\nZero-shot Results:")
        print(f"  Accuracy: {results_summary['zero_shot']['accuracy']:.4f}")

    if 'code_switch' in results_summary:
        print("\nCode-switch Results:")
        print(f"  Accuracy: {results_summary['code_switch']['accuracy']:.4f}")

    print("\nBest Hyperparameters:")
    for param, value in best_params.items():
        print(f"  {param}: {value}")

    print("\n======================================")

In [30]:
# Load configuration
config = load_config('../py/config.yaml')
# Set the device dynamically based on availability
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'  # Update device setting
auth_token = config.get("auth_token")
cache_dir = os.path.abspath(config['cache']['dir'])
logger = setup_logging(config)
set_seed(config['seed'])
device = get_device(config['device'])
logger.info(f"Using device: {device}")

Using device: cuda


In [31]:
# Ensure the cache directory exists
os.makedirs(cache_dir, exist_ok=True)

In [32]:
# Load and prepare datasets
data_loader = DatasetLoader(config)


Logging is configured.
Masakhane dir: /workspace/Msc-FYP/Datasets/Masakhane
FLORES-200 dir: /workspace/Msc-FYP/Datasets/FLORES-200
Experiments dir: /workspace/Msc-FYP/Datasets/Experiments
OntoNotes dir: /workspace/Msc-FYP/Datasets/OntoNotes_5.0


In [33]:
stratified_datasets = data_loader.prepare_stratified_datasets()

Searching for Masakhane datasets in: /workspace/Msc-FYP/Datasets/Masakhane/annotation_quality_corpus
Attempting to load swahili dataset from: /workspace/Msc-FYP/Datasets/Masakhane/annotation_quality_corpus/swahili.txt
Loaded 3006 samples from /workspace/Msc-FYP/Datasets/Masakhane/annotation_quality_corpus/swahili.txt
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[col].fillna(default, inplace=True)
Successfully loaded swahili dataset with 3006 samples
Attempting to load kinyarwanda dataset from: /workspace/Msc-FYP/Datasets/Masakhane/annotation_quality_corpus/kinyarwanda.txt
Loaded 3015 samples from /workspace/Msc-FYP/Datasets/Masakhane/annotation_q

In [34]:
for split, dataset in stratified_datasets.items():
    print(f"{split} dataset size: {len(dataset)}")

train dataset size: 1050
eval dataset size: 225
benchmark dataset size: 225
class_weights dataset size: 9
experimental dataset size: 2


In [35]:
# Verify data integrity
if not data_loader.verify_data_integrity(stratified_datasets):
    logger.error("Data integrity check failed. Please review the datasets.")
    sys.exit(1)


Verifying train dataset:
Dataset type: <class 'pandas.core.frame.DataFrame'>
Dataset shape: (1050, 4)
Columns: ['text', 'label', 'language', 'split']
Number of unique labels: 9
Unique labels: {'I-ORG', 'O', 'I-DATE', 'B-DATE', 'B-LOC', 'I-LOC', 'I-PER', 'B-ORG', 'B-PER'}
Text data check passed.
Languages in train dataset: ['kin' 'lug' 'swh']
Splits in train dataset: ['masakhane']
Sample data:
                                                   text  \
695   Perezida Paul Kagame aherutse kubikomozaho ati...   
1022  Kati obulwadde obulala obulabika ngobunafu bwe...   
51    Matayarisho ya uchaguzi Akiwa anajitayarisha k...   
493     Hari icyo byafasha amakipe yitegura amarushanwa   
390   Sendashonga wari umunyamuryango wa FPR Inkotan...   

                                                  label language      split  
695   O B-PER I-PER O O O O O O O O O O O O O O O O ...      kin  masakhane  
1022                          O O O O O O O O O O O O O      lug  masakhane  
51    O O O O 

In [36]:
# Print dataset information
data_loader.print_dataset_info(stratified_datasets)


--- TRAIN Dataset ---
Shape: (1050, 4)

Column Info:
<class 'pandas.core.frame.DataFrame'>
Index: 1050 entries, 695 to 912
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      1050 non-null   object
 1   label     1050 non-null   object
 2   language  1050 non-null   object
 3   split     1050 non-null   object
dtypes: object(4)
memory usage: 41.0+ KB
None

Sample Data:
                                                   text  \
695   Perezida Paul Kagame aherutse kubikomozaho ati...   
1022  Kati obulwadde obulala obulabika ngobunafu bwe...   
51    Matayarisho ya uchaguzi Akiwa anajitayarisha k...   
493     Hari icyo byafasha amakipe yitegura amarushanwa   
390   Sendashonga wari umunyamuryango wa FPR Inkotan...   

                                                  label language      split  
695   O B-PER I-PER O O O O O O O O O O O O O O O O ...      kin  masakhane  
1022                          O O O O O O O O

In [37]:
# Inspect the structure of stratified_datasets
print("Structure of stratified_datasets:")
print(data_loader.inspect_dataset_structure(stratified_datasets))

Structure of stratified_datasets:
train: DataFrame(shape=(1050, 4))
eval: DataFrame(shape=(225, 4))
benchmark: DataFrame(shape=(225, 4))
class_weights:   O: float(0.12834500106070248)
  B-PER: float(3.9158576051779934)
  I-PER: float(6.505376344086022)
  B-DATE: float(9.020234291799786)
  B-ORG: float(7.469135802469136)
  I-ORG: float(8.504016064257028)
  B-LOC: float(3.8622891016871863)
  I-DATE: float(9.317931793179318)
  I-LOC: float(14.116666666666667)
experimental:   zero_shot: DataFrame(shape=(5, 4))
  code_switch: DataFrame(shape=(5, 4))


In [38]:
# Preprocess datasets
preprocessed_datasets = {
    key: data_loader.preprocess_dataset(dataset)
    for key, dataset in stratified_datasets.items()
}

In [39]:
# Inspect the structure of preprocessed_datasets
print("\nStructure of preprocessed_datasets:")
print(data_loader.inspect_dataset_structure(preprocessed_datasets))


Structure of preprocessed_datasets:
train: DataFrame(shape=(1050, 4))
eval: DataFrame(shape=(225, 4))
benchmark: DataFrame(shape=(225, 4))
class_weights:   O: str(0.12835)
  B-PER: str(3.91586)
  I-PER: str(6.50538)
  B-DATE: str(9.02023)
  B-ORG: str(7.46914)
  I-ORG: str(8.50402)
  B-LOC: str(3.86229)
  I-DATE: str(9.31793)
  I-LOC: str(14.11667)
experimental:   zero_shot: DataFrame(shape=(5, 4))
  code_switch: DataFrame(shape=(5, 4))


In [40]:
# Check if the models directory is in the Python path
models_dir = os.path.abspath(os.path.join('..', 'py', 'models'))
if models_dir not in sys.path:
    sys.path.append(models_dir)
    print(f"Added {models_dir} to Python path")

Added /workspace/Msc-FYP/py/models to Python path


In [41]:
# Print contents of the models directory
print(f"Models directory contents: {os.listdir(models_dir)}")

Models directory contents: ['__pycache__', 'afro_xlmr_large.py', 'ernie_m.py', '.ipynb_checkpoints', 'llama2_decoder.py']


In [42]:
# Initialize models
models, tokenizers = {}, {}
num_labels = len(preprocessed_datasets['train']['label'].unique())

In [43]:
# Initialize models with the configured parameters and authentication token
for model_name in config['model']['names']:
    print(f"Initializing model: {model_name}")
    print(f"Cache directory: {cache_dir}")
    print(f"Auth token: {auth_token[:5]}...{auth_token[-5:] if auth_token else None}")
    try:
        if model_name == "meta-llama/Llama-2-7b-hf":
            llama_model = Llama2Decoder(model_name, auth_token=auth_token, cache_dir=cache_dir)
            models[model_name] = llama_model.get_model()   # The model is already on the appropriate device
            tokenizers[model_name] = llama_model.get_tokenizer()
        else:
            model, tokenizer = get_model(model_name, num_labels=num_labels, auth_token=auth_token, cache_dir=cache_dir)
            models[model_name] = model.to('cuda') # Ensure Afro XLMR is on GPU
            tokenizers[model_name] = tokenizer

        if models[model_name] is not None and tokenizers[model_name] is not None:
            print(f"Successfully initialized {model_name}")
        else:
            print(f"Failed to initialize {model_name}")
    except Exception as e:
        print(f"Error initializing {model_name}: {str(e)}")
        raise

Initializing model: afro-xlmr-large
Cache directory: /workspace/Msc-FYP/ipynb/model_cache
Auth token: hf_EF...Ulmqr


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Successfully initialized afro-xlmr-large
Initializing model: meta-llama/Llama-2-7b-hf
Cache directory: /workspace/Msc-FYP/ipynb/model_cache
Auth token: hf_EF...Ulmqr
Initializing Llama2Decoder with:
  model_name: meta-llama/Llama-2-7b-hf
  auth_token: hf_EF...Ulmqr
  cache_dir: /workspace/Msc-FYP/ipynb/model_cache
Set cache directory to: /workspace/Msc-FYP/ipynb/model_cache
Model meta-llama/Llama-2-7b-hf not found in cache. Will attempt to download.
Initializing tokenizer from meta-llama/Llama-2-7b-hf...
Tokenizer parameters:
  pretrained_model_name_or_path: meta-llama/Llama-2-7b-hf
  use_auth_token: True
  cache_dir: /workspace/Msc-FYP/ipynb/model_cache
  local_files_only: False
Initializing model from meta-llama/Llama-2-7b-hf...


We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Successfully initialized meta-llama/Llama-2-7b-hf
Model device: cuda:0
Successfully initialized meta-llama/Llama-2-7b-hf


In [44]:
print("Available models:", list(models.keys()))
print("Available tokenizers:", list(tokenizers.keys()))

Available models: ['afro-xlmr-large', 'meta-llama/Llama-2-7b-hf']
Available tokenizers: ['afro-xlmr-large', 'meta-llama/Llama-2-7b-hf']


In [45]:
# Create custom datasets for PyTorch
datasets = {}

model_type = 'encoder_decoder'  # New model type for the combined model

# For the combined Afro-XLMR and LLaMA model
combined_tokenizer = tokenizers['afro-xlmr-large']  # Assuming you're using the Afro-XLMR tokenizer for the combined model

logging.info(f"Creating datasets for combined Afro-XLMR and LLaMA model with model type: {model_type}")

# Use the key 'combined_afro_xlmr_llama'
datasets['combined_afro_xlmr_llama'] = {
    'train': CustomDataset(preprocessed_datasets['train'], combined_tokenizer, model_type=model_type),
    'eval': CustomDataset(preprocessed_datasets['eval'], combined_tokenizer, model_type=model_type),
    'benchmark': CustomDataset(preprocessed_datasets['benchmark'], combined_tokenizer, model_type=model_type)
}


Creating datasets for combined Afro-XLMR and LLaMA model with model type: encoder_decoder
Initialized CustomDataset with 1050 samples
Number of unique labels: 9
Model type: encoder_decoder
Initialized CustomDataset with 225 samples
Number of unique labels: 9
Model type: encoder_decoder
Initialized CustomDataset with 225 samples
Number of unique labels: 9
Model type: encoder_decoder


In [46]:
# Initialize classifiers
zero_shot_classifier = ZeroShotClassifier(
    encoder=models['afro-xlmr-large'],
    decoder=models['meta-llama/Llama-2-7b-hf'],
    tokenizer=combined_tokenizer
)

code_switch_classifier = CodeSwitchClassifier(
    encoder=models['afro-xlmr-large'],
    decoder=models['meta-llama/Llama-2-7b-hf'],
    tokenizer=combined_tokenizer
)


In [47]:
print(datasets)

{'combined_afro_xlmr_llama': {'train': <utils.CustomDataset object at 0x7f93689e5120>, 'eval': <utils.CustomDataset object at 0x7f93689e4d00>, 'benchmark': <utils.CustomDataset object at 0x7f8f272f3e20>}}


In [48]:
print("Available datasets:", datasets.keys())

Available datasets: dict_keys(['combined_afro_xlmr_llama'])


In [49]:
print("Keys in 'combined_afro_xlmr_llama':", datasets['combined_afro_xlmr_llama'].keys())


Keys in 'combined_afro_xlmr_llama': dict_keys(['train', 'eval', 'benchmark'])


In [50]:
print("Train dataset:", datasets['combined_afro_xlmr_llama']['train'])
print("Eval dataset:", datasets['combined_afro_xlmr_llama']['eval'])


Train dataset: <utils.CustomDataset object at 0x7f93689e5120>
Eval dataset: <utils.CustomDataset object at 0x7f93689e4d00>


In [51]:
# Initialize evaluators
evaluators = {
    'combined_afro_xlmr_llama': AfriCOMETEvaluator(
        model=None,  # AfriCOMETEvaluator doesn't use the model directly
        tokenizer=tokenizers['afro-xlmr-large']
    ),
}

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.9.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../root/.cache/huggingface/hub/models--masakhane--africomet-mtl/snapshots/91a7e56061446598665d569d89b762387399e3ac/checkpoints/model.ckpt`
Encoder model frozen.


In [52]:
# Run optimization for combined Afro-XLMR and LLaMA
combined_study = run_combined_optimization(
    models['afro-xlmr-large'],  # Encoder
    models['meta-llama/Llama-2-7b-hf'],  # Decoder
    tokenizers['afro-xlmr-large'], # encoder_tokenizer
    tokenizers['meta-llama/Llama-2-7b-hf'], # decoder_tokenizer
    datasets['combined_afro_xlmr_llama'],  
    config,
    evaluators['combined_afro_xlmr_llama'] 
)

Starting hyperparameter optimization for combined Afro-XLMR and LLaMA (including intent and slot tasks)
Before optimization - GPU Memory (GB): Total: 85.06, Allocated: 17.98, Reserved: 17.99, Available: 49.09
PyTorch version: 2.1.0+cu118
[I 2024-09-23 01:34:34,996] A new study created in memory with name: no-name-bf6249f9-1340-487c-b480-42da0f853b41
Cleared GPU memory. Current allocated: 17.98 GB
Trial 0: Starting with hyperparameters: {'encoder_lr': 1.584334793720517e-06, 'decoder_lr': 3.609547702666522e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'weight_decay': 0.03126343079509253, 'warmup_steps': 475, 'gradient_accumulation_steps': 128, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_outputs', 'seed': 42, 'device': 'cuda', 'cache_dir': './model_cache', 'gradient_checkpointing': True, 'intent_loss_weight': 0.4915261425253563, 'slo

Starting training process
CUDA Memory (Start of training) - Allocated: 15.74GB (Max: 17.22GB), Reserved: 17.23GB (Max: 17.23GB)
Total training steps: 66


Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting tr

CUDA Memory (End of training) - Allocated: 29.55GB (Max: 34.48GB), Reserved: 32.01GB (Max: 37.04GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 31.73 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 31.74 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 31.73 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 31.74 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 14.531547
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 47, Predicted Intent: 47
  True Slots: [58]
  Predicted Slots: [58]
Example 2:
  True Intent: 47, Predicted Intent: 47

Evaluating translation results with Africomet
AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:04<00:00,  6.93it/s]
Epoch 1/2 Evaluation results: {'translation_score': 0.11248937243863316, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 14.531546592712402}
Starting epoch 2/2
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0005, Max: 0.5359, Median: 0.1109, Average: 0.1125
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 27.46GB (Max: 34.48GB), Reserved: 30.83GB (Max: 37.04GB)
Total training steps: 66


Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting tr

CUDA Memory (End of training) - Allocated: 27.67GB (Max: 34.48GB), Reserved: 30.83GB (Max: 37.04GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 29.72 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 29.72 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 29.72 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 29.72 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 15.160246
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 47, Predicted Intent: 47
  True Slots: [58]
  Predicted Slots: [58]
Example 2:
  True Intent: 47, Predicted Intent: 47

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:04<00:00,  7.21it/s]
Epoch 2/2 Evaluation results: {'translation_score': 0.11505203403962347, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 15.160245895385742}
Early stopping patience: 1/3
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0004, Max: 0.2976, Median: 0.1144, Average: 0.1151
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.20 GB
[I 2024-09-23 01:56:24,867] Trial 0 finished with value: -0.8849479659603765 and parameters: {'encoder_lr': 1.584334793720517e-06, 'decoder_lr': 3.609547702666522e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'weight_decay': 0.03126343079509253, 'warmup_steps': 475, 'gradient_accumulation_steps': 128, 'intent_loss_weight': 0.4915261425253563, 'slot_loss_weight': 0.8792820384180697}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.20 GB
Trial 1: Starting with hyperparameters: {'encoder_lr': 2.7540994896652652e-05, 'decoder_lr': 8.396947822021402e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 16, 'weight_decay': 0.012576227491772597, 'warmup_steps': 117, 'gradient_accumulation_steps': 8, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_

Starting training process
CUDA Memory (Start of training) - Allocated: 27.23GB (Max: 34.48GB), Reserved: 30.74GB (Max: 37.04GB)
Total training steps: 132


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.64GB (Max: 68.52GB), Reserved: 59.24GB (Max: 69.34GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 15.182608
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 36, Predicted Intent: 36
  True Slots: [55]
  Predicted Slots: [55]
Example 2:
  True Intent: 36, Predicted Intent: 36

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.04it/s]
Epoch 1/2 Evaluation results: {'translation_score': 0.10912466122520185, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 15.182607650756836}
Starting epoch 2/2
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0050, Max: 0.4240, Median: 0.1111, Average: 0.1091
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.65GB (Max: 68.52GB), Reserved: 59.24GB (Max: 69.34GB)
Total training steps: 132


Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting tr

CUDA Memory (End of training) - Allocated: 54.86GB (Max: 68.75GB), Reserved: 59.12GB (Max: 69.34GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.91 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.91 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 14.975476
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 36, Predicted Intent: 36
  True Slots: [55]
  Predicted Slots: [55]
Example 2:
  True Intent: 36, Predicted Intent: 36

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.31it/s]
TOKENIZERS_PARALLELISM=(true | false)
Epoch 2/2 Evaluation results: {'translation_score': 0.1179577163280488, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 14.975476264953613}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 216
Score statistics: Min: 0.0021, Max: 0.4339, Median: 0.1119, Average: 0.1180
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.20 GB
[I 2024-09-23 02:19:23,841] Trial 1 finished with value: -0.8820422836719513 and parameters: {'encoder_lr': 2.7540994896652652e-05, 'decoder_lr': 8.396947822021402e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.012576227491772597, 'warmup_steps': 117, 'gradient_accumulation_steps': 8, 'intent_loss_weight': 0.7740213672953477, 'slot_loss_weight': 0.832290200442238}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.20 GB
Trial 2: Starting with hyperparameters: {'encoder_lr': 1.3220780379824521e-05, 'decoder_lr': 7.804805185706499e-06, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'weight_decay': 0.08951796334203876, 'warmup_steps': 430, 'gradient_accumulation_steps': 4, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_outp

Starting training process
CUDA Memory (Start of training) - Allocated: 27.23GB (Max: 68.75GB), Reserved: 29.81GB (Max: 69.34GB)
Total training steps: 396


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 41.04GB (Max: 68.75GB), Reserved: 42.55GB (Max: 69.62GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 44.08 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 44.08 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 44.08 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 44.08 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 14.810886
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 43, Predicted Intent: 43
  True Slots: [35]
  Predicted Slots: [35]
Example 2:
  True Intent: 43, Predicted Intent: 43

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  7.34it/s]
Epoch 1/3 Evaluation results: {'translation_score': 0.1099138421891898, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 14.81088638305664}
Starting epoch 2/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU memor

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 218
Score statistics: Min: 0.0007, Max: 0.4967, Median: 0.1096, Average: 0.1099
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 41.06GB (Max: 68.75GB), Reserved: 42.55GB (Max: 69.62GB)
Total training steps: 396


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 41.27GB (Max: 68.75GB), Reserved: 42.63GB (Max: 69.93GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 44.32 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 44.32 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 44.32 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 44.32 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 14.793429
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 43, Predicted Intent: 43
  True Slots: [35]
  Predicted Slots: [35]
Example 2:
  True Intent: 43, Predicted Intent: 43

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.16it/s]
Epoch 2/3 Evaluation results: {'translation_score': 0.11561169816376683, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 14.793429374694824}
Starting epoch 3/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 219
Score statistics: Min: 0.0039, Max: 0.4742, Median: 0.1148, Average: 0.1156
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 41.29GB (Max: 68.75GB), Reserved: 42.63GB (Max: 69.93GB)
Total training steps: 396


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 41.50GB (Max: 68.75GB), Reserved: 42.73GB (Max: 70.19GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 44.56 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 44.56 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 44.56 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 44.56 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 14.471477
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 43, Predicted Intent: 43
  True Slots: [35]
  Predicted Slots: [35]
Example 2:
  True Intent: 43, Predicted Intent: 43

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.11it/s]
TOKENIZERS_PARALLELISM=(true | false)
Epoch 3/3 Evaluation results: {'translation_score': 0.11515461621865024, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 14.471476554870605}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0088, Max: 0.2896, Median: 0.1160, Average: 0.1152
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 14.60 GB
[I 2024-09-23 03:01:42,368] Trial 2 finished with value: -0.8848453837813498 and parameters: {'encoder_lr': 1.3220780379824521e-05, 'decoder_lr': 7.804805185706499e-06, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.08951796334203876, 'warmup_steps': 430, 'gradient_accumulation_steps': 4, 'intent_loss_weight': 0.9716137636933955, 'slot_loss_weight': 0.43379227621507477}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 14.60 GB
Delay after trial 1: 0.09 seconds
Trial 1 completed in 5227.37 seconds
After trial 1 - GPU Memory (GB): Total: 85.06, Allocated: 14.60, Reserved: 16.38, Available: 54.08
Trial 1 results:
  Translation score: -0.8848
  Intent accuracy: 1.0000
  Slot F1 score: 1.0000
Cleared GPU memory. Current allocated: 14.60 GB
Trial 3: Starting with hyperparameters: {'encoder_lr': 1.2271216456820661e-05, 'decoder_lr': 6.572739527520792e-06, 'num_train_epochs': 3, 

Starting training process
CUDA Memory (Start of training) - Allocated: 13.64GB (Max: 68.75GB), Reserved: 15.25GB (Max: 70.19GB)
Total training steps: 1575


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.64GB (Max: 68.75GB), Reserved: 56.58GB (Max: 70.19GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 13.004158
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 38, Predicted Intent: 38
  True Slots: [58]
  Predicted Slots: [58]
Example 2:
  True Intent: 38, Predicted Intent: 38

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.18it/s]
Epoch 1/3 Evaluation results: {'translation_score': 0.11056665698733097, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 13.004158020019531}
Starting epoch 2/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 220
Score statistics: Min: 0.0018, Max: 0.2937, Median: 0.1134, Average: 0.1106
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.65GB (Max: 68.75GB), Reserved: 56.58GB (Max: 70.19GB)
Total training steps: 1575


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.86GB (Max: 68.75GB), Reserved: 56.57GB (Max: 70.19GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.91 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.91 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.135643
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 38, Predicted Intent: 38
  True Slots: [58]
  Predicted Slots: [58]
Example 2:
  True Intent: 38, Predicted Intent: 38

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.08it/s]
Epoch 2/3 Evaluation results: {'translation_score': 0.12095317937895109, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.135643005371094}
Starting epoch 3/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 218
Score statistics: Min: 0.0023, Max: 0.4942, Median: 0.1178, Average: 0.1210
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.88GB (Max: 68.75GB), Reserved: 56.57GB (Max: 70.19GB)
Total training steps: 1575


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 55.09GB (Max: 68.75GB), Reserved: 56.57GB (Max: 70.19GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 59.16 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 59.16 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 59.16 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 59.16 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.344810
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 38, Predicted Intent: 38
  True Slots: [57]
  Predicted Slots: [57]
Example 2:
  True Intent: 38, Predicted Intent: 38

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.19it/s]
Epoch 3/3 Evaluation results: {'translation_score': 0.12430255515628481, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.344809532165527}
Early stopping patience: 1/3
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 219
Score statistics: Min: 0.0048, Max: 0.3294, Median: 0.1201, Average: 0.1243
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.20 GB
[I 2024-09-23 04:13:07,909] Trial 3 finished with value: -0.8756974448437151 and parameters: {'encoder_lr': 1.2271216456820661e-05, 'decoder_lr': 6.572739527520792e-06, 'num_train_epochs': 3, 'per_device_train_batch_size': 2, 'weight_decay': 0.06350094246672124, 'warmup_steps': 364, 'gradient_accumulation_steps': 4, 'intent_loss_weight': 0.34073433526683883, 'slot_loss_weight': 0.1923919090222901}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.20 GB
Delay after trial 2: 0.11 seconds
Trial 2 completed in 4285.44 seconds
After trial 2 - GPU Memory (GB): Total: 85.06, Allocated: 29.20, Reserved: 31.30, Available: 24.55
Trial 2 results:
  Translation score: -0.8757
  Intent accuracy: 1.0000
  Slot F1 score: 1.0000
Cleared GPU memory. Current allocated: 29.20 GB
Trial 4: Starting with hyperparameters: {'encoder_lr': 5.340901774718344e-05, 'decoder_lr': 9.720656845589094e-06, 'num_train_epochs': 3, '

Starting training process
CUDA Memory (Start of training) - Allocated: 27.23GB (Max: 68.75GB), Reserved: 29.15GB (Max: 70.19GB)
Total training steps: 99


Starting train_step for batch 1
Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting tra

CUDA Memory (End of training) - Allocated: 54.65GB (Max: 69.85GB), Reserved: 57.09GB (Max: 70.97GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.882577
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 34, Predicted Intent: 34
  True Slots: [82]
  Predicted Slots: [82]
Example 2:
  True Intent: 34, Predicted Intent: 34

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.09it/s]
TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/3 Evaluation results: {'translation_score': 0.12250518514487688, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.882576942443848}
Starting epoch 2/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 219
Score statistics: Min: 0.0015, Max: 0.3852, Median: 0.1182, Average: 0.1225
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.66GB (Max: 69.85GB), Reserved: 57.10GB (Max: 70.97GB)
Total training steps: 99


Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting tr

CUDA Memory (End of training) - Allocated: 54.88GB (Max: 70.09GB), Reserved: 57.10GB (Max: 70.97GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.93 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.94 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.93 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.94 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.430446
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 34, Predicted Intent: 34
  True Slots: [82]
  Predicted Slots: [82]
Example 2:
  True Intent: 34, Predicted Intent: 34

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  7.38it/s]
Epoch 2/3 Evaluation results: {'translation_score': 0.11602186480730907, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.430445671081543}
Starting epoch 3/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0082, Max: 0.3916, Median: 0.1126, Average: 0.1160
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.90GB (Max: 70.09GB), Reserved: 57.10GB (Max: 70.97GB)
Total training steps: 99


Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting tr

CUDA Memory (End of training) - Allocated: 55.11GB (Max: 70.33GB), Reserved: 57.04GB (Max: 71.24GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 59.18 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 59.18 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 59.18 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 59.18 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.927847
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 34, Predicted Intent: 34
  True Slots: [82]
  Predicted Slots: [82]
Example 2:
  True Intent: 34, Predicted Intent: 34

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.14it/s]
Epoch 3/3 Evaluation results: {'translation_score': 0.12003916883799015, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.927846908569336}
Early stopping patience: 1/3
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0102, Max: 0.2541, Median: 0.1161, Average: 0.1200
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.22 GB
[I 2024-09-23 04:45:16,160] Trial 4 finished with value: -0.8799608311620098 and parameters: {'encoder_lr': 5.340901774718344e-05, 'decoder_lr': 9.720656845589094e-06, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.013849567247167247, 'warmup_steps': 175, 'gradient_accumulation_steps': 32, 'intent_loss_weight': 0.5289296755578761, 'slot_loss_weight': 0.6355026326449162}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.22 GB
Trial 5: Starting with hyperparameters: {'encoder_lr': 5.5751951468132376e-06, 'decoder_lr': 1.0655500347034755e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'weight_decay': 0.03857847809160532, 'warmup_steps': 297, 'gradient_accumulation_steps': 8, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_ou

Starting training process
CUDA Memory (Start of training) - Allocated: 27.25GB (Max: 70.33GB), Reserved: 30.19GB (Max: 71.24GB)
Total training steps: 264


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.66GB (Max: 70.33GB), Reserved: 55.69GB (Max: 71.24GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.70 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.70 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.70 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.70 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 12.402215
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 17, Predicted Intent: 17
  True Slots: [67]
  Predicted Slots: [67]
Example 2:
  True Intent: 17, Predicted Intent: 17

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  7.65it/s]
Epoch 1/2 Evaluation results: {'translation_score': 0.12681887674128312, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 12.402215003967285}
Starting epoch 2/2
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 218
Score statistics: Min: 0.0030, Max: 0.5103, Median: 0.1246, Average: 0.1268
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.67GB (Max: 70.33GB), Reserved: 55.69GB (Max: 71.24GB)
Total training steps: 264


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.88GB (Max: 70.33GB), Reserved: 55.76GB (Max: 71.24GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.94 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.94 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.94 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.94 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 12.482211
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 17, Predicted Intent: 17
  True Slots: [8]
  Predicted Slots: [8]
Example 2:
  True Intent: 17, Predicted Intent: 17
 

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.13it/s]
Epoch 2/2 Evaluation results: {'translation_score': 0.12312586307227952, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 12.482211112976074}
Early stopping patience: 1/3
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 219
Score statistics: Min: 0.0009, Max: 0.5144, Median: 0.1225, Average: 0.1231
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.21 GB
[I 2024-09-23 05:08:36,318] Trial 5 finished with value: -0.8768741369277204 and parameters: {'encoder_lr': 5.5751951468132376e-06, 'decoder_lr': 1.0655500347034755e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.03857847809160532, 'warmup_steps': 297, 'gradient_accumulation_steps': 8, 'intent_loss_weight': 0.8542567056492404, 'slot_loss_weight': 0.8183935438359984}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.21 GB
Trial 6: Starting with hyperparameters: {'encoder_lr': 1.0620039101098075e-05, 'decoder_lr': 6.409771301580595e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'weight_decay': 0.026185644643783743, 'warmup_steps': 379, 'gradient_accumulation_steps': 8, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_o

Starting training process
CUDA Memory (Start of training) - Allocated: 27.24GB (Max: 70.33GB), Reserved: 30.85GB (Max: 71.24GB)
Total training steps: 33


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.68GB (Max: 70.33GB), Reserved: 57.16GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.72 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.72 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.72 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.72 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.762144
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 27, Predicted Intent: 27
  True Slots: [34]
  Predicted Slots: [34]
Example 2:
  True Intent: 27, Predicted Intent: 27

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.09it/s]
Epoch 1/1 Evaluation results: {'translation_score': 0.12088168234572755, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.762144088745117}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 222
Score statistics: Min: 0.0045, Max: 0.6466, Median: 0.1206, Average: 0.1209
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.23 GB
[I 2024-09-23 05:17:36,439] Trial 6 finished with value: -0.8791183176542725 and parameters: {'encoder_lr': 1.0620039101098075e-05, 'decoder_lr': 6.409771301580595e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 32, 'weight_decay': 0.026185644643783743, 'warmup_steps': 379, 'gradient_accumulation_steps': 8, 'intent_loss_weight': 0.8371677057410659, 'slot_loss_weight': 0.6774568835061096}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.23 GB
Delay after trial 3: 0.12 seconds
Trial 3 completed in 3868.42 seconds
After trial 3 - GPU Memory (GB): Total: 85.06, Allocated: 29.23, Reserved: 34.31, Available: 21.52
Trial 3 results:
  Translation score: -0.8791
  Intent accuracy: 1.0000
  Slot F1 score: 1.0000
Cleared GPU memory. Current allocated: 29.23 GB
Trial 7: Starting with hyperparameters: {'encoder_lr': 5.95928012590099e-05, 'decoder_lr': 2.890273847476356e-06, 'num_train_epochs': 1, '

Starting training process
CUDA Memory (Start of training) - Allocated: 27.26GB (Max: 70.33GB), Reserved: 31.95GB (Max: 71.27GB)
Total training steps: 66


Updating projection layer: 5120 -> 4897
Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting tr

CUDA Memory (End of training) - Allocated: 54.67GB (Max: 70.33GB), Reserved: 56.20GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.71 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.71 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.71 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.71 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.112671
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 36, Predicted Intent: 36
  True Slots: [50]
  Predicted Slots: [50]
Example 2:
  True Intent: 36, Predicted Intent: 36

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.13it/s]
Epoch 1/1 Evaluation results: {'translation_score': 0.11840678740788353, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.1126708984375}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 218
Score statistics: Min: 0.0039, Max: 0.2487, Median: 0.1175, Average: 0.1184
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.22 GB
[I 2024-09-23 05:27:48,796] Trial 7 finished with value: -0.8815932125921164 and parameters: {'encoder_lr': 5.95928012590099e-05, 'decoder_lr': 2.890273847476356e-06, 'num_train_epochs': 1, 'per_device_train_batch_size': 16, 'weight_decay': 0.036932736284978056, 'warmup_steps': 269, 'gradient_accumulation_steps': 64, 'intent_loss_weight': 0.6938748227654997, 'slot_loss_weight': 0.9377115967447781}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.22 GB
Trial 8: Starting with hyperparameters: {'encoder_lr': 3.481496259559779e-05, 'decoder_lr': 1.122780597878627e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'per_device_eval_batch_size': 8, 'weight_decay': 0.011692717320829804, 'warmup_steps': 350, 'gradient_accumulation_steps': 2, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_outp

Starting training process
CUDA Memory (Start of training) - Allocated: 27.25GB (Max: 70.33GB), Reserved: 31.92GB (Max: 71.27GB)
Total training steps: 396


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 41.05GB (Max: 70.33GB), Reserved: 42.42GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 44.08 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 44.09 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 44.08 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 44.09 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 10.040504
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 29, Predicted Intent: 29
  True Slots: [91]
  Predicted Slots: [91]
Example 2:
  True Intent: 29, Predicted Intent: 29

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.08it/s]
Epoch 1/3 Evaluation results: {'translation_score': 0.11894268495855709, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 10.04050350189209}
Starting epoch 2/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU memo

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 219
Score statistics: Min: 0.0023, Max: 0.3311, Median: 0.1161, Average: 0.1189
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 41.07GB (Max: 70.33GB), Reserved: 42.43GB (Max: 71.27GB)
Total training steps: 396


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 41.28GB (Max: 70.33GB), Reserved: 42.59GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 44.33 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 44.33 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 44.33 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 44.33 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 11.314167
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 29, Predicted Intent: 29
  True Slots: [91]
  Predicted Slots: [91]
Example 2:
  True Intent: 29, Predicted Intent: 29

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.16it/s]
Epoch 2/3 Evaluation results: {'translation_score': 0.11792881634001705, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 11.314167022705078}
Early stopping patience: 1/3
Starting epoch 3/3
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training 

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 220
Score statistics: Min: 0.0032, Max: 0.4501, Median: 0.1231, Average: 0.1179
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 41.29GB (Max: 70.33GB), Reserved: 42.50GB (Max: 71.27GB)
Total training steps: 396


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 41.50GB (Max: 70.33GB), Reserved: 42.80GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 44.57 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 44.57 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 44.57 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 44.57 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 9.495889
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 29, Predicted Intent: 29
  True Slots: [91]
  Predicted Slots: [91]
Example 2:
  True Intent: 29, Predicted Intent: 29


Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.21it/s]
Epoch 3/3 Evaluation results: {'translation_score': 0.1220100999679752, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 9.495888710021973}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 222
Score statistics: Min: 0.0069, Max: 0.4359, Median: 0.1267, Average: 0.1220
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 14.60 GB
[I 2024-09-23 06:07:55,212] Trial 8 finished with value: -0.8779899000320248 and parameters: {'encoder_lr': 3.481496259559779e-05, 'decoder_lr': 1.122780597878627e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.011692717320829804, 'warmup_steps': 350, 'gradient_accumulation_steps': 2, 'intent_loss_weight': 0.21037072396085482, 'slot_loss_weight': 0.3650443741137871}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 14.60 GB
Trial 9: Starting with hyperparameters: {'encoder_lr': 2.713226672082772e-06, 'decoder_lr': 7.029251555873959e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 2, 'weight_decay': 0.05314240785855675, 'warmup_steps': 256, 'gradient_accumulation_steps': 8, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './model_outpu

Starting training process
CUDA Memory (Start of training) - Allocated: 13.64GB (Max: 70.33GB), Reserved: 15.25GB (Max: 71.27GB)
Total training steps: 525


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.64GB (Max: 70.33GB), Reserved: 56.31GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.67 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 9.999743
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 49, Predicted Intent: 49
  True Slots: [63]
  Predicted Slots: [63]
Example 2:
  True Intent: 49, Predicted Intent: 49


Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  7.91it/s]
Epoch 1/1 Evaluation results: {'translation_score': 0.11835029269967161, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 9.99974250793457}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 222
Score statistics: Min: 0.0025, Max: 0.5097, Median: 0.1167, Average: 0.1184
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.20 GB
[I 2024-09-23 06:21:45,346] Trial 9 finished with value: -0.8816497073003284 and parameters: {'encoder_lr': 2.713226672082772e-06, 'decoder_lr': 7.029251555873959e-05, 'num_train_epochs': 1, 'per_device_train_batch_size': 2, 'weight_decay': 0.05314240785855675, 'warmup_steps': 256, 'gradient_accumulation_steps': 8, 'intent_loss_weight': 0.880005622356883, 'slot_loss_weight': 0.13543329629075776}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.20 GB
Delay after trial 4: 0.10 seconds
Trial 4 completed in 3848.78 seconds
After trial 4 - GPU Memory (GB): Total: 85.06, Allocated: 29.20, Reserved: 31.67, Available: 24.19
Trial 4 results:
  Translation score: -0.8816
  Intent accuracy: 1.0000
  Slot F1 score: 1.0000
Cleared GPU memory. Current allocated: 29.20 GB
Trial 10: Starting with hyperparameters: {'encoder_lr': 1.0385188433766581e-06, 'decoder_lr': 1.4344219528553843e-06, 'num_train_epochs': 2, 

Starting training process
CUDA Memory (Start of training) - Allocated: 27.23GB (Max: 70.33GB), Reserved: 29.49GB (Max: 71.27GB)
Total training steps: 526


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.64GB (Max: 70.33GB), Reserved: 55.13GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 9.783488
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 42, Predicted Intent: 42
  True Slots: [56]
  Predicted Slots: [56]
Example 2:
  True Intent: 42, Predicted Intent: 42


Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  7.92it/s]
Epoch 1/2 Evaluation results: {'translation_score': 0.11734718817172411, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 9.783488273620605}
Starting epoch 2/2
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU memo

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 221
Score statistics: Min: 0.0081, Max: 0.2867, Median: 0.1179, Average: 0.1173
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.66GB (Max: 70.33GB), Reserved: 55.14GB (Max: 71.27GB)
Total training steps: 526


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.87GB (Max: 70.33GB), Reserved: 55.25GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 10.140888
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 42, Predicted Intent: 42
  True Slots: [56]
  Predicted Slots: [56]
Example 2:
  True Intent: 42, Predicted Intent: 42

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.08it/s]
TOKENIZERS_PARALLELISM=(true | false)
Epoch 2/2 Evaluation results: {'translation_score': 0.1188260601286631, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 10.140888214111328}
Early stopping patience: 1/3
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 218
Score statistics: Min: 0.0094, Max: 0.3331, Median: 0.1174, Average: 0.1188
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.20 GB
[I 2024-09-23 06:49:32,084] Trial 10 finished with value: -0.8811739398713369 and parameters: {'encoder_lr': 1.0385188433766581e-06, 'decoder_lr': 1.4344219528553843e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 4, 'weight_decay': 0.02053691187630526, 'warmup_steps': 486, 'gradient_accumulation_steps': 128, 'intent_loss_weight': 0.5084874460068664, 'slot_loss_weight': 0.9904333083270163}. Best is trial 0 with value: -0.8849479659603765.
Cleared GPU memory. Current allocated: 29.20 GB
Trial 11: Starting with hyperparameters: {'encoder_lr': 2.994471116556308e-06, 'decoder_lr': 3.3477388089161037e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'per_device_eval_batch_size': 1, 'weight_decay': 0.09304895986444242, 'warmup_steps': 487, 'gradient_accumulation_steps': 128, 'fp16': False, 'evaluation_strategy': 'steps', 'eval_steps': 200, 'save_steps': 200, 'logging_steps': 50, 'max_grad_norm': 1.0, 'output_dir': './mode

Starting training process
CUDA Memory (Start of training) - Allocated: 27.23GB (Max: 70.33GB), Reserved: 29.67GB (Max: 71.27GB)
Total training steps: 2100


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.64GB (Max: 70.33GB), Reserved: 54.72GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.68 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 10.706799
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 40, Predicted Intent: 40
  True Slots: [76]
  Predicted Slots: [76]
Example 2:
  True Intent: 40, Predicted Intent: 40

Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.01it/s]
Epoch 1/2 Evaluation results: {'translation_score': 0.11673181392920659, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 10.706798553466797}
Starting epoch 2/2
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU mem

AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 223
Score statistics: Min: 0.0054, Max: 0.2625, Median: 0.1112, Average: 0.1167
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results
Starting training process
CUDA Memory (Start of training) - Allocated: 54.66GB (Max: 70.33GB), Reserved: 54.80GB (Max: 71.27GB)
Total training steps: 2100


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.87GB (Max: 70.33GB), Reserved: 55.21GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 58.92 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 9.797112
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 40, Predicted Intent: 40
  True Slots: [90]
  Predicted Slots: [90]
Example 2:
  True Intent: 40, Predicted Intent: 40


Evaluating translation results with Africomet


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


AfriCOMET model moved to GPU.
Number of texts to evaluate: 225
Starting AfriCOMET prediction


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Predicting DataLoader 0: 100%|██████████| 29/29 [00:03<00:00,  8.03it/s]
Epoch 2/2 Evaluation results: {'translation_score': 0.10894583156399215, 'intent_accuracy': 1.0, 'slot_f1': 1.0, 'eval_loss': 9.797112464904785}
Loaded best model state


AfriCOMET prediction completed
Type of model_output: <class 'comet.models.utils.Prediction'>
Number of valid scores: 209
Score statistics: Min: 0.0007, Max: 0.3218, Median: 0.1043, Average: 0.1089
CUDA memory cleared and AfriCOMET model moved to CPU.
Complete evaluating translation results


Cleared GPU memory. Current allocated: 29.20 GB
[I 2024-09-23 07:44:20,252] Trial 11 finished with value: -0.8910541684360078 and parameters: {'encoder_lr': 2.994471116556308e-06, 'decoder_lr': 3.3477388089161037e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'weight_decay': 0.09304895986444242, 'warmup_steps': 487, 'gradient_accumulation_steps': 128, 'intent_loss_weight': 0.9761991671313265, 'slot_loss_weight': 0.43081143618503664}. Best is trial 11 with value: -0.8910541684360078.
Trial 5 completed in 4954.80 seconds
After trial 5 - GPU Memory (GB): Total: 85.06, Allocated: 29.20, Reserved: 32.14, Available: 23.72
Trial 5 results:
  Translation score: -0.8911
  Intent accuracy: 1.0000
  Slot F1 score: 1.0000
Best trial results:
  Translation score: -0.8911
  Intent accuracy: 1.0000
  Slot F1 score: 1.0000
Average delay between trials: 0.11 seconds
Cleared GPU memory. Current allocated: 29.20 GB


In [55]:
print(f"Studies: {studies.items()}")

dict_items([('combined_study', <optuna.study.study.Study object at 0x7f8f262249d0>)])


In [53]:
# Run hyperparameter optimization
try:
    # Consolidate the results into the studies variable
    studies = {
        'combined_study': combined_study,
    }
    # Extract best parameters
    best_params = {model_name: study.best_params for model_name, study in studies.items()}
    print(best_params)

    # Log best parameters
    for model_name, params in best_params.items():
        logger.info(f"Best hyperparameters for {model_name}: {params}")
        config['training'][model_name] = params

except Exception as e:
    logger.error(f"Error during or after hyperparameter optimization: {str(e)}")
    logger.exception("Exception details:")
    studies = {}  # Initialize an empty dictionary if optimization failed

Best hyperparameters for combined_study: {'encoder_lr': 2.994471116556308e-06, 'decoder_lr': 3.3477388089161037e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'weight_decay': 0.09304895986444242, 'warmup_steps': 487, 'gradient_accumulation_steps': 128, 'intent_loss_weight': 0.9761991671313265, 'slot_loss_weight': 0.43081143618503664}


In [54]:
# Hyperparameter analysis
logger.info("Performing hyperparameter analysis...")

Performing hyperparameter analysis...


In [68]:
import plotly.graph_objects as go
import numpy as np

In [64]:
for model_name, study in studies.items():
    # Plot hyperparameter importance
    importance_fig = plot_hyperparameter_importance(study)
    importance_fig.write_image(f"{config['model']['output_dir']}/{model_name}_hyperparameter_importance.png")

    # Plot optimization history
    history_fig = optuna.visualization.plot_optimization_history(study)
    history_fig.update_layout(title="Optimization History")
    history_fig.write_image(f"{config['model']['output_dir']}/{model_name}_optimization_history.png")

    # Plot parallel coordinate
    dimensions = []
    for param in study.best_trials[0].params:
        values = [trial.params[param] for trial in study.trials if param in trial.params]
        if isinstance(values[0], (int, float)):
            dimensions.append(
                dict(range = [min(values), max(values)],
                     label = param,
                     values = values)
            )
        elif isinstance(values[0], str):
            # For categorical parameters, we need a different approach
            unique_values = list(set(values))
            dimensions.append(
                dict(range = [0, len(unique_values) - 1],
                     tickvals = list(range(len(unique_values))),
                     ticktext = unique_values,
                     label = param,
                     values = [unique_values.index(v) for v in values])
            )

    parallel_fig = go.Figure(data=
        go.Parcoords(
            line = dict(color = [trial.value for trial in study.trials],
                        colorscale = 'Viridis',
                        showscale = True),
            dimensions = dimensions
        )
    )
    parallel_fig.update_layout(title="Parallel Coordinate Plot of Hyperparameters")
    parallel_fig.write_image(f"{config['model']['output_dir']}/{model_name}_parallel_coordinate.png")

    # Analyze and plot sensitivity
    sensitivity = analyze_hyperparameter_sensitivity(study)
    sensitivity_fig = plot_sensitivity_analysis(sensitivity)
    sensitivity_fig.write_image(f"{config['model']['output_dir']}/{model_name}_sensitivity_analysis.png")

    # Print sensitivity analysis results
    print(f"\nHyperparameter Sensitivity Analysis for {model_name}:")
    for param, sens in sensitivity:
        print(f"{param}: {sens:.4f}")


Hyperparameter Sensitivity Analysis for combined_study:
gradient_accumulation_steps: 0.6329
weight_decay: 0.5228
intent_loss_weight: 0.5163
warmup_steps: 0.3905
encoder_lr: 0.2217
decoder_lr: 0.1756
num_train_epochs: 0.1456
per_device_train_batch_size: 0.0577
slot_loss_weight: 0.0524


In [65]:
logger.info("Hyperparameter analysis complete. Plots saved in output directory.")

Hyperparameter analysis complete. Plots saved in output directory.


In [67]:
# Initialize the Accelerator
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(mixed_precision='no', kwargs_handlers=[ddp_kwargs])

In [99]:
# Initialize trainers with the best hyperparameters from the studies
trainers = {}

if 'combined_study' in studies and studies['combined_study'] is not None:
    # Start with the base config
    combined_config = config.copy()
    # Add the best parameters from the study
    combined_config.update(studies['combined_study'].best_params)
    combined_config['per_device_eval_batch_size'] = 1
    combined_config['max_grad_norm'] = 1

    # Add required parameters if they're not already present
    if 'num_intent_classes' not in combined_config:
        combined_config['num_intent_classes'] = 50  # Replace with the actual number of intent classes
    
    if 'num_slot_classes' not in combined_config:
        combined_config['num_slot_classes'] = 100  # Replace with the actual number of slot classes

    trainers['combined_afro_xlmr_llama'] = CombinedEncoderDecoderTrainer(
        encoder=models['afro-xlmr-large'],  # Afro-XLMR as encoder
        decoder=models['meta-llama/Llama-2-7b-hf'],  # LLaMA as decoder
        encoder_tokenizer=tokenizers['afro-xlmr-large'],
        decoder_tokenizer=tokenizers['meta-llama/Llama-2-7b-hf'],
        config=combined_config,
        accelerator=accelerator,
        batch_size=64  # or whatever batch size you prefer
    )
else:
    logger.warning("Study results for 'combined_study' not found or optimization failed. "
                   "CombinedEncoderDecoderTrainer will not be initialized.")

Initialized CombinedEncoderDecoderTrainer with config: {'model': {'names': ['afro-xlmr-large', 'meta-llama/Llama-2-7b-hf'], 'output_dir': './model_outputs'}, 'data': {'masakhane_dir': '../Datasets/Masakhane', 'ontonotes_dir': '../Datasets/OntoNotes_5.0', 'flores_dir': '../Datasets/FLORES-200', 'experiments_dir': '../Datasets/Experiments'}, 'training': {'batch_size': 8, 'learning_rate': '2e-5', 'num_epochs': 3, 'warmup_steps': 500, 'combined_study': {'encoder_lr': 2.994471116556308e-06, 'decoder_lr': 3.3477388089161037e-06, 'num_train_epochs': 2, 'per_device_train_batch_size': 1, 'weight_decay': 0.09304895986444242, 'warmup_steps': 487, 'gradient_accumulation_steps': 128, 'intent_loss_weight': 0.9761991671313265, 'slot_loss_weight': 0.43081143618503664}}, 'evaluation': {'metrics': ['accuracy', 'precision', 'recall', 'f1']}, 'dataset_split': {'train_ratio': 0.7, 'val_ratio': 0.15, 'test_ratio': 0.15}, 'hyperparameters': {'learning_rate_min': '1e-6', 'learning_rate_max': '1e-4', 'num_trai

In [103]:
# Perform evaluations
results = {
    'classification': {},
    'translation': {},
    'generation': {},
    'zero_shot': {},
    'code_switch': {},
    'intent_recognition': {}, 
    'slot_filling': {}, 
    'hyperparameter_studies': studies
}

In [108]:
# Train models with the best hyperparameters
for model_name, trainer in trainers.items():
    logger.info(f"Starting training for model: {model_name}")
    
    # Determine the correct dataset keys
    train_dataset_key = 'train'
    eval_dataset_key = 'eval'
    
    # Ensure the required datasets exist
    if train_dataset_key not in datasets['combined_afro_xlmr_llama'] or eval_dataset_key not in datasets['combined_afro_xlmr_llama']:
        logger.error(f"Required datasets not found for {model_name}. Skipping this model.")
        continue

    train_dataset = datasets['combined_afro_xlmr_llama'][train_dataset_key]
    eval_dataset = datasets['combined_afro_xlmr_llama'][eval_dataset_key]

    logger.info(f"Train dataset size: {len(train_dataset)}, Eval dataset size: {len(eval_dataset)}")
    
    try:
        # Train the model
        train_results = trainer.train(train_dataset, eval_dataset)
        
        # Log training results
        logger.info(f"Training completed for {model_name}")
        logger.info(f"Training results: {train_results}")
        
        # Save the trained model
        save_dir = os.path.join(config['model']['output_dir'], model_name)
        os.makedirs(save_dir, exist_ok=True)
        # trainer.save_model(save_dir)
        logger.info(f"Model saved to {save_dir}")
        
        # Evaluate the model
        eval_results = trainer.evaluate(eval_dataset)
        logger.info(f"Evaluation results for {model_name}: {eval_results}")
        
        # You might want to save these results to a file or database
        
    except Exception as e:
        logger.error(f"Error during training or evaluation of {model_name}: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        continue  # Move to the next model if there's an error

logger.info("Training process completed for all models.")

Starting training for model: combined_afro_xlmr_llama
Train dataset size: 1050, Eval dataset size: 225
Encoder parameters: 560,711,457
Decoder parameters: 6,738,415,616
Total parameters: 7,299,127,073
Encoder dtype: torch.bfloat16
Decoder dtype: torch.bfloat16
Training dtype: torch.bfloat16
GPU memory allocated: 59.00 GB
GPU memory cached: 59.18 GB
Validating Training dataset:
Batch 0 shapes:
  input_ids: torch.Size([1, 128])
  attention_mask: torch.Size([1, 128])
  labels: torch.Size([1, 128])
Batch 1 shapes:
  input_ids: torch.Size([1, 128])
  attention_mask: torch.Size([1, 128])
  labels: torch.Size([1, 128])
Batch 2 shapes:
  input_ids: torch.Size([1, 128])
  attention_mask: torch.Size([1, 128])
  labels: torch.Size([1, 128])
Batch 3 shapes:
  input_ids: torch.Size([1, 128])
  attention_mask: torch.Size([1, 128])
  labels: torch.Size([1, 128])
Batch 4 shapes:
  input_ids: torch.Size([1, 128])
  attention_mask: torch.Size([1, 128])
  labels: torch.Size([1, 128])
Finished validating 

Starting training process
CUDA Memory (Start of training) - Allocated: 54.95GB (Max: 70.33GB), Reserved: 55.12GB (Max: 71.27GB)
Total training steps: 2100


Starting train_step for batch 2
Starting train_step for batch 3
Starting train_step for batch 4
Starting train_step for batch 5
Starting train_step for batch 6
Starting train_step for batch 7
Starting train_step for batch 8
Starting train_step for batch 9
Starting train_step for batch 10
Starting train_step for batch 11
Starting train_step for batch 12
Starting train_step for batch 13
Starting train_step for batch 14
Starting train_step for batch 15
Starting train_step for batch 16
Starting train_step for batch 17
Starting train_step for batch 18
Starting train_step for batch 19
Starting train_step for batch 20
Starting train_step for batch 21
Starting train_step for batch 22
Starting train_step for batch 23
Starting train_step for batch 24
Starting train_step for batch 25
Starting train_step for batch 26
Starting train_step for batch 27
Starting train_step for batch 28
Starting train_step for batch 29
Starting train_step for batch 30
Starting train_step for batch 31
Starting train_ste

CUDA Memory (End of training) - Allocated: 54.95GB (Max: 70.33GB), Reserved: 55.12GB (Max: 71.27GB)
Training completed


Evaluating: Batch 10/225
Evaluating: Batch 20/225
Evaluating: Batch 30/225
Evaluating: Batch 40/225
Evaluating: Batch 50/225
Cleared GPU memory. Current allocated: 59.00 GB
Evaluating: Batch 60/225
Evaluating: Batch 70/225
Evaluating: Batch 80/225
Evaluating: Batch 90/225
Evaluating: Batch 100/225
Cleared GPU memory. Current allocated: 59.00 GB
Evaluating: Batch 110/225
Evaluating: Batch 120/225
Evaluating: Batch 130/225
Evaluating: Batch 140/225
Evaluating: Batch 150/225
Cleared GPU memory. Current allocated: 59.00 GB
Evaluating: Batch 160/225
Evaluating: Batch 170/225
Evaluating: Batch 180/225
Evaluating: Batch 190/225
Evaluating: Batch 200/225
Cleared GPU memory. Current allocated: 59.00 GB
Evaluating: Batch 210/225
Evaluating: Batch 220/225
Average evaluation loss: 10.053349
Intent recognition accuracy: 1.0000
Slot filling F1 score: 1.0000
Example 1:
  True Intent: 27, Predicted Intent: 27
  True Slots: [30]
  Predicted Slots: [30]
Example 2:
  True Intent: 27, Predicted Intent: 27

In [124]:
print(datasets['combined_afro_xlmr_llama']['benchmark'][0])

{'input_ids': tensor([     0,   3664, 111868,   1704,  14770,    208,  60657,  15623,  33416,
           156,    156,   7978,    842,   2866,  36155,   2387,     80,     75,
         38269,    151,  59085,  19915,   2527,    429,    708,  14770,    177,
         44190,   3613,  28821,   3584,     36,   8789,   6990,    156,   7978,
           842,   2866,  36155,   2387,     80,  16670,      2,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,   

In [149]:
print(stratified_datasets['experimental']['code_switch'])

                                                text  \
0    I want to fly from Nairobi to Nueva York mañana   
1  Reserva un vuelo from Nairobi to London por favor   
2      Book a flight desde Nairobi a París next week   
3             Quiero un hotel en Tokyo para mi viaje   
4              Find me a restaurante in Berlin bitte   

                                  label language      split  
0  O O O O O B-LOC O B-LOC I-LOC B-DATE      eng  zero_shot  
1             O O O O B-LOC O B-LOC O O      eng  zero_shot  
2   O O O O B-LOC O B-LOC B-DATE I-DATE      eng  zero_shot  
3                   O O O O B-LOC O O O      eng  zero_shot  
4                     O O O O O B-LOC O      eng  zero_shot  


In [156]:
# Evaluation for all models
for model_name, model in models.items():
    tokenizer = tokenizers['afro-xlmr-large']
    evaluator = evaluators['combined_afro_xlmr_llama']  # Assuming all evaluators are AfriCOMETEvaluator

    try:
         # Use the FLORES data that's already loaded
        if 'benchmark' in stratified_datasets:
            results['translation'][model_name] = evaluate_translation(model, tokenizer, stratified_datasets['benchmark'], evaluator)
        else:
            logger.warning(f"FLORES data not available for model: {model_name}")
        
        # Zero-shot evaluation
        if 'experimental' in stratified_datasets and 'zero_shot' in stratified_datasets['experimental']:
            zero_shot_data = stratified_datasets['experimental']['zero_shot']
            results['zero_shot'][model_name] = evaluate_code_switch(zero_shot_data, code_switch_classifier)
        else:                                    
            logger.info(f"No zero-shot data for model: {model_name}")
        
        # Code-switch evaluation
        if 'experimental' in stratified_datasets and 'code_switch' in stratified_datasets['experimental']:
            code_switch_data = stratified_datasets['experimental']['code_switch']
            results['code_switch'][model_name] = evaluate_zero_shot(code_switch_data, zero_shot_classifier)
        else:
            logger.info(f"No code-switch data for model: {model_name}")
        
        logger.info(f"Completed evaluation for {model_name}")
        
        logger.info(f"Completed evaluation for {model_name}")
    except Exception as e:
        logger.error(f"Error during evaluation of {model_name}: {str(e)}")
        continue  # Move to the next model if there's an error


FLORES-200 'text' dataset shape: (225,)
Error during evaluation of afro-xlmr-large: 'src_lang'
FLORES-200 'text' dataset shape: (225,)
Error during evaluation of meta-llama/Llama-2-7b-hf: 'src_lang'


In [157]:
print(results)

{'classification': {}, 'translation': {'afro-xlmr-large': {'translations': {}}, 'meta-llama/Llama-2-7b-hf': {'translations': {}}}, 'generation': {}, 'zero_shot': {}, 'code_switch': {}, 'intent_recognition': {}, 'slot_filling': {}, 'hyperparameter_studies': {'combined_study': <optuna.study.study.Study object at 0x7f8f262249d0>}}


In [79]:
# Summarize and log results
results_summary = summarize_results(results, config)
plot_results(results, config)
log_results_to_mlflow(results, config, best_params)

Summarizing results...
Evaluation Results Summary:
                   afro-xlmr-large meta-llama/Llama-2-7b-hf
translation                   None                     None
intent_recognition            None                     None
slot_filling                  None                     None
zero_shot                     None                     None
code_switch                   None                     None


NameError: name 'summary_df' is not defined

<Figure size 1200x800 with 0 Axes>

<Figure size 1200x800 with 0 Axes>

In [None]:
logger.info("Evaluation complete!")
print("Evaluation completed successfully. Results and visualizations have been saved and logged to MLflow.")

In [None]:
# Display results and visualizations
display(Image(filename=f"{config['model']['output_dir']}/overall_performance_heatmap.png"))

In [None]:
# Display hyperparameter optimization results
for model_name in config['model']['names']:
    print(f"\nHyperparameter Optimization Results for {model_name}:")
    display(Image(filename=f"{config['model']['output_dir']}/{model_name}_hyperparameter_importance.png"))
    display(Image(filename=f"{config['model']['output_dir']}/{model_name}_optimization_history.png"))
    display(Image(filename=f"{config['model']['output_dir']}/{model_name}_parallel_coordinate.png"))
    display(Image(filename=f"{config['model']['output_dir']}/{model_name}_sensitivity_analysis.png"))

In [None]:
print_results_summary(results_summary, best_params)

In [None]:
print("Evaluation notebook execution complete.")