In [34]:
# !pip install unbabel-comet

In [35]:
# !pip install matplotlib 

In [36]:
# !pip install seaborn

In [37]:
# !pip install mlflow --ignore-installed embedchain

In [38]:
# !pip install optuna-integration lightning 

In [39]:
# !pip install accelerate 

In [40]:
# !pip install torch transformers datasets peft 

In [41]:
# !pip install nltk 

In [42]:
# !pip install plotly

In [43]:
# !pip install -U kaleido

Install Vertex AI SDK

In [None]:
# !pip install --upgrade --user --quiet google-cloud-aiplatform

In [None]:
# Import necessary libraries and modules
import os
os.environ['GIT_PYTHON_REFRESH'] = 'quiet'  # Suppress Git warnings
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # Enable CUDA launch blocking for debugging
os.environ['TORCH_USE_CUDA_DSA'] = "1"  # Enable CUDA device-side assertions
os.environ['MLFLOW_FLATTEN_PARAMS'] = 'true' # Flatten parameters for logging
os.environ['WANDB_DISABLED'] = 'true'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # Suppress info messages

import sys
sys.path.append('../py')  # Add the parent directory to the Python path
import gc
import torch
# Report GPU memory only when a CUDA device is actually present, so this
# import cell also runs on CPU-only machines instead of raising.
if torch.cuda.is_available():
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Current GPU memory usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
else:
    print("CUDA is not available; skipping GPU memory report.")
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import mlflow
import optuna
import json
import time
import traceback
from IPython.display import Image, display
from accelerate import Accelerator, DistributedDataParallelKwargs
from packaging import version

from dataset_loader import DatasetLoader
from utils import set_seed, load_config, get_device, CustomDataset
from models.llama2_decoder import Llama2Decoder  # Import Llama2Decoder model
from models.afro_xlmr_large import AfroXLMRLarge  # Import AfroXLMRLarge model
from models.gemini import GeminiModel # Import Gemini model
from evaluators.africomet_evaluator import AfriCOMETEvaluator
from classifiers.zeroshot_classifier import ZeroShotClassifier
from classifiers.codeswitch_classifier import CodeSwitchClassifier
from classifiers.zeroshot_classifier_for_gemini import ZeroShotClassifierForGemini
from classifiers.codeswitch_classifier_for_gemini import CodeSwitchClassifierForGemini
from trainers.combined_encoder_decoder_trainer import CombinedEncoderDecoderTrainer
from trainers.gemini_trainer import GeminiTrainer
from hyperparameter_analysis import (plot_hyperparameter_importance, plot_study_optimization_history,
                                     plot_parallel_coordinate, analyze_hyperparameter_sensitivity,
                                     plot_sensitivity_analysis)

In [45]:
def setup_logging(config):
    """
    Configure the root logger from ``config['logging']`` and return a logger.

    The level name is resolved as an attribute of the ``logging`` module
    (e.g. ``"INFO"`` -> ``logging.INFO``), and records are written to the
    configured log file using a timestamp/name/level/message layout.

    Args:
        config (dict): Configuration containing a ``'logging'`` section with
            the keys ``'log_level'`` and ``'log_file'``.

    Returns:
        logging.Logger: Logger named after the current module.

    Raises:
        AttributeError: If ``log_level`` is not an attribute of ``logging``.
        KeyError: If a required configuration key is missing.
    """
    log_settings = config['logging']
    # Resolve the level first so an invalid name fails before any file is opened.
    level = getattr(logging, log_settings['log_level'])
    logging.basicConfig(
        filename=log_settings['log_file'],
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level,
    )
    module_logger = logging.getLogger(__name__)
    return module_logger

In [46]:
def clear_memory():
    """
    Release memory that is no longer referenced.

    Runs Python's garbage collector, asks PyTorch to empty its CUDA caching
    allocator, and then logs how much CUDA memory is still allocated (in GB).
    """
    gc.collect()
    torch.cuda.empty_cache()
    allocated_gb = torch.cuda.memory_allocated() / 1e9
    logging.info(f"Cleared GPU memory. Current allocated: {allocated_gb:.2f} GB")

In [47]:
def load_datasets(data_loader):
    """
    Load every dataset through the given loader, tolerating failures.

    Any exception raised by ``data_loader.load_datasets()`` is logged and
    converted into a ``None`` result instead of propagating to the caller.

    Args:
        data_loader (DatasetLoader): Object exposing a ``load_datasets()`` method.

    Returns:
        dict or None: Whatever ``data_loader.load_datasets()`` returns, or
        ``None`` if loading raised an exception.
    """
    try:
        logging.info("Loading and preparing datasets...")
        loaded = data_loader.load_datasets()
    except Exception as exc:
        logging.error(f"Error loading datasets: {str(exc)}")
        return None
    return loaded

In [48]:
def _snapshot_state_dict(module):
    """Return a detached CPU copy of ``module.state_dict()``.

    ``state_dict()`` hands back references to the live parameter tensors, so
    keeping it as-is would not preserve the weights once training continues.
    Copies are placed on the CPU to avoid doubling GPU memory usage.
    """
    snapshot = {}
    for name, value in module.state_dict().items():
        if isinstance(value, torch.Tensor):
            snapshot[name] = value.detach().cpu().clone()
        else:
            snapshot[name] = value
    return snapshot


def objective_combined(trial, encoder, decoder, encoder_tokenizer, decoder_tokenizer, train_dataset, eval_dataset, evaluator):
    """
    Objective function for hyperparameter optimization of the combined Afro-XLMR and LLaMA model, including intent recognition and slot filling tasks.

    Hyperparameter ranges, the output/cache directories, the seed and the
    device are read from the module-level ``config`` dictionary (it is not a
    parameter of this function), so ``config`` must be defined before a study
    calls this objective.

    Args:
        trial (optuna.trial.Trial): The Optuna trial object used for hyperparameter suggestions.
        encoder (torch.nn.Module): The Afro-XLMR encoder model to be trained and evaluated.
        decoder (torch.nn.Module): The LLaMA decoder model to be trained and evaluated.
        encoder_tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for the encoder.
        decoder_tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for the decoder.
        train_dataset (Dataset): The dataset used for training the model.
        eval_dataset (Dataset): The dataset used for evaluating the model.
        evaluator (Evaluator): The evaluator object used to compute evaluation metrics.

    Returns:
        float: ``translation_score - (intent_accuracy + slot_f1) / 2`` computed
        from the last evaluated epoch, or ``float('inf')`` when the
        hyperparameter setup or an evaluation step fails, or when no epoch
        was evaluated.
    """
    # Initialize Accelerator
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(mixed_precision='no', kwargs_handlers=[ddp_kwargs])

    # Hyperparameter suggestions based on config
    try:
        hyperparams = config.get('hyperparameters', {})

        # Learning rates
        lr_min = float(hyperparams.get('learning_rate_min', 1e-6))
        lr_max = float(hyperparams.get('learning_rate_max', 1e-4))

        if lr_min >= lr_max:
            logging.error(f"Invalid learning rate range: min ({lr_min}) must be less than max ({lr_max})")
            return float('inf')

        encoder_lr = trial.suggest_float('encoder_lr', lr_min, lr_max, log=True)
        decoder_lr = trial.suggest_float('decoder_lr', lr_min, lr_max, log=True)

        # Number of training epochs
        num_train_epochs = trial.suggest_int('num_train_epochs',
                                             int(hyperparams.get('num_train_epochs_min', 1)),
                                             int(hyperparams.get('num_train_epochs_max', 3)))

        per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size',
                                                                hyperparams.get('batch_sizes', [1, 2, 4, 8]))

        weight_decay = trial.suggest_float('weight_decay',
                                           float(hyperparams.get('weight_decay_min', 0.01)),
                                           float(hyperparams.get('weight_decay_max', 0.1)),
                                           log=True)

        warmup_steps = trial.suggest_int('warmup_steps',
                                         int(hyperparams.get('warmup_steps_min', 0)),
                                         int(hyperparams.get('warmup_steps_max', 1000)))

        gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps',
                                                                hyperparams.get('gradient_accumulation_steps', [1, 2, 4, 8]))

        # Add hyperparameters for intent and slot tasks
        intent_loss_weight = trial.suggest_float('intent_loss_weight', 0.1, 1.0)
        slot_loss_weight = trial.suggest_float('slot_loss_weight', 0.1, 1.0)

        # Create trial config
        trial_config = {
            "encoder_lr": encoder_lr,
            "decoder_lr": decoder_lr,
            "num_train_epochs": num_train_epochs,
            "per_device_train_batch_size": per_device_train_batch_size,
            "per_device_eval_batch_size": per_device_train_batch_size,
            "weight_decay": weight_decay,
            "warmup_steps": warmup_steps,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "fp16": False,
            "evaluation_strategy": "steps",
            "eval_steps": 200,
            "save_steps": 200,
            "logging_steps": 50,
            "max_grad_norm": 1.0,
            "output_dir": config['model']['output_dir'],
            "seed": config['seed'],
            "device": config['device'],
            "cache_dir": config['cache']['dir'],
            "gradient_checkpointing": True,
            "intent_loss_weight": intent_loss_weight,
            "slot_loss_weight": slot_loss_weight,
            "num_intent_classes": 50,
            "num_slot_classes": 100
        }

        logging.info(f"Trial {trial.number}: Starting with hyperparameters: {trial_config}")

    except ValueError as e:
        logging.error(f"Error converting config values to numeric types: {str(e)}")
        return float('inf')
    except Exception as e:
        logging.error(f"Error in hyperparameter suggestion: {str(e)}")
        return float('inf')

    # Enable gradient checkpointing for both encoder and decoder
    encoder.gradient_checkpointing_enable()
    decoder.gradient_checkpointing_enable()

    # Initialize the trainer
    # NOTE(review): batch_size=64 is fixed here while the trial also suggests
    # per_device_train_batch_size; confirm which one the trainer actually uses.
    trainer = CombinedEncoderDecoderTrainer(encoder, decoder, encoder_tokenizer, decoder_tokenizer, config=trial_config, accelerator=accelerator, batch_size=64)

    # Prepare models and optimizers
    trainer.encoder, trainer.decoder, trainer.projection, trainer.encoder_optimizer, trainer.decoder_optimizer = accelerator.prepare(
        trainer.encoder, trainer.decoder, trainer.projection, trainer.encoder_optimizer, trainer.decoder_optimizer
    )

    # Log dataset sizes
    logging.info(f"Train dataset size: {len(train_dataset)}")
    logging.info(f"Eval dataset size: {len(eval_dataset)}")

    # Validate datasets
    trainer.validate_dataset(train_dataset, "Training")
    trainer.validate_dataset(eval_dataset, "Evaluation")

    best_metric = float('inf')  # Initialize for early stopping
    best_model_state = None  # Filled with a weight snapshot whenever eval_loss improves
    eval_results = None  # Stays None if the epoch loop never runs
    patience_counter = 0
    patience_threshold = trainer.patience  # Use the patience from the trainer configuration

    # Train the combined model
    for epoch in range(num_train_epochs):
        logging.info(f"Starting epoch {epoch + 1}/{num_train_epochs}")
        trainer.train(train_dataset, eval_dataset)

        # Clear cache after training
        torch.cuda.empty_cache()

        # Evaluate the model after each epoch
        try:
            # Use the evaluate() method here
            eval_metrics = trainer.evaluate(eval_dataset)

            # Generate translations for FLORES evaluation
            generated_results = trainer.generate_batch(eval_dataset)
            translated_texts, _, _ = zip(*generated_results)  # We only need translated texts here

            # Extract source texts and reference texts for FLORES evaluation
            source_texts = [encoder_tokenizer.decode(eval_dataset[i]['input_ids'], skip_special_tokens=True) for i in range(len(eval_dataset))]
            reference_texts = [encoder_tokenizer.decode(eval_dataset[i]['labels'], skip_special_tokens=True) for i in range(len(eval_dataset))]

            # Evaluate translation with FLORES
            print("Evaluating translation results with Africomet")
            translation_results = evaluator.evaluate(source_texts, translated_texts, reference_texts)
            print("Complete evaluating translation results")

            # Combine all evaluation results
            eval_results = {
                'translation_score': translation_results.get('average_score', 0),
                'intent_accuracy': eval_metrics['intent_accuracy'],
                'slot_f1': eval_metrics['slot_f1'],
                'eval_loss': eval_metrics['eval_loss']
            }

        except Exception as e:
            logging.error(f"Error during evaluation: {str(e)}")
            logging.error(f"Traceback: {traceback.format_exc()}")
            return float('inf')

        logging.info(f"Epoch {epoch + 1}/{num_train_epochs} Evaluation results: {eval_results}")

        # Early stopping logic
        current_metric = eval_results['eval_loss']  # Use eval_loss for early stopping
        if current_metric < best_metric:
            best_metric = current_metric
            patience_counter = 0
            # Save a real copy of the best weights: state_dict() alone only
            # returns references that later epochs would overwrite in place.
            best_model_state = {
                'encoder': _snapshot_state_dict(trainer.encoder),
                'decoder': _snapshot_state_dict(trainer.decoder),
                'projection': _snapshot_state_dict(trainer.projection)
            }
        else:
            patience_counter += 1
            logging.info(f"Early stopping patience: {patience_counter}/{patience_threshold}")

        if patience_counter >= patience_threshold:
            logging.info("Early stopping triggered.")
            break

    # Load the best model state if one was recorded
    if best_model_state is not None:
        trainer.encoder.load_state_dict(best_model_state['encoder'])
        trainer.decoder.load_state_dict(best_model_state['decoder'])
        trainer.projection.load_state_dict(best_model_state['projection'])
        logging.info("Loaded best model state")

    # Without at least one evaluated epoch there is nothing to score.
    if eval_results is None:
        logging.error(f"No evaluation results produced (num_train_epochs={num_train_epochs})")
        return float('inf')

    # Set trial user attributes for logging
    # NOTE(review): these metrics come from the last evaluated epoch, which is
    # not necessarily the epoch whose weights were restored above.
    trial.set_user_attr('intent_accuracy', eval_results['intent_accuracy'])
    trial.set_user_attr('slot_f1', eval_results['slot_f1'])

    # Return the combined metric for optimization (lower is better)
    # NOTE(review): the study minimises this value, yet translation_score
    # enters with a positive sign; if a higher translation score is better,
    # the sign of that term should be reviewed.
    return eval_results['translation_score'] - (eval_results['intent_accuracy'] + eval_results['slot_f1']) / 2

In [49]:
def run_combined_optimization(encoder, decoder, encoder_tokenizer, decoder_tokenizer, datasets, config, evaluator):
    """
    Run hyperparameter optimization for the combined Afro-XLMR and LLaMA model with improved efficiency, including intent recognition and slot filling tasks.

    Exactly ``config['hyperparameters']['n_trials']`` Optuna trials are run,
    one per loop iteration, so that timing, GPU memory and per-trial results
    can be logged after every trial. Memory is cleared before and after each
    trial, and everything runs inside a single MLflow run in the
    ``"combined_optimization"`` experiment.

    Args:
        encoder: Encoder model; must expose ``gradient_checkpointing_enable()``.
        decoder: Decoder model; must expose ``gradient_checkpointing_enable()``.
        encoder_tokenizer: Tokenizer forwarded to ``objective_combined``.
        decoder_tokenizer: Tokenizer forwarded to ``objective_combined``.
        datasets (dict): Must contain the keys ``'train'`` and ``'eval'``.
        config (dict): Must contain ``config['hyperparameters']['n_trials']``.
        evaluator: Evaluator forwarded to ``objective_combined``.

    Returns:
        optuna.study.Study or None: The study, or ``None`` if an error
        occurred during optimization.
    """
    mlflow.end_run()  # End any existing runs

    logging.info("Starting hyperparameter optimization for combined Afro-XLMR and LLaMA (including intent and slot tasks)")
    log_gpu_memory("Before optimization")

    # Enable gradient checkpointing for both models
    encoder.gradient_checkpointing_enable()
    decoder.gradient_checkpointing_enable()

    # Check PyTorch version
    pytorch_version = version.parse(torch.__version__)
    logging.info(f"PyTorch version: {pytorch_version}")

    # Prepare the objective function with fixed arguments and memory management
    def memory_managed_objective(trial):
        clear_memory()
        try:
            return objective_combined(
                trial, encoder, decoder, encoder_tokenizer, decoder_tokenizer,
                datasets['train'], datasets['eval'], evaluator
            )
        finally:
            # Free memory even when the trial raises (Optuna catches the error).
            clear_memory()

    total_delay = 0
    n_trials = config['hyperparameters']['n_trials']

    try:
        # Set up MLflow; set_experiment creates the experiment when it is missing.
        experiment_name = "combined_optimization"
        mlflow.set_experiment(experiment_name)

        with mlflow.start_run(run_name="optimization_combined_with_intent_slot"):
            study = optuna.create_study(direction='minimize')

            for trial_num in range(n_trials):
                start_time = time.time()

                # One trial per iteration: passing n_trials here would run
                # n_trials * n_trials trials in total.
                study.optimize(
                    memory_managed_objective,
                    n_trials=1,
                    timeout=3600,
                    catch=(Exception,),
                    n_jobs=1
                )

                trial_time = time.time() - start_time

                if trial_num < n_trials - 1:  # Don't measure delay after the last trial
                    delay_start = time.time()
                    clear_memory()
                    delay = time.time() - delay_start
                    total_delay += delay
                    logging.info(f"Delay after trial {trial_num + 1}: {delay:.2f} seconds")

                logging.info(f"Trial {trial_num + 1} completed in {trial_time:.2f} seconds")
                log_gpu_memory(f"After trial {trial_num + 1}")

                # Log intermediate results
                last_trial = study.trials[-1] if study.trials else None
                if last_trial is not None and last_trial.state == optuna.trial.TrialState.COMPLETE:
                    logging.info(f"Trial {trial_num + 1} results:")
                    # The trial value is whatever objective_combined returned.
                    logging.info(f"  Translation score: {last_trial.value:.4f}")

                    intent_accuracy = last_trial.user_attrs.get('intent_accuracy', 'N/A')
                    slot_f1 = last_trial.user_attrs.get('slot_f1', 'N/A')

                    logging.info(f"  Intent accuracy: {intent_accuracy if isinstance(intent_accuracy, str) else f'{intent_accuracy:.4f}'}")
                    logging.info(f"  Slot F1 score: {slot_f1 if isinstance(slot_f1, str) else f'{slot_f1:.4f}'}")

            # study.best_trial raises ValueError when nothing completed, so
            # check for completed trials explicitly instead of truth-testing it.
            completed_trials = [
                t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
            ]
            if completed_trials:
                best_trial = study.best_trial
                log_best_params(study)

                # Log best results for all tasks
                logging.info("Best trial results:")
                logging.info(f"  Translation score: {best_trial.value:.4f}")

                best_intent_accuracy = best_trial.user_attrs.get('intent_accuracy', 'N/A')
                best_slot_f1 = best_trial.user_attrs.get('slot_f1', 'N/A')

                logging.info(f"  Intent accuracy: {best_intent_accuracy if isinstance(best_intent_accuracy, str) else f'{best_intent_accuracy:.4f}'}")
                logging.info(f"  Slot F1 score: {best_slot_f1 if isinstance(best_slot_f1, str) else f'{best_slot_f1:.4f}'}")
            else:
                logging.warning("No completed trials found.")

    except optuna.exceptions.OptunaError as e:
        logging.error(f"Optuna error during hyperparameter optimization: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Error during hyperparameter optimization: {str(e)}")
        logging.exception("Exception details:")
        return None
    finally:
        avg_delay = total_delay / (n_trials - 1) if n_trials > 1 else 0
        logging.info(f"Average delay between trials: {avg_delay:.2f} seconds")
        clear_memory()

    return study

For testing purposes only: a single-trial variant of `run_combined_optimization`, kept commented out below.

In [50]:
# def run_combined_optimization(encoder, decoder, encoder_tokenizer, decoder_tokenizer, datasets, config, evaluator):
#     """
#     Run a single trial with a small batch for testing purposes.
#     """
#     mlflow.end_run()  # End any existing runs
    
#     logging.info("Starting single trial test for combined Afro-XLMR and LLaMA (including intent and slot tasks)")
#     log_gpu_memory("Before optimization")

#     # Enable gradient checkpointing for both models
#     encoder.gradient_checkpointing_enable()
#     decoder.gradient_checkpointing_enable()

#     # Prepare the objective function with fixed arguments and memory management
#     def memory_managed_objective(trial):
#         clear_memory()
#         result = objective_combined(
#             trial, encoder, decoder, encoder_tokenizer, decoder_tokenizer,
#             datasets['train'], datasets['eval'], evaluator
#         )
#         clear_memory()
#         return result

#     try:
#         # Set up MLflow
#         experiment_name = "combined_optimization_test"
#         try:
#             experiment_id = mlflow.create_experiment(experiment_name)
#         except mlflow.exceptions.MlflowException:
#             experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
        
#         mlflow.set_experiment(experiment_name)
        
#         with mlflow.start_run(run_name="single_trial_test"):
#             study = optuna.create_study(direction='minimize')

#             # Run a single trial
#             study.optimize(
#                 memory_managed_objective,
#                 n_trials=1,
#                 timeout=3600,
#                 catch=(Exception,),
#                 n_jobs=1
#             )

#             if study.trials[-1].state == optuna.trial.TrialState.COMPLETE:
#                 last_trial = study.trials[-1]
#                 logging.info("Trial results:")
#                 logging.info(f"  Translation score: {last_trial.value:.4f}")
#                 intent_accuracy = last_trial.user_attrs.get('intent_accuracy', 'N/A')
#                 slot_f1 = last_trial.user_attrs.get('slot_f1', 'N/A')
#                 logging.info(f"  Intent accuracy: {intent_accuracy if isinstance(intent_accuracy, str) else f'{intent_accuracy:.4f}'}")
#                 logging.info(f"  Slot F1 score: {slot_f1 if isinstance(slot_f1, str) else f'{slot_f1:.4f}'}")
#             else:
#                 logging.warning("Trial was not completed successfully.")

#     except Exception as e:
#         logging.error(f"Error during single trial test: {str(e)}")
#         logging.exception("Exception details:")
#         return None
#     finally:
#         clear_memory()
    
#     return study

In [51]:
def log_gpu_memory(stage):
    """
    Log GPU memory figures (in GB) for the current CUDA device.

    Args:
        stage (str): Label prefixed to the log line to identify when the
            measurement was taken.

    The reported "Available" value is ``total - reserved``: memory reserved by
    PyTorch's caching allocator already contains the allocated memory, so
    subtracting both would count the allocated part twice. When CUDA is not
    available, a single informational line is logged instead.
    """
    if not torch.cuda.is_available():
        logging.info("CUDA is not available. Cannot log GPU memory.")
        return

    current_device = torch.cuda.current_device()
    total_memory = torch.cuda.get_device_properties(current_device).total_memory
    allocated_memory = torch.cuda.memory_allocated(current_device)
    reserved_memory = torch.cuda.memory_reserved(current_device)
    # Reserved memory is a superset of allocated memory.
    available_memory = total_memory - reserved_memory
    logging.info(f"{stage} - GPU Memory (GB): "
                f"Total: {total_memory / 1e9:.2f}, "
                f"Allocated: {allocated_memory / 1e9:.2f}, "
                f"Reserved: {reserved_memory / 1e9:.2f}, "
                f"Available: {available_memory / 1e9:.2f}")



In [52]:
def log_best_params(study):
    """
    Record the study's best hyperparameters and best value in MLflow.

    Each best parameter is logged as an MLflow param named ``best_<name>``,
    followed by the metric ``best_score`` holding ``study.best_value``.

    Args:
        study: Study object exposing ``best_params`` and ``best_value``.
    """
    winning_params = study.best_params
    for name in winning_params:
        mlflow.log_param(f"best_{name}", winning_params[name])
    mlflow.log_metric("best_score", study.best_value)



In [53]:
def save_results(model_name, study, config):
    """
    Write the best parameters, value and trial number of a study to JSON.

    The file is stored as
    ``<output_dir>/optimization_results/<model_name>_optimization_results.json``
    where ``output_dir`` is ``config['model']['output_dir']``; the directory is
    created when it does not exist.

    Args:
        model_name (str): Prefix used for the result file name.
        study: Study exposing ``best_params``, ``best_value`` and ``best_trial.number``.
        config (dict): Configuration containing ``config['model']['output_dir']``.
    """
    results_dir = os.path.join(config['model']['output_dir'], 'optimization_results')
    os.makedirs(results_dir, exist_ok=True)

    payload = {}
    payload['best_params'] = study.best_params
    payload['best_value'] = study.best_value
    payload['best_trial'] = study.best_trial.number

    target_path = os.path.join(results_dir, f"{model_name}_optimization_results.json")
    with open(target_path, 'w') as handle:
        json.dump(payload, handle, indent=2)

In [54]:
def print_best_trial_info(trial):
    """
    Print a human-readable report for a trial.

    The report lists the trial's value, each entry of ``trial.params`` on its
    own line, the trial number, and the start/completion timestamps.

    Args:
        trial: Trial object exposing ``value``, ``params``, ``number``,
            ``datetime_start`` and ``datetime_complete``.
    """
    print(
        "\nBest Trial Information:",
        f"  Value: {trial.value}",
        "  Params:",
        sep="\n",
    )
    for name, setting in trial.params.items():
        print(f"    {name}: {setting}")
    print(
        f"  Trial number: {trial.number}",
        f"  DateTime start: {trial.datetime_start}",
        f"  DateTime complete: {trial.datetime_complete}",
        sep="\n",
    )

In [58]:
def safe_call(obj, method_name, *args, **kwargs):
    """
    Invoke a method, looked up by name, on an object or on the first item of a tuple.

    Parameters:
    obj (object or tuple): Target of the call. When a tuple is given, its
                           first element is used as the target instead.
    method_name (str): Name of the method to look up on the target.
    *args: Positional arguments forwarded to the method.
    **kwargs: Keyword arguments forwarded to the method.

    Returns:
    result: Whatever the invoked method returns.

    Raises:
    AttributeError: If the target has no attribute called ``method_name``.
    """
    # Tuples are unwrapped to their first element before the lookup.
    target = obj[0] if isinstance(obj, tuple) else obj
    bound_method = getattr(target, method_name)
    return bound_method(*args, **kwargs)

In [59]:
def analyze_model_requirements(encoder, decoder, encoder_tokenizer, decoder_tokenizer):
    """
    Analyzes and logs the requirements for the encoder and decoder models, as well as their tokenizers.

    Parameters:
    encoder (object): The encoder model; its ``config`` must expose ``hidden_size`` and ``max_position_embeddings``.
    decoder (object or tuple): The decoder model, or a tuple whose first element is the decoder.
    encoder_tokenizer (object or tuple): Callable tokenizer for the encoder, or a tuple whose first element is the tokenizer.
    decoder_tokenizer (object or tuple): Callable tokenizer for the decoder, or a tuple whose first element is the tokenizer.

    Returns:
    dict: A dictionary containing:
        - 'encoder_input_shapes': Shapes of the tokenized sample for the encoder, or None if tokenization failed.
        - 'expected_encoder_output_shape': ``(1, max_position_embeddings, hidden_size)`` from the encoder config.
        - 'decoder_input_shapes': Shapes of the tokenized sample for the decoder, or None if tokenization failed.
        - 'expected_decoder_input_shape': ``(1, max_position_embeddings)`` from the decoder config.
        - 'decoder_generate_params': Tuple with the parameter names of the decoder's `generate()` method
          (without ``self``), or None if the signature could not be inspected.

    Notes:
    Errors raised while tokenizing the sample text or while inspecting `generate()` are logged and
    reported as None in the result instead of being raised.
    """
    # Local import keeps this block self-contained; only needed for signature inspection.
    import inspect

    def first_if_tuple(component, label):
        """Return the first element of a tuple component (logging its length), else the component."""
        if isinstance(component, tuple):
            logging.info(f"{label} is a tuple of length {len(component)}")
            return component[0]
        return component

    # Log the encoder model class name
    logging.info(f"Encoder model: {encoder.__class__.__name__}")

    # Unwrap components that were passed as tuples and log their classes
    decoder = first_if_tuple(decoder, "Decoder")
    logging.info(f"Decoder model: {decoder.__class__.__name__}")

    encoder_tokenizer = first_if_tuple(encoder_tokenizer, "Encoder tokenizer")
    logging.info(f"Encoder tokenizer: {encoder_tokenizer.__class__.__name__}")

    decoder_tokenizer = first_if_tuple(decoder_tokenizer, "Decoder tokenizer")
    logging.info(f"Decoder tokenizer: {decoder_tokenizer.__class__.__name__}")

    # Results that stay None when the corresponding step fails
    encoder_input_shapes = None
    decoder_input_shapes = None
    generate_params = None

    # Analyze and log the encoder's configuration
    logging.info("Analyzing encoder requirements:")
    encoder_config = encoder.config
    logging.info(f"Encoder hidden size: {encoder_config.hidden_size}")
    logging.info(f"Encoder max position embeddings: {encoder_config.max_position_embeddings}")

    # Try to tokenize a sample text using the encoder tokenizer and log input shapes
    sample_text = "This is a sample input text."
    try:
        encoder_inputs = encoder_tokenizer(sample_text, return_tensors="pt")
        logging.info("Encoder input shapes:")
        for key, value in encoder_inputs.items():
            logging.info(f"  {key}: {value.shape}")
        encoder_input_shapes = {k: v.shape for k, v in encoder_inputs.items()}
    except Exception as e:
        logging.error(f"Error in encoder tokenization: {str(e)}")

    # Analyze and log the decoder's configuration
    logging.info("\nAnalyzing decoder requirements:")
    decoder_config = decoder.config
    logging.info(f"Decoder hidden size: {decoder_config.hidden_size}")
    logging.info(f"Decoder max position embeddings: {decoder_config.max_position_embeddings}")

    # Try to tokenize a sample text using the decoder tokenizer and log input shapes
    try:
        decoder_inputs = decoder_tokenizer(sample_text, return_tensors="pt")
        logging.info("Decoder input shapes:")
        for key, value in decoder_inputs.items():
            logging.info(f"  {key}: {value.shape}")
        decoder_input_shapes = {k: v.shape for k, v in decoder_inputs.items()}
    except Exception as e:
        logging.error(f"Error in decoder tokenization: {str(e)}")

    # Attempt to log the parameters of the decoder's generate() method.
    # inspect.signature reports only real parameters; __code__.co_varnames
    # would also list local variables and `self`.
    logging.info("\nDecoder generate() method parameters:")
    try:
        generate_params = tuple(inspect.signature(decoder.generate).parameters)
        logging.info(f"Parameters: {generate_params}")
    except Exception as e:
        logging.error(f"Error accessing generate method: {str(e)}")

    # Determine and log the expected shape of the encoder's output
    expected_encoder_output_shape = (1, encoder_config.max_position_embeddings, encoder_config.hidden_size)
    logging.info(f"\nExpected encoder output shape: {expected_encoder_output_shape}")

    # Determine and log the expected shape of the decoder's input
    expected_decoder_input_shape = (1, decoder_config.max_position_embeddings)
    logging.info(f"Expected decoder input shape: {expected_decoder_input_shape}")

    # Return relevant shapes and parameters as a dictionary
    return {
        "encoder_input_shapes": encoder_input_shapes,
        "expected_encoder_output_shape": expected_encoder_output_shape,
        "decoder_input_shapes": decoder_input_shapes,
        "expected_decoder_input_shape": expected_decoder_input_shape,
        "decoder_generate_params": generate_params
    }

In [60]:
def summarize_results(results, config):
    """
    Summarize evaluation results.

    Builds one summary entry per model listed in ``config['model']['names']``,
    writes the summary to ``evaluation_results_summary.csv`` and the raw
    results to ``all_results.txt`` inside ``config['model']['output_dir']``
    (created if missing), and stores the summary under ``results['summary']``.

    Args:
        results (dict): Results keyed by task name ('translation',
            'intent_recognition', 'slot_filling', and optionally 'zero_shot'
            and 'code_switch'), each mapping model names to that task's
            result. A missing task is reported as None for every model.
        config (dict): A dictionary containing configuration information.

    Returns:
        pd.DataFrame: A summary of evaluation results (one column per model).
    """
    logging.info("Summarizing results...")

    def task_results(task_name):
        """Return the per-model mapping for a task, or an empty dict if absent."""
        return results.get(task_name) or {}

    def accuracy_of(task_name, model_name):
        """Return the 'accuracy' entry for a model, or None if the task is absent."""
        if task_name not in results:
            return None
        return results[task_name].get(model_name, {}).get('accuracy')

    summary = {}
    for model_name in config['model']['names']:
        translation_entry = task_results('translation').get(model_name)
        summary[model_name] = {
            'translation': translation_entry,
            'intent_recognition': task_results('intent_recognition').get(model_name),
            'slot_filling': task_results('slot_filling').get(model_name),
            'zero_shot': accuracy_of('zero_shot', model_name),
            'code_switch': accuracy_of('code_switch', model_name),
        }

        # Add FLORES results if available; the entry may also be None or a
        # plain score, in which case there is nothing to look up.
        if isinstance(translation_entry, dict) and 'flores' in translation_entry:
            summary[model_name]['flores_translation'] = translation_entry['flores'].get('average_score')

    summary_df = pd.DataFrame(summary)
    logging.info("Evaluation Results Summary:")
    logging.info(summary_df)

    # Save results; make sure the output directory exists first, as save_results does.
    output_dir = config['model']['output_dir']
    os.makedirs(output_dir, exist_ok=True)
    summary_df.to_csv(os.path.join(output_dir, "evaluation_results_summary.csv"))

    with open(os.path.join(output_dir, "all_results.txt"), 'w') as f:
        f.write(str(results))

    # Attach the summary to results after the raw dump so the text file
    # contains only the original task results.
    results['summary'] = summary

    return summary_df

In [61]:
def plot_results(results, config):
    """
    Plot evaluation results and save the figures as PNG files.

    For every model in ``config['model']['names']`` the following figures are
    written when the corresponding results are present: a bar chart of
    translation scores, a histogram of generation perplexities, a heatmap of
    the intent-recognition confusion matrix and a bar chart of slot-filling
    F1 scores. Finally a heatmap comparing all models across tasks is written
    to ``overall_performance_heatmap.png``.

    Args:
        results (dict): Results keyed by task name, then by model name. Tasks
            missing from the dict are treated as having no results.
        config (dict): Configuration; ``config['model']['names']`` and
            ``config['model']['output_dir']`` are read.
    """
    output_dir = config['model']['output_dir']

    def task_results(task):
        # A task that was never evaluated may be absent from `results`.
        return results.get(task) or {}

    def save_and_close(filename):
        path = f"{output_dir}/{filename}"
        # Model names may contain "/" (e.g. "meta-llama/Llama-2-7b-hf"), which
        # places the file in a sub-directory of output_dir; make sure it exists.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    translation = task_results('translation')
    generation = task_results('generation')
    intent_recognition = task_results('intent_recognition')
    slot_filling = task_results('slot_filling')

    for model_name in config['model']['names']:
        # Every plot opens (and closes) its own figure so that no figure is
        # leaked when a task is missing and all plots share the same size.
        if model_name in translation:
            plt.figure(figsize=(12, 8))
            scores = translation[model_name]
            plt.bar(list(scores.keys()), list(scores.values()))
            plt.title(f"Translation Scores - {model_name}")
            save_and_close(f"{model_name}_translation_scores.png")

        if model_name in generation:
            plt.figure(figsize=(12, 8))
            plt.hist(generation[model_name]['perplexities'], bins=20)
            plt.title(f"Perplexity Distribution - {model_name}")
            plt.xlabel("Perplexity")
            plt.ylabel("Frequency")
            save_and_close(f"{model_name}_perplexity_distribution.png")

        # Plot intent recognition confusion matrix
        if model_name in intent_recognition:
            plt.figure(figsize=(12, 8))
            sns.heatmap(intent_recognition[model_name]['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
            plt.title(f"Intent Recognition Confusion Matrix - {model_name}")
            save_and_close(f"{model_name}_intent_confusion_matrix.png")

        # Plot slot filling F1 scores
        if model_name in slot_filling:
            plt.figure(figsize=(12, 8))
            f1_scores = slot_filling[model_name]['f1_scores']
            plt.bar(list(f1_scores.keys()), list(f1_scores.values()))
            plt.title(f"Slot Filling F1 Scores - {model_name}")
            save_and_close(f"{model_name}_slot_f1_scores.png")

    def inverse_perplexity(model):
        # Lower perplexity is better, so the inverse is plotted. A missing (or
        # zero) perplexity scores 0 like every other missing metric, instead
        # of the best possible value 1 or a ZeroDivisionError.
        perplexity = generation.get(model, {}).get('average_perplexity')
        return 1 / perplexity if perplexity else 0

    # Plot overall performance comparison
    plt.figure(figsize=(12, 6))
    performance_data = {
        model: {
            'Classification': task_results('classification').get(model, {}).get('accuracy', 0),
            'Translation': translation.get(model, {}).get('average_score', 0),
            'Generation': inverse_perplexity(model),
            'Zero-shot': task_results('zero_shot').get(model, {}).get('accuracy', 0),
            'Code-switch': task_results('code_switch').get(model, {}).get('accuracy', 0),
            'Intent Recognition': intent_recognition.get(model, {}).get('accuracy', 0),
            'Slot Filling': slot_filling.get(model, {}).get('f1_score', 0)
        } for model in config['model']['names']
    }
    df = pd.DataFrame(performance_data).T
    sns.heatmap(df, annot=True, cmap='YlGnBu')
    plt.title("Model Performance Across Tasks")
    save_and_close("overall_performance_heatmap.png")

In [62]:
def _log_numeric_metrics(name, value):
    """Log ``value`` as an MLflow metric; recurse into dicts, skip non-numeric values."""
    if value is None:
        return
    if isinstance(value, dict):
        for key, nested in value.items():
            _log_numeric_metrics(f"{name}_{key}", nested)
        return
    try:
        numeric = float(value)
    except (TypeError, ValueError):
        # Lists, matrices, free text etc. cannot be stored as a metric.
        logging.debug(f"Skipping non-numeric metric {name}")
        return
    mlflow.log_metric(name, numeric)


def _log_artifact_if_exists(path):
    """Log ``path`` as an MLflow artifact, warning instead of failing when it is missing."""
    if os.path.exists(path):
        mlflow.log_artifact(path)
    else:
        logging.warning(f"Artifact not found, skipping: {path}")


def log_results_to_mlflow(results, config, best_params):
    """
    Log results to MLflow.

    Inside a single MLflow run this logs the best hyperparameters, the model
    names, every numeric value found in ``results['summary']`` and the result
    files and plots found in ``config['model']['output_dir']``.

    Summary entries may be nested dicts (per-task result dicts); their numeric
    leaves are logged as ``<model>_<task>_<key>`` and non-numeric leaves are
    skipped. Artifacts that do not exist are skipped with a warning so that a
    single missing plot does not abort the run.

    Args:
        results (dict): Results from all evaluation tasks. ``results['summary']``
            (written by ``summarize_results``) and, when present,
            ``results['hyperparameter_studies']`` are read.
        config (dict): A dictionary containing configuration information.
        best_params (dict): Maps a model/study name to its dict of best
            hyperparameters.
    """
    output_dir = config['model']['output_dir']

    with mlflow.start_run():
        # Log hyperparameters for each model
        for model_name, params in best_params.items():
            for param, value in params.items():
                mlflow.log_param(f"{model_name}_{param}", value)

        # Log model information
        for model_name in config['model']['names']:
            mlflow.log_param(f"{model_name}_model", model_name)

        # Log metrics
        for model_name, metrics in results.get('summary', {}).items():
            for metric, value in metrics.items():
                _log_numeric_metrics(f"{model_name}_{metric}", value)

        # Log artifacts
        _log_artifact_if_exists(f"{output_dir}/evaluation_results_summary.csv")
        _log_artifact_if_exists(f"{output_dir}/all_results.txt")
        _log_artifact_if_exists(f"{output_dir}/overall_performance_heatmap.png")

        # Per-model plots are named after the model, whereas the tuning plots
        # are named after the study key; try both sets of prefixes.
        study_names = list(results.get('hyperparameter_studies') or {})
        prefixes = list(dict.fromkeys([*config['model']['names'], *study_names]))
        plot_suffixes = (
            'hyperparameter_importance',
            'optimization_history',
            'parallel_coordinate',
            'sensitivity_analysis',
            'intent_confusion_matrix',
            'slot_f1_scores',
        )
        for prefix in prefixes:
            for suffix in plot_suffixes:
                _log_artifact_if_exists(f"{output_dir}/{prefix}_{suffix}.png")

In [63]:
def print_results_summary(results_summary, best_params):
    """
    Write a human-readable report of the results and best hyperparameters to stdout.

    Only the sections whose key is present in ``results_summary`` are printed.

    Args:
        results_summary (dict): Summarized results keyed by section name
            ('classification', 'translation', 'generation', 'zero_shot',
            'code_switch').
        best_params (dict): Best hyperparameters found during optimization;
            each key/value pair is printed on its own line.
    """
    print("\n===== EVALUATION RESULTS SUMMARY =====")

    if 'classification' in results_summary:
        print("\nClassification Results:")
        for dataset_name, dataset_metrics in results_summary['classification'].items():
            print(f"\n{dataset_name}:")
            for metric_name, score in dataset_metrics.items():
                print(f"  {metric_name}: {score:.4f}")

    if 'translation' in results_summary:
        print("\nTranslation Results:")
        translation = results_summary['translation']
        print(f"  FLORES-200 Average AfriCOMET Score (A to B): {translation['a_to_b']['average_score']:.4f}")
        print(f"  FLORES-200 Average AfriCOMET Score (B to A): {translation['b_to_a']['average_score']:.4f}")

    if 'generation' in results_summary:
        print("\nGeneration Results:")
        print(f"  FLORES-200 Average Perplexity: {results_summary['generation']['average_perplexity']:.4f}")

    # The two accuracy-only sections share one layout.
    accuracy_sections = (
        ('zero_shot', "\nZero-shot Results:"),
        ('code_switch', "\nCode-switch Results:"),
    )
    for section_key, section_title in accuracy_sections:
        if section_key in results_summary:
            print(section_title)
            print(f"  Accuracy: {results_summary[section_key]['accuracy']:.4f}")

    print("\nBest Hyperparameters:")
    for name, setting in best_params.items():
        print(f"  {name}: {setting}")

    print("\n======================================")

In [None]:
# Load configuration
config = load_config('../py/config.yaml')
# Set the device dynamically based on availability
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'  # Overrides any device value read from the file
# Values reused by later cells: the authentication token and the absolute cache path
auth_token = config.get("auth_token")
cache_dir = os.path.abspath(config['cache']['dir'])
# Logging, seeding and device selection are all driven by the loaded configuration
logger = setup_logging(config)
set_seed(config['seed'])
device = get_device(config['device'])
logger.info(f"Using device: {device}")

In [65]:
# Ensure the cache directory exists (no error if it is already there)
os.makedirs(cache_dir, exist_ok=True)

In [None]:
# Create the dataset loader from the configuration; the cells below use it
# to build, check and preprocess the datasets
data_loader = DatasetLoader(config)


In [None]:
# Build the stratified datasets; the result is iterated below as a mapping of split name -> dataset
stratified_datasets = data_loader.prepare_stratified_datasets()

In [None]:
# Report the number of items in every split
for split, dataset in stratified_datasets.items():
    print(f"{split} dataset size: {len(dataset)}")

In [None]:
# Verify data integrity
# When the loader reports a failed check, log the error and exit with status 1
# instead of continuing with the remaining cells.
if not data_loader.verify_data_integrity(stratified_datasets):
    logger.error("Data integrity check failed. Please review the datasets.")
    sys.exit(1)

In [None]:
# Print dataset information for the stratified (not yet preprocessed) datasets
data_loader.print_dataset_info(stratified_datasets)

In [None]:
# Inspect the structure of stratified_datasets (before preprocessing)
print("Structure of stratified_datasets:")
print(data_loader.inspect_dataset_structure(stratified_datasets))

In [72]:
# Run the loader's preprocessing on every split, keeping the split names as keys
preprocessed_datasets = dict(
    (split_name, data_loader.preprocess_dataset(split_data))
    for split_name, split_data in stratified_datasets.items()
)

In [None]:
# Inspect the structure of preprocessed_datasets (after preprocessing)
print("\nStructure of preprocessed_datasets:")
print(data_loader.inspect_dataset_structure(preprocessed_datasets))

In [None]:
# Make sure the models directory (../py/models, as an absolute path) is on
# the Python path; append it only when it is not already listed
models_dir = os.path.abspath(os.path.join('..', 'py', 'models'))
if models_dir not in sys.path:
    sys.path.append(models_dir)
    print(f"Added {models_dir} to Python path")

In [None]:
# Print contents of the models directory (raises if the directory does not exist)
print(f"Models directory contents: {os.listdir(models_dir)}")

In [76]:
# Empty registries, filled by the initialization loop below and keyed by model name
models, tokenizers = {}, {}
# Number of distinct values in the training 'label' column
num_labels = len(preprocessed_datasets['train']['label'].unique())

In [None]:
# Initialize every configured model and register it (and its tokenizer, when
# the wrapper provides one) under the model name
for model_name in config['model']['names']:
    print(f"Initializing model: {model_name}")
    print(f"Cache directory: {cache_dir}")
    # Only report whether a token is configured: slicing the token raised a
    # TypeError when it was None and echoed part of the secret into the output.
    print(f"Auth token: {'provided' if auth_token else 'not provided'}")
    try:
        if model_name == "meta-llama/Llama-2-7b-hf":
            llama_model = Llama2Decoder(model_name, auth_token=auth_token, cache_dir=cache_dir)
            models[model_name] = llama_model.get_model()   # The model is already on the appropriate device
            tokenizers[model_name] = llama_model.get_tokenizer()

        elif model_name == "afro-xlmr-large":
            afro_xlmr_large_model = AfroXLMRLarge(model_name, num_labels=num_labels, cache_dir=cache_dir)
            models[model_name] = afro_xlmr_large_model.get_model()   # The model is already on the appropriate device
            tokenizers[model_name] = afro_xlmr_large_model.get_tokenizer()

        elif model_name == "gemini":
            # Initialize GeminiModel with the model name
            gemini_model = GeminiModel(model_name=model_name)

            # Initialize GeminiTrainer with the GeminiModel and config
            gemini_trainer = GeminiTrainer(
                model=gemini_model,
                config=config
            )

            # Store GeminiModel in models dictionary (no tokenizer is registered for Gemini)
            models[model_name] = gemini_model

        else:
            # Fail with a clear message instead of the KeyError that the
            # lookup below would raise for a name no branch handles.
            raise ValueError(f"Unsupported model name: {model_name}")

        if models[model_name] is not None:
            print(f"Successfully initialized {model_name}")
        else:
            print(f"Failed to initialize {model_name}")
    except Exception as e:
        # Report which model failed, then re-raise so the notebook stops here
        print(f"Error initializing {model_name}: {str(e)}")
        raise

In [None]:
# List the names registered by the initialization loop
print("Available models:", list(models.keys()))
print("Available tokenizers:", list(tokenizers.keys()))

In [None]:
# Wrap every split in a CustomDataset, once per model family
datasets = {}

model_type_combined = 'encoder_decoder'
combined_tokenizer = tokenizers['afro-xlmr-large']
logging.info(f"Creating datasets for combined Afro-XLMR and LLaMA model with model type: {model_type_combined}")

model_type_gemini = 'gemini'
logging.info(f"Creating datasets for Gemini model with model type: {model_type_gemini}")

split_names = ('train', 'eval', 'benchmark')

# The combined model reads the preprocessed data through the Afro-XLMR tokenizer
combined_splits = {
    split_name: CustomDataset(preprocessed_datasets[split_name], combined_tokenizer, model_type=model_type_combined)
    for split_name in split_names
}

# Gemini reads the stratified data and receives gemini_model.count_tokens in the tokenizer position
gemini_splits = {
    split_name: CustomDataset(stratified_datasets[split_name], gemini_model.count_tokens, model_type=model_type_gemini)
    for split_name in split_names
}

# Keys match the trainer names used further down
datasets = {
    'combined_afro_xlmr_llama': combined_splits,
    'gemini': gemini_splits,
}


In [81]:
# Initialize classifiers
# Zero-shot and code-switch classifiers for the combined model: Afro-XLMR is
# passed as the encoder, LLaMA-2 as the decoder, together with the Afro-XLMR tokenizer
zero_shot_classifier = ZeroShotClassifier(
    encoder=models['afro-xlmr-large'],
    decoder=models['meta-llama/Llama-2-7b-hf'],
    tokenizer=combined_tokenizer
)

code_switch_classifier = CodeSwitchClassifier(
    encoder=models['afro-xlmr-large'],
    decoder=models['meta-llama/Llama-2-7b-hf'],
    tokenizer=combined_tokenizer
)

# Gemini counterparts, built directly from the Gemini model
# (gemini_model is the variable assigned in the model initialization loop)
zero_shot_classifier_gemini = ZeroShotClassifierForGemini(gemini_model)
code_switch_classifier_gemini = CodeSwitchClassifierForGemini(gemini_model)

In [None]:
# Initialize evaluators, keyed like the trainers; only the combined model gets one here
evaluators = {
    'combined_afro_xlmr_llama': AfriCOMETEvaluator(
        model=None,  # AfriCOMETEvaluator doesn't use the model directly
        tokenizer=tokenizers['afro-xlmr-large']
    )
}

In [None]:
# Run optimization for combined Afro-XLMR and LLaMA
# The returned study is consumed below through .best_params and .trials
combined_study = run_combined_optimization(
    models['afro-xlmr-large'],  # Encoder
    models['meta-llama/Llama-2-7b-hf'],  # Decoder
    tokenizers['afro-xlmr-large'], # encoder_tokenizer
    tokenizers['meta-llama/Llama-2-7b-hf'], # decoder_tokenizer
    datasets['combined_afro_xlmr_llama'],  # train / eval / benchmark datasets
    config,
    evaluators['combined_afro_xlmr_llama']  # AfriCOMET evaluator
)

In [None]:
# Run Gemini model optimization on the Gemini datasets
# (gemini_trainer is created in the model initialization loop)
gemini_study = gemini_trainer.run_tuning(training_data=datasets['gemini'])

In [None]:
# Consolidate the tuning studies and record their best hyperparameters
# Defaults so that later cells never hit a NameError when consolidation fails
studies = {}
best_params = {}
try:
    # Consolidate the results into the studies variable
    studies = {
        'combined_study': combined_study,
        'gemini_study': gemini_study
    }

    # Extract best parameters
    best_params = {model_name: study.best_params for model_name, study in studies.items()}
    print(best_params)

    # Log best parameters and store them in the config; create the
    # 'training' section when the configuration file does not define one
    training_config = config.setdefault('training', {})
    for model_name, params in best_params.items():
        logger.info(f"Best hyperparameters for {model_name}: {params}")
        training_config[model_name] = params

except Exception as e:
    logger.error(f"Error during or after hyperparameter optimization: {str(e)}")
    logger.exception("Exception details:")
    studies = {}  # Later cells treat an empty dictionary as "optimization failed"

In [None]:
# Hyperparameter analysis: the following cells plot and save diagnostics for every study
logger.info("Performing hyperparameter analysis...")

In [None]:
import plotly.graph_objects as go
import numpy as np

In [None]:
# Show which studies are available before plotting them
for model_name, study in studies.items():
    print(f"model name: {model_name}, study: {study}")

In [None]:
# For every study: save the importance, optimization-history, parallel-coordinate
# and sensitivity plots into the output directory, then print the sensitivities
for model_name, study in studies.items():
    # Trials without an objective value cannot be plotted
    valued_trials = [trial for trial in study.trials if trial.value is not None]
    if not valued_trials:
        logger.warning(f"No finished trials for {model_name}; skipping hyperparameter plots.")
        continue

    output_prefix = f"{config['model']['output_dir']}/{model_name}"

    # Plot hyperparameter importance
    importance_fig = plot_hyperparameter_importance(study)
    importance_fig.write_image(f"{output_prefix}_hyperparameter_importance.png")

    # Plot optimization history
    history_fig = optuna.visualization.plot_optimization_history(study)
    history_fig.update_layout(title="Optimization History")
    history_fig.write_image(f"{output_prefix}_optimization_history.png")

    # Plot parallel coordinate
    # Every axis and the colour array must describe the same trials in the
    # same order, so restrict the plot to trials that have a value and that
    # sampled every hyperparameter of the best trial.
    plotted_params = list(study.best_trials[0].params)
    plotted_trials = [
        trial for trial in valued_trials
        if all(param in trial.params for param in plotted_params)
    ]

    dimensions = []
    for param in plotted_params:
        values = [trial.params[param] for trial in plotted_trials]
        if not values:
            continue
        if isinstance(values[0], (int, float)):
            dimensions.append(
                dict(range = [min(values), max(values)],
                     label = param,
                     values = values)
            )
        elif isinstance(values[0], str):
            # Categorical parameters are mapped to integer positions; sorting
            # keeps the category order stable between runs
            unique_values = sorted(set(values))
            dimensions.append(
                dict(range = [0, len(unique_values) - 1],
                     tickvals = list(range(len(unique_values))),
                     ticktext = unique_values,
                     label = param,
                     values = [unique_values.index(v) for v in values])
            )

    parallel_fig = go.Figure(data=
        go.Parcoords(
            line = dict(color = [trial.value for trial in plotted_trials],
                        colorscale = 'Viridis',
                        showscale = True),
            dimensions = dimensions
        )
    )
    parallel_fig.update_layout(title="Parallel Coordinate Plot of Hyperparameters")
    parallel_fig.write_image(f"{output_prefix}_parallel_coordinate.png")

    # Analyze and plot sensitivity
    sensitivity = analyze_hyperparameter_sensitivity(study)
    sensitivity_fig = plot_sensitivity_analysis(sensitivity)
    sensitivity_fig.write_image(f"{output_prefix}_sensitivity_analysis.png")

    # Print sensitivity analysis results (iterated as (parameter, sensitivity) pairs)
    print(f"\nHyperparameter Sensitivity Analysis for {model_name}:")
    for param, sens in sensitivity:
        print(f"{param}: {sens:.4f}")

In [None]:
# Mark the end of the hyperparameter analysis in the log
logger.info("Hyperparameter analysis complete. Plots saved in output directory.")

In [None]:
# Initialize the Accelerator
# Mixed precision is disabled; the DDP handler is configured with
# find_unused_parameters=True
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
accelerator = Accelerator(mixed_precision='no', kwargs_handlers=[ddp_kwargs])

In [None]:
# Registry of trainers keyed by trainer name; filled by the next two cells
# using the best hyperparameters from the studies
trainers = {}

In [None]:
# Build the combined encoder/decoder trainer, provided its study is available
combined_best_study = studies.get('combined_study')

if combined_best_study is None:
    logger.warning("Study results for 'combined_study' not found or optimization failed. "
                   "CombinedEncoderDecoderTrainer will not be initialized.")
else:
    # Base config (shallow copy) overlaid with the tuned hyperparameters
    combined_config = config.copy()
    combined_config.update(combined_best_study.best_params)

    # Fixed settings applied on top of the tuned values
    combined_config['per_device_eval_batch_size'] = 1
    combined_config['max_grad_norm'] = 1

    # Fallback class counts, used only when the config does not provide them
    combined_config.setdefault('num_intent_classes', 50)  # Replace with the actual number of intent classes
    combined_config.setdefault('num_slot_classes', 100)  # Replace with the actual number of slot classes

    trainers['combined_afro_xlmr_llama'] = CombinedEncoderDecoderTrainer(
        encoder=models['afro-xlmr-large'],  # Afro-XLMR as encoder
        decoder=models['meta-llama/Llama-2-7b-hf'],  # LLaMA as decoder
        encoder_tokenizer=tokenizers['afro-xlmr-large'],
        decoder_tokenizer=tokenizers['meta-llama/Llama-2-7b-hf'],
        config=combined_config,
        accelerator=accelerator,
        batch_size=64  # or whatever batch size you prefer
    )

In [None]:
# Build the Gemini trainer when the Gemini model was initialized
if 'gemini' in models:
    gemini_config = config.copy()

    # Overlay the tuned hyperparameters. The study is read through its
    # best_params attribute, as the other cells do; calling .get() on a study
    # object raised AttributeError. A plain dict result is still accepted.
    gemini_best_study = studies.get('gemini_study')
    if isinstance(gemini_best_study, dict):
        gemini_best_params = gemini_best_study.get('best_params', {})
    else:
        gemini_best_params = getattr(gemini_best_study, 'best_params', None) or {}
    gemini_config.update(gemini_best_params)

    gemini_config['per_device_eval_batch_size'] = 1
    gemini_config['max_grad_norm'] = 1

    # Initialize GeminiTrainer if GeminiModel is available
    trainers['gemini'] = GeminiTrainer(
        model=models['gemini'],
        # The model initialization loop registers no tokenizer for Gemini, so
        # a plain lookup raised KeyError; pass None when none is registered.
        tokenizer=tokenizers.get('gemini'),
        config=gemini_config,
        accelerator=accelerator,
        batch_size=64  # or your preferred batch size
    )
else:
    logger.warning("Gemini model not found. GeminiTrainer will not be initialized.")

In [None]:
# Container for all evaluation results: one dict per task (filled per model
# by the evaluation cells), plus the tuning studies for reference
results = {
    'classification': {},
    'translation': {},
    'generation': {},
    'zero_shot': {},
    'code_switch': {},
    'intent_recognition': {},
    'slot_filling': {},
    'hyperparameter_studies': studies
}

In [None]:
# Train models with the best hyperparameters
for model_name, trainer in trainers.items():
    logger.info(f"Starting training for model: {model_name}")

    # Dataset keys within each model's dataset group
    train_dataset_key = 'train'
    eval_dataset_key = 'eval'

    # Each trainer uses its own dataset group (datasets is keyed by the same
    # names as trainers); previously every trainer, including Gemini, was
    # trained on the combined model's datasets.
    model_datasets = datasets.get(model_name, {})

    # Ensure the required datasets exist
    if train_dataset_key not in model_datasets or eval_dataset_key not in model_datasets:
        logger.error(f"Required datasets not found for {model_name}. Skipping this model.")
        continue

    train_dataset = model_datasets[train_dataset_key]
    eval_dataset = model_datasets[eval_dataset_key]

    logger.info(f"Train dataset size: {len(train_dataset)}, Eval dataset size: {len(eval_dataset)}")

    try:
        # Train the model
        train_results = trainer.train(train_dataset, eval_dataset)

        # Log training results
        logger.info(f"Training completed for {model_name}")
        logger.info(f"Training results: {train_results}")

        # Prepare the per-model output directory. Saving itself is disabled,
        # so the log no longer claims that the model was saved.
        save_dir = os.path.join(config['model']['output_dir'], model_name)
        os.makedirs(save_dir, exist_ok=True)
        # trainer.save_model(save_dir)
        logger.info(f"Model output directory ready at {save_dir} (model saving is currently disabled)")

        # Evaluate the model
        eval_results = trainer.evaluate(eval_dataset)
        logger.info(f"Evaluation results for {model_name}: {eval_results}")

        # You might want to save these results to a file or database

    except Exception as e:
        logger.error(f"Error during training or evaluation of {model_name}: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        continue  # Move to the next model if there's an error

logger.info("Training process completed for all models.")

In [None]:
# Show the first item of the combined model's benchmark dataset
print(datasets['combined_afro_xlmr_llama']['benchmark'][0])

In [None]:
# Load the FLORES-200 benchmark; only warn (do not stop) when the result is empty
flores_200 = data_loader.load_flores_200_benchmark()
if not flores_200:
    logging.warning("FLORES-200 benchmark data is empty or failed to load")

In [None]:
# Show the "devtest" entry of the FLORES-200 benchmark
print(flores_200["devtest"])

In [None]:
# Load both zero-shot and code-switch datasets, derived from the FLORES-200 data
experimental_datasets = data_loader.load_experimental_datasets(flores_200)

# Only warn (do not stop) when nothing was loaded
if not experimental_datasets:
    logging.warning("Experimental data is empty or failed to load")

In [None]:
# Access zero-shot and code-switch datasets (raises KeyError if either is missing)
zero_shot_df = experimental_datasets['zero_shot']
code_switch_df = experimental_datasets['code_switch']

In [None]:
# Show the zero-shot dataset
print(zero_shot_df)

In [None]:
# Show the code-switch dataset
print(code_switch_df)

In [None]:
# Show which trainers were initialized
print(trainers.items())

In [None]:
# Evaluation for all models
# Per trainer: the code-switch classifier to use and the study that holds its
# tuned hyperparameters. Previously the combined model's classifier and
# parameters were used for every trainer.
evaluation_setup = {
    'combined_afro_xlmr_llama': (code_switch_classifier, 'combined_study'),
    'gemini': (code_switch_classifier_gemini, 'gemini_study'),
}

for model_name, trainer in trainers.items():
    if model_name not in evaluation_setup:
        logger.warning(f"No evaluation setup for {model_name}. Skipping this model.")
        continue
    model_classifier, study_key = evaluation_setup[model_name]

    try:
        model_best_params = studies[study_key].best_params

        # Translation evaluation on the already loaded FLORES data (currently disabled):
        # results['translation'][model_name] = trainer.evaluate_translation(
        #     models['afro-xlmr-large'],
        #     models['meta-llama/Llama-2-7b-hf'],
        #     tokenizers['afro-xlmr-large'],
        #     tokenizers['meta-llama/Llama-2-7b-hf'],
        #     flores_200,
        #     evaluators['combined_afro_xlmr_llama'],
        #     model_best_params
        # )

        # Zero-shot evaluation (currently disabled):
        # results['zero_shot'][model_name] = trainer.evaluate_zero_shot(
        #     zero_shot_df,
        #     zero_shot_classifier,
        #     model_best_params
        # )

        # Code-switch evaluation: uses the code-switch dataset (the zero-shot
        # dataset was passed here by mistake)
        results['code_switch'][model_name] = trainer.evaluate_code_switch(
            code_switch_df,
            model_classifier,
            model_best_params
        )

        logger.info(f"Completed evaluation for {model_name}")
    except Exception as e:
        logger.error(f"Error during evaluation of {model_name}: {str(e)}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        continue  # Move to the next model if there's an error


In [None]:
# Show the raw results collected so far
print(results)

In [None]:
# Summarize and log results
# Order matters: summarize_results stores results['summary'], which
# log_results_to_mlflow reads; the files and plots written by the first two
# calls are the artifacts logged by the third.
results_summary = summarize_results(results, config)
plot_results(results, config)
log_results_to_mlflow(results, config, best_params)

In [None]:
# Report completion in the log and in the cell output
logger.info("Evaluation complete!")
print("Evaluation completed successfully. Results and visualizations have been saved and logged to MLflow.")

In [None]:
# Display the overall performance heatmap written by plot_results
display(Image(filename=f"{config['model']['output_dir']}/overall_performance_heatmap.png"))

In [None]:
# Display hyperparameter optimization results
# The plots are saved under the study keys (see the hyperparameter analysis
# loop), not under the configured model names, so iterate over the studies.
hyperparameter_plot_kinds = (
    'hyperparameter_importance',
    'optimization_history',
    'parallel_coordinate',
    'sensitivity_analysis',
)
for study_name in studies:
    print(f"\nHyperparameter Optimization Results for {study_name}:")
    for plot_kind in hyperparameter_plot_kinds:
        plot_path = f"{config['model']['output_dir']}/{study_name}_{plot_kind}.png"
        # A plot may be missing when its analysis step failed; report it instead of raising
        if os.path.exists(plot_path):
            display(Image(filename=plot_path))
        else:
            logger.warning(f"Plot not found: {plot_path}")

In [None]:
# Print the final report.
# NOTE(review): results_summary is the DataFrame returned by summarize_results,
# while print_results_summary documents a dict keyed by task name - confirm
# that the intended argument is passed here.
print_results_summary(results_summary, best_params)

In [None]:
# Final marker: reaching this cell means every cell above was executed
print("Evaluation notebook execution complete.")