In [None]:
# Notebook setup: environment flags must be set before torch / mlflow are imported.
import os
os.environ['GIT_PYTHON_REFRESH'] = 'quiet'  # Suppress Git warnings
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # Enable CUDA launch blocking for debugging
os.environ['TORCH_USE_CUDA_DSA'] = "1"  # Enable CUDA device-side assertions
os.environ['MLFLOW_FLATTEN_PARAMS'] = 'true'  # Flatten parameters for logging
os.environ['WANDB_DISABLED'] = 'true'

import sys
# Make the project's ../py source directory importable; the membership check
# keeps sys.path free of duplicates when this cell is re-run.
if '../py' not in sys.path:
    sys.path.append('../py')
import gc
import torch
# Only query device 0 when CUDA is actually usable; on a CPU-only machine the
# unguarded query raises and would abort the whole import cell.
if torch.cuda.is_available():
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print(f"Current GPU memory usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
else:
    print("CUDA is not available; skipping GPU memory report")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import mlflow
import optuna
import json
from accelerate import Accelerator
from IPython.display import Image, display

from dataset_loader import DatasetLoader
from utils import get_model, set_seed, load_config, get_device, CustomDataset
from models.llama2_decoder import Llama2Decoder  # Import Llama2Decoder model
from models.ernie_m import ErnieM  # Import ErnieM model
from evaluators.africomet_evaluator import AfriCOMETEvaluator
from trainers.encoder_decoder_trainer import EncoderDecoderTrainer
from trainers.combined_encoder_decoder_trainer import CombinedEncoderDecoderTrainer
from hyperparameter_analysis import (plot_hyperparameter_importance, plot_optimization_history,
                                     plot_parallel_coordinate, analyze_hyperparameter_sensitivity,
                                     plot_sensitivity_analysis)

In [2]:
def setup_logging(config):
    """
    Set up logging configuration based on the provided config.

    Calls ``logging.basicConfig`` with the level and output file taken from
    ``config['logging']`` and a fixed
    ``timestamp - logger name - level - message`` format.

    The level may be given as a level name in any letter case ("info",
    "Info", "INFO") or directly as an integer level.

    Note that ``logging.basicConfig`` does nothing when the root logger
    already has handlers, so re-running this in a live kernel does not
    replace an existing configuration.

    Args:
        config (dict): Configuration with a ``'logging'`` mapping that holds
            ``'log_level'`` and ``'log_file'``.

    Returns:
        logging.Logger: The logger named after this module.

    Raises:
        KeyError: If ``'logging'``, ``'log_level'`` or ``'log_file'`` is missing.
        AttributeError: If the level name is not an attribute of ``logging``.
    """
    logging_config = config['logging']
    requested_level = logging_config['log_level']

    if isinstance(requested_level, int):
        # Numeric levels (e.g. 20) are passed through untouched.
        level = requested_level
    else:
        # Normalise the case first: getattr(logging, "info") would return the
        # logging.info *function*, not the INFO level constant.
        level = getattr(logging, str(requested_level).upper())

    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        filename=logging_config['log_file']
    )
    return logging.getLogger(__name__)

In [3]:
def clear_memory():
    """
    Release memory that is no longer referenced.

    Runs Python's garbage collector and, when CUDA is available, asks PyTorch
    to empty its CUDA cache. A ``RuntimeError`` raised while emptying the
    cache is logged as a warning instead of being propagated.
    """
    gc.collect()

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        try:
            torch.cuda.empty_cache()
        except RuntimeError as cache_error:
            logging.warning(f"Failed to clear CUDA cache: {str(cache_error)}")
        else:
            logging.info("CUDA cache cleared successfully")

    logging.info("Memory cleared")

In [4]:
def load_datasets(data_loader):
    """
    Load every dataset through the given loader, tolerating failures.

    Delegates to ``data_loader.load_datasets()``. Any exception raised by the
    loader is logged as an error and converted into a ``None`` result.

    Args:
        data_loader (DatasetLoader): Object exposing a ``load_datasets()`` method.

    Returns:
        Whatever ``data_loader.load_datasets()`` returns, or ``None`` when the
        call raised an exception.
    """
    loaded = None
    try:
        logging.info("Loading and preparing datasets...")
        loaded = data_loader.load_datasets()
    except Exception as load_error:
        logging.error(f"Error loading datasets: {str(load_error)}")
        loaded = None
    return loaded

In [5]:
def objective_combined(trial, encoder, decoder, tokenizer, train_dataset, eval_dataset, config, evaluator):
    """
    Objective function for hyperparameter optimization of the combined Afro-XLMR and LLaMA model.

    Called by Optuna once per trial. It samples hyperparameters from the
    ranges hard-coded below, trains with ``CombinedEncoderDecoderTrainer``,
    evaluates with ``evaluator`` and returns ``1 - average_score``.

    The caller's ``config`` is never modified: the sampled values are written
    into a shallow copy that is handed to the trainer.

    Args:
        trial (optuna.trial.Trial): The Optuna trial object used for hyperparameter suggestions.
        encoder (torch.nn.Module): The Afro-XLMR encoder model to be trained and evaluated.
        decoder (torch.nn.Module): The LLaMA decoder model to be trained and evaluated.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used for processing text.
        train_dataset (Dataset): The dataset used for training the model.
        eval_dataset (Dataset): The dataset used for evaluating the model.
        config (dict): Base configuration; trial-specific values are layered on top of a copy.
        evaluator (Evaluator): Object whose ``evaluate(encoder, decoder, tokenizer, eval_dataset)``
            returns a mapping containing ``'average_score'``.

    Returns:
        float: ``1 - average_score`` (lower is better), or ``float('inf')`` when the
        evaluation result has no ``'average_score'`` or a handled error occurred.

    Raises:
        RuntimeError: Any ``RuntimeError`` other than the two messages handled below
            (CUDA device-side assert, offloaded-module move) is re-raised.
    """
    # Imported here because the notebook's import cell does not provide it;
    # without it the generic handler below failed with NameError.
    import traceback

    # Hyperparameter suggestions
    encoder_lr = trial.suggest_float('encoder_lr', 1e-6, 1e-4, log=True)
    decoder_lr = trial.suggest_float('decoder_lr', 1e-6, 1e-4, log=True)
    num_train_epochs = trial.suggest_int('num_train_epochs', 1, 3)
    per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [1, 2])
    weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1, log=True)
    warmup_steps = trial.suggest_int('warmup_steps', 50, 200)
    gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps', [1, 2, 4, 8])

    # Layer the trial-suggested hyperparameters on a copy so trials do not
    # leak settings into each other through the shared config.
    trial_config = config.copy()
    trial_config.update({
        "encoder_lr": encoder_lr,
        "decoder_lr": decoder_lr,
        "num_train_epochs": num_train_epochs,
        "per_device_train_batch_size": per_device_train_batch_size,
        # Evaluation deliberately reuses the training batch size.
        "per_device_eval_batch_size": per_device_train_batch_size,
        "weight_decay": weight_decay,
        "warmup_steps": warmup_steps,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "fp16": True,
        "evaluation_strategy": "steps",
        "eval_steps": 200,
        "save_steps": 200,
        "logging_steps": 50,
        "max_grad_norm": 1.0
    })

    logging.info(f"Trial {trial.number}: Starting with hyperparameters: {trial_config}")

    try:
        # Initialize the trainer
        trainer = CombinedEncoderDecoderTrainer(encoder, decoder, tokenizer, trial_config)

        # Log some information about the datasets
        logging.info(f"Train dataset size: {len(train_dataset)}")
        logging.info(f"Eval dataset size: {len(eval_dataset)}")

        # Validate datasets
        logging.info("Validating train dataset:")
        trainer.validate_dataset(train_dataset)
        logging.info("Validating eval dataset:")
        trainer.validate_dataset(eval_dataset)

        # Train the combined model (the return value is not needed here)
        trainer.train(train_dataset, eval_dataset)

        # Evaluate the model
        eval_results = evaluator.evaluate(encoder, decoder, tokenizer, eval_dataset)

        # Ensure evaluation results are valid
        if 'average_score' not in eval_results:
            logging.error("Evaluation results do not contain average_score.")
            return float('inf')  # Return a high metric to minimize

        # Calculate optimization metric (lower is better)
        return 1 - eval_results['average_score']

    except RuntimeError as e:
        if "CUDA error: device-side assert triggered" in str(e):
            logging.error(f"CUDA assert error: {str(e)}")
            logging.error(f"Input shapes - encoder: {encoder.config.hidden_size}, decoder: {decoder.config.hidden_size}")
            return float('inf')  # Return a high value to deprioritize this trial
        elif "You can't move a model that has some modules offloaded to cpu or disk" in str(e):
            logging.error(f"Model movement error: {str(e)}")
            return float('inf')  # Return a high value to deprioritize this trial
        else:
            raise
    except Exception as e:
        logging.error(f"Error during objective_combined: {str(e)}")
        logging.error(f"Error details: {type(e).__name__}, {str(e)}")
        logging.error(f"Traceback: {traceback.format_exc()}")
        return float('inf')  # Return a high metric to minimize

    finally:
        torch.cuda.empty_cache()  # Clear CUDA cache after each trial
        logging.info(f"Trial {trial.number}: CUDA cache cleared")

In [6]:
# def objective_combined(trial, encoder, decoder, tokenizer, train_dataset, eval_dataset, config, evaluator):
#     """
#     Objective function for hyperparameter optimization of the combined Afro-XLMR and LLaMA model.
    
#     Args:
#         trial (optuna.trial.Trial): Optuna trial object for hyperparameter suggestions.
#         encoder (torch.nn.Module): The Afro-XLMR encoder model.
#         decoder (torch.nn.Module): The LLaMA decoder model.
#         tokenizer: The tokenizer for the models.
#         train_dataset: Dataset for training.
#         eval_dataset: Dataset for evaluation.
#         config (dict): Configuration dictionary.
#         evaluator: The AfriCOMET evaluator object.

#     Returns:
#         float: The evaluation metric to be minimized.
#     """
#     torch.cuda.empty_cache()  # Clear CUDA cache before each trial
    
#     try:
#         # Hyperparameter suggestions
#         encoder_lr = trial.suggest_float('encoder_lr', 1e-6, 1e-4, log=True)
#         decoder_lr = trial.suggest_float('decoder_lr', 1e-6, 1e-4, log=True)
#         num_train_epochs = trial.suggest_int('num_train_epochs', 1, 2)
#         per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [1, 2])
#         weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1, log=True)
#         warmup_steps = trial.suggest_int('warmup_steps', 50, 200)
#         gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps', [1, 2, 4, 8])

#         # Update config with trial-suggested hyperparameters
#         config.update({
#             "encoder_lr": encoder_lr,
#             "decoder_lr": decoder_lr,
#             "num_train_epochs": num_train_epochs,
#             "per_device_train_batch_size": per_device_train_batch_size,
#             "per_device_eval_batch_size": per_device_train_batch_size,
#             "weight_decay": weight_decay,
#             "warmup_steps": warmup_steps,
#             "gradient_accumulation_steps": gradient_accumulation_steps,
#             "fp16": True,  # Enable mixed precision training
#             "evaluation_strategy": "steps",
#             "eval_steps": 200,
#             "save_steps": 200,
#             "logging_steps": 50,
#             "max_grad_norm": 1.0,
#         })

#         logging.info(f"Trial {trial.number}: Starting with hyperparameters: {config}")

#         try:
#             # Initialize trainer
#             logging.info(f"Trial {trial.number}: Initializing trainer with encoder hidden size: {encoder.config.hidden_size}, decoder hidden size: {decoder.config.hidden_size}")
#             trainer = CombinedEncoderDecoderTrainer(encoder, decoder, tokenizer, config)
#             logging.info(f"Trial {trial.number}: Trainer initialized successfully")

#             # Validate datasets
#             logging.info(f"Trial {trial.number}: Validating training dataset")
#             trainer.validate_dataset(train_dataset)
#             logging.info(f"Trial {trial.number}: Validating evaluation dataset")
#             trainer.validate_dataset(eval_dataset)

#             # Train the combined model
#             logging.info(f"Trial {trial.number}: Starting training")
#             train_result = trainer.train(train_dataset, eval_dataset)
#             logging.info(f"Trial {trial.number}: Training completed")
            
#             # Evaluate using AfriCOMET
#             logging.info(f"Trial {trial.number}: Starting evaluation")
#             eval_results = evaluator.evaluate(encoder, decoder, tokenizer, eval_dataset)
#             logging.info(f"Trial {trial.number}: Evaluation completed")
            
#             # Calculate optimization metric (lower is better)
#             optimization_metric = 1 - eval_results.get('average_score', 0)
#             logging.info(f"Trial {trial.number}: Optimization metric: {optimization_metric}")
            
#             return optimization_metric

#         except RuntimeError as e:
#             if "CUDA error: device-side assert triggered" in str(e):
#                 logging.error(f"Trial {trial.number}: CUDA assert error: {str(e)}")
#                 logging.error(f"Trial {trial.number}: Input shapes - encoder: {encoder.config.hidden_size}, decoder: {decoder.config.hidden_size}")
#                 return float('inf')  # Return a high value to deprioritize this trial
#             else:
#                 raise

#     except Exception as e:
#         logging.error(f"Trial {trial.number}: Error in objective_combined: {str(e)}")
#         raise optuna.exceptions.TrialPruned()

#     finally:
#         torch.cuda.empty_cache()  # Clear CUDA cache after each trial
#         logging.info(f"Trial {trial.number}: CUDA cache cleared")

In [7]:
def objective_ernie(trial, model, tokenizer, train_dataset, eval_dataset, config, evaluator):
    """
    Objective function for hyperparameter optimization of the ERNIE-M model.

    Runs inside a nested MLflow run. It samples hyperparameters (the learning
    rate range comes from ``config['hyperparameters']``, the other ranges are
    hard-coded below), trains with ``EncoderDecoderTrainer``, evaluates with
    ``evaluator`` and returns ``(1 - average_score) + eval_loss``.

    The caller's ``config`` is not modified: trial values are written into a
    shallow copy so that settings from one trial cannot leak into the next.

    Args:
        trial (optuna.trial.Trial): Optuna trial object for hyperparameter suggestions.
        model (ErnieM): The ERNIE-M model to be optimized.
        tokenizer: The tokenizer associated with the model.
        train_dataset: Dataset for training; batches are expected to expose
            ``'input_ids'`` and ``'labels'`` (they are logged for diagnostics).
        eval_dataset: Dataset for evaluation.
        config (dict): Configuration with ``config['hyperparameters']['learning_rate_min']``
            and ``['learning_rate_max']``; the two bounds may be given in either order.
        evaluator: Object whose ``evaluate(model, tokenizer, eval_dataset)`` returns a
            mapping of metrics (optionally nested one level deep).

    Returns:
        float: The metric to be minimized, or ``float('inf')`` if any exception occurs.
    """
    # Function-scope imports: neither module is provided by the notebook's import cell.
    import math
    import traceback

    def _is_loggable(metric_value):
        # MLflow metrics must be real numbers; skip NaN and non-numeric entries.
        return isinstance(metric_value, (int, float)) and not math.isnan(metric_value)

    torch.set_float32_matmul_precision('medium')

    with mlflow.start_run(nested=True):
        try:
            # Suggest hyperparameters (min/max guards against swapped bounds in the config)
            lr_min = min(float(config['hyperparameters']['learning_rate_min']), float(config['hyperparameters']['learning_rate_max']))
            lr_max = max(float(config['hyperparameters']['learning_rate_min']), float(config['hyperparameters']['learning_rate_max']))
            learning_rate = trial.suggest_float('learning_rate', lr_min, lr_max, log=True)
            num_train_epochs = trial.suggest_int('num_train_epochs', 1, 2)
            per_device_train_batch_size = trial.suggest_categorical('per_device_train_batch_size', [4, 8, 16])
            # Evaluation deliberately reuses the training batch size.
            per_device_eval_batch_size = per_device_train_batch_size
            weight_decay = trial.suggest_float('weight_decay', 0.01, 0.1, log=True)
            warmup_steps = trial.suggest_int('warmup_steps', 50, 200)
            gradient_accumulation_steps = trial.suggest_categorical('gradient_accumulation_steps', [1, 2, 4])

            suggested_params = {
                "learning_rate": learning_rate,
                "num_train_epochs": num_train_epochs,
                "per_device_train_batch_size": per_device_train_batch_size,
                "per_device_eval_batch_size": per_device_eval_batch_size,
                "weight_decay": weight_decay,
                "warmup_steps": warmup_steps,
                "gradient_accumulation_steps": gradient_accumulation_steps
            }

            # Log the hyperparameters to MLflow
            mlflow.log_params(suggested_params)

            # Build the per-trial config on a copy; the shared config stays untouched.
            trial_config = config.copy()
            trial_config.update(suggested_params)
            trial_config.update({
                "fp16": True,
                "evaluation_strategy": "steps",
                "eval_steps": 200,
                "save_steps": 200,
                "logging_steps": 50,
                "max_grad_norm": 1.0,
            })

            # Initialize the trainer
            trainer = EncoderDecoderTrainer(model=model, tokenizer=tokenizer, config=trial_config)

            # Log dataset sizes and sample batch
            logging.info(f"Train dataset size: {len(train_dataset)}")
            logging.info(f"Eval dataset size: {len(eval_dataset)}")
            sample_batch = next(iter(torch.utils.data.DataLoader(train_dataset, batch_size=per_device_train_batch_size)))
            logging.info(f"Sample batch keys: {sample_batch.keys()}")
            logging.info(f"Sample input_ids shape: {sample_batch['input_ids'].shape}")
            logging.info(f"Sample labels shape: {sample_batch['labels'].shape}")

            # Train the model
            train_loss, eval_loss = trainer.train(train_dataset, eval_dataset)

            # Log the train and eval losses
            mlflow.log_metric("final_train_loss", train_loss)
            mlflow.log_metric("final_eval_loss", eval_loss)

            logging.info(f"Final training loss: {train_loss:.4f}")
            logging.info(f"Final evaluation loss: {eval_loss:.4f}")

            # Begin AfriCOMET evaluation
            logging.info("Starting AfriCOMET evaluation...")
            eval_results = evaluator.evaluate(model, tokenizer, eval_dataset)

            # Log the evaluation results; nested dicts are flattened as "<key>_<subkey>"
            for key, value in eval_results.items():
                if isinstance(value, dict):
                    for subkey, subvalue in value.items():
                        if _is_loggable(subvalue):
                            mlflow.log_metric(f"{key}_{subkey}", subvalue)
                elif _is_loggable(value):
                    mlflow.log_metric(key, value)

            # Calculate the optimization metric
            africomet_score = eval_results.get('average_score', 0)
            optimization_metric = (1 - africomet_score) + eval_loss  # Lower is better
            logging.info(f"Optimization metric (lower is better): {optimization_metric}")

            return optimization_metric

        except ValueError as ve:
            logging.error(f"ValueError in objective_ernie: {str(ve)}")
            logging.error("This might be due to a mismatch in input or label shapes.")
            mlflow.log_metric("failed_due_to_error", 1)
            return float('inf')

        except RuntimeError as re:
            logging.error(f"RuntimeError in objective_ernie: {str(re)}")
            logging.error("This might be due to CUDA out of memory. Try reducing batch size or model size.")
            mlflow.log_metric("failed_due_to_error", 1)
            return float('inf')

        except Exception as e:
            logging.error(f"An unexpected error occurred in objective_ernie: {str(e)}")
            logging.error(f"Error type: {type(e).__name__}")
            logging.error(f"Error args: {e.args}")
            traceback.print_exc()
            mlflow.log_metric("failed_due_to_error", 1)
            return float('inf')

        finally:
            torch.cuda.empty_cache()

In [8]:
def run_combined_optimization(encoder, decoder, tokenizer, datasets, config, evaluator):
    """
    Run hyperparameter optimization for the combined Afro-XLMR and LLaMA model.

    Ends any active MLflow run, opens a run named ``"optimization_combined"``,
    and minimises ``objective_combined`` with an Optuna study
    (``config['hyperparameters']['n_trials']`` trials, one-hour timeout,
    sequential execution). When at least one trial completed, the best
    parameters and score are logged to MLflow; otherwise a warning is logged
    and the study is still returned.

    GPU memory figures are only logged when CUDA is available.

    Args:
        encoder: The Afro-XLMR encoder model.
        decoder: The LLaMA decoder model.
        tokenizer: The tokenizer for the models.
        datasets: Dictionary containing 'train' and 'eval' datasets.
        config: Configuration dictionary.
        evaluator: AfriCOMET evaluator object.

    Returns:
        optuna.Study or None: The study object, or ``None`` if the optimization
        itself raised an exception.
    """
    mlflow.end_run()  # End any existing runs

    logging.info("Starting hyperparameter optimization for combined Afro-XLMR and LLaMA")

    # Log GPU memory information (querying device 0 raises on CPU-only machines)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        total_memory = torch.cuda.get_device_properties(0).total_memory
        available_memory = total_memory - torch.cuda.memory_allocated()
        logging.info(f"Total GPU memory: {total_memory / 1e9:.2f} GB")
        logging.info(f"Available GPU memory before optimization: {available_memory / 1e9:.2f} GB")
    else:
        logging.warning("CUDA is not available; skipping GPU memory report")

    try:
        with mlflow.start_run(run_name="optimization_combined"):
            study = optuna.create_study(direction='minimize')
            study.optimize(
                lambda trial: objective_combined(
                    trial, encoder, decoder, tokenizer,
                    datasets['train'], datasets['eval'],
                    config, evaluator
                ),
                n_trials=config['hyperparameters']['n_trials'],
                timeout=3600,
                catch=(Exception,),
                n_jobs=1
            )

            # study.best_trial raises ValueError (not a falsy value) when no
            # trial has completed, so probe it explicitly.
            try:
                best_trial = study.best_trial
            except ValueError:
                best_trial = None

            if best_trial is not None:
                for param, value in study.best_params.items():
                    mlflow.log_param(f"best_{param}", value)
                mlflow.log_metric("best_score", study.best_value)
            else:
                logging.warning("No completed trials found.")

    except optuna.exceptions.OptunaError as e:
        logging.error(f"Optuna error during hyperparameter optimization: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Error during hyperparameter optimization: {str(e)}")
        logging.exception("Exception details:")
        return None

    finally:
        if cuda_available:
            torch.cuda.empty_cache()
            logging.info(f"GPU memory after optimization: {torch.cuda.memory_allocated() / 1e9:.2f} GB")

    return study

In [9]:
# def run_combined_optimization(encoder, decoder, tokenizer, datasets, config, evaluator):
#     """
#     Run hyperparameter optimization for the combined Afro-XLMR and LLaMA model.
    
#     Args:
#         encoder: The Afro-XLMR encoder model.
#         decoder: The LLaMA decoder model.
#         tokenizer: The tokenizer for the models.
#         datasets: Dictionary containing 'train' and 'eval' datasets.
#         config: Configuration dictionary.
#         evaluator: AfriCOMET evaluator object.
    
#     Returns:
#         optuna.Study: The completed Optuna study object.
#     """
#     mlflow.end_run()  # End any existing runs
    
#     logging.info("Starting hyperparameter optimization for combined Afro-XLMR and LLaMA")

#     # Log GPU memory information
#     total_memory = torch.cuda.get_device_properties(0).total_memory
#     available_memory = total_memory - torch.cuda.memory_allocated()
#     logging.info(f"Total GPU memory: {total_memory/1e9:.2f} GB")
#     logging.info(f"Available GPU memory before optimization: {available_memory/1e9:.2f} GB")

#     try:
#         with mlflow.start_run(run_name="optimization_combined"):
#             study = optuna.create_study(direction='minimize')
#             study.optimize(
#                 lambda trial: objective_combined(
#                     trial, encoder, decoder, tokenizer,
#                     datasets['train'], 
#                     datasets['eval'],  
#                     config, evaluator
#                 ),
#                 n_trials=config['hyperparameters']['n_trials'],
#                 timeout=3600,
#                 catch=(Exception,),
#                 n_jobs=1  # Run trials sequentially to avoid GPU conflicts
#             )

#             best_params = study.best_params
#             for param, value in best_params.items():
#                 mlflow.log_param(f"best_{param}", value)
#             mlflow.log_metric("best_score", study.best_value)

#         logging.info("Completed hyperparameter optimization for combined model")
#         logging.info(f"Best trial: {study.best_trial}")
#         print_best_trial_info(study.best_trial)

#     except optuna.exceptions.TrialPruned:
#         logging.info("Trial was pruned due to a CUDA error.")
#         return None
#     except Exception as e:
#         logging.error(f"Error during hyperparameter optimization: {str(e)}")
#         logging.exception("Exception details:")
#         return None

#     finally:
#         try:
#             torch.cuda.empty_cache()
#             logging.info(f"GPU memory after optimization: {torch.cuda.memory_allocated()/1e9:.2f} GB")
#         except RuntimeError as e:
#             logging.warning(f"Failed to empty CUDA cache: {str(e)}")

#     return study

In [10]:
def run_ernie_optimization(model, tokenizer, datasets, config, evaluator):
    """
    Run hyperparameter optimization for ERNIE-M model.

    Ends any active MLflow run, opens a run named ``"optimization_ernie_m"``
    and minimises ``objective_ernie`` with an Optuna study
    (``config['hyperparameters']['n_trials']`` trials, one-hour timeout,
    sequential execution, progress bar on). When at least one trial
    completed, the best parameters and score are logged to MLflow, written to
    disk via ``save_results`` and printed; otherwise a warning is logged and
    the study is still returned.

    GPU memory figures are only logged, and the model is only moved to the
    CPU afterwards, when CUDA is available.

    Args:
        model: The ERNIE-M model.
        tokenizer: The tokenizer for ERNIE-M.
        datasets: Dictionary containing 'train' and 'eval' datasets.
        config: Configuration dictionary.
        evaluator: Evaluator object for ERNIE-M.

    Returns:
        optuna.Study or None: The study object, or ``None`` if the optimization
        itself raised an exception.
    """
    # End any existing MLflow runs
    mlflow.end_run()

    logging.info("Starting hyperparameter optimization for ERNIE-M")

    # Log GPU memory information (querying device 0 raises on CPU-only machines)
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        total_memory = torch.cuda.get_device_properties(0).total_memory
        available_memory = total_memory - torch.cuda.memory_allocated()
        logging.info(f"Total GPU memory: {total_memory/1e9:.2f} GB")
        logging.info(f"Available GPU memory before optimization: {available_memory/1e9:.2f} GB")
    else:
        logging.warning("CUDA is not available; skipping GPU memory report")

    try:
        # Start a new MLflow run for this optimization
        with mlflow.start_run(run_name="optimization_ernie_m"):
            # Create an Optuna study object
            study = optuna.create_study(direction='minimize')

            # Log dataset sizes and model information
            logging.info(f"Train dataset size: {len(datasets['train'])}")
            logging.info(f"Eval dataset size: {len(datasets['eval'])}")
            logging.info(f"Model type: {type(model).__name__}")
            logging.info(f"Tokenizer type: {type(tokenizer).__name__}")

            # Run the optimization
            study.optimize(
                lambda trial: objective_ernie(
                    trial,
                    model,
                    tokenizer,
                    datasets['train'],
                    datasets['eval'],
                    config,
                    evaluator
                ),
                n_trials=config['hyperparameters']['n_trials'],
                timeout=3600,
                catch=(Exception,),
                n_jobs=1,  # Run trials sequentially to avoid GPU conflicts
                show_progress_bar=True
            )

            # study.best_trial raises ValueError when no trial has completed;
            # treat that as "nothing to report" rather than a failed study.
            try:
                best_trial = study.best_trial
            except ValueError:
                best_trial = None

            if best_trial is not None:
                # Log the best parameters and score
                for param, value in study.best_params.items():
                    mlflow.log_param(f"best_{param}", value)
                mlflow.log_metric("best_score", study.best_value)

                # Save the optimization results
                save_results("ernie_m", study, config)
            else:
                logging.warning("No completed trials found.")

        logging.info("Completed hyperparameter optimization for ERNIE-M")
        if best_trial is not None:
            logging.info(f"Best trial: {best_trial}")
            print_best_trial_info(best_trial)

    except optuna.exceptions.TrialPruned:
        logging.info("Trial was pruned.")
        return None
    except Exception as e:
        logging.error(f"Error during hyperparameter optimization for ERNIE-M: {str(e)}")
        logging.exception("Exception details:")
        return None

    finally:
        # Clean up resources
        clear_memory()
        if cuda_available:
            model.cpu()
            torch.cuda.empty_cache()
            logging.info(f"GPU memory after optimization: {torch.cuda.memory_allocated()/1e9:.2f} GB")
        logging.info("Memory cleared after ERNIE-M optimization")

    return study

In [11]:
def save_results(model_name, study, config):
    """
    Write a study's best result to a JSON file.

    The file ``<model_name>_optimization_results.json`` is created inside
    ``<config['model']['output_dir']>/optimization_results`` (the directory is
    created when missing) and holds the best parameters, the best value and
    the number of the best trial.

    Args:
        model_name: Prefix used for the output file name.
        study: Object exposing ``best_params``, ``best_value`` and ``best_trial.number``.
        config: Configuration providing ``config['model']['output_dir']``.
    """
    results_dir = os.path.join(config['model']['output_dir'], 'optimization_results')
    os.makedirs(results_dir, exist_ok=True)

    summary = dict(
        best_params=study.best_params,
        best_value=study.best_value,
        best_trial=study.best_trial.number,
    )

    target_path = os.path.join(results_dir, f"{model_name}_optimization_results.json")
    with open(target_path, 'w') as handle:
        json.dump(summary, handle, indent=2)

In [12]:
def print_best_trial_info(trial):
    """
    Print a short report about a trial to standard output.

    The report lists the trial's value, each entry of ``trial.params``, the
    trial number and its start / completion timestamps.

    Args:
        trial: Object exposing ``value``, ``params`` (a mapping), ``number``,
            ``datetime_start`` and ``datetime_complete``.
    """
    report = [
        "\nBest Trial Information:",
        f"  Value: {trial.value}",
        "  Params:",
    ]
    report.extend(f"    {name}: {setting}" for name, setting in trial.params.items())
    report.append(f"  Trial number: {trial.number}")
    report.append(f"  DateTime start: {trial.datetime_start}")
    report.append(f"  DateTime complete: {trial.datetime_complete}")
    print("\n".join(report))

In [13]:
def evaluate_classification(model, dataset, evaluator):
    """
    Run the classification evaluator on a dataset and save its confusion matrix.

    Memory is cleared before and after the evaluation. The classification
    report from the evaluator is logged, and the confusion-matrix plot it
    returns is saved as ``<model>/<dataset.name>_confusion_matrix.png``.

    Args:
        model: Interpolated into the output path with an f-string.
            NOTE(review): the path only makes sense if this is a directory
            name rather than a model object -- confirm against callers.
        dataset (Dataset): Object with a ``name`` attribute; passed to the evaluator.
        evaluator (ClassificationEvaluator): Object whose ``evaluate(dataset)`` returns
            a mapping with ``'classification_report'`` and ``'confusion_matrix_plot'``.

    Returns:
        dict: The mapping returned by ``evaluator.evaluate``.
    """
    clear_memory()
    logging.info(f"Evaluating classification performance for {dataset.name}")

    evaluation = evaluator.evaluate(dataset)
    report_text = evaluation['classification_report']
    logging.info(f"Classification Report:\n{report_text}")

    # NOTE(review): this opens a fresh pyplot figure that is closed again
    # below; the figure actually saved is the evaluator's own plot object.
    plt.figure(figsize=(10, 8))
    image_path = f"{model}/{dataset.name}_confusion_matrix.png"
    evaluation['confusion_matrix_plot'].savefig(image_path)
    plt.close()

    clear_memory()
    return evaluation

In [14]:
def evaluate_translation(model, tokenizer, eval_dataset, evaluator,
                         target_languages=None, source_language='eng'):
    """
    Evaluate translation performance using the existing dataset structure.

    For every target language, all source-language texts are tokenized
    (truncated to 128 tokens), translated with ``model.generate`` and scored
    by ``evaluator.evaluate(source_texts, translations, reference_texts)``.
    A failure for one language is logged and recorded as ``{'error': ...}``
    without stopping the remaining languages.

    NOTE(review): source and reference texts are selected independently by
    language code; this assumes the rows of both languages are aligned and
    equally many -- confirm against the dataset loader.

    Args:
        model (torch.nn.Module): The translation model; must expose ``device`` and ``generate``.
        tokenizer: The tokenizer for the model.
        eval_dataset (Dataset): A ``pandas.DataFrame`` with ``'language'`` and ``'text'``
            columns, an object whose ``data`` attribute is such a frame, or anything
            ``pandas.DataFrame`` can be built from. A ``'split'`` column is optional and
            only used for logging.
        evaluator (AfriCOMETEvaluator): The evaluator to use.
        target_languages (list, optional): Language codes to translate into. Defaults to
            ``['swh', 'kin', 'lug']`` (Swahili, Kinyarwanda, Luganda).
        source_language (str, optional): Language code of the source texts. Defaults to ``'eng'``.

    Returns:
        dict: ``{'translations': {'<src>_to_<lang>': scores_or_error, ...},
        'average_score': mean of the per-language 'average_score' values, or NaN
        when no language produced one}``.
    """
    results = {'translations': {}}
    if target_languages is None:
        target_languages = ['swh', 'kin', 'lug']  # ISO codes for Swahili, Kinyarwanda, and Luganda
    english_code = source_language

    # Convert dataset to DataFrame if it's not already
    if not isinstance(eval_dataset, pd.DataFrame):
        if hasattr(eval_dataset, 'data'):
            eval_dataset = eval_dataset.data
        else:
            eval_dataset = pd.DataFrame(eval_dataset)

    # Log dataset information
    logging.info(f"Evaluation dataset shape: {eval_dataset.shape}")
    logging.info(f"Languages in dataset: {eval_dataset['language'].unique()}")
    # 'split' is informational only, so a missing column must not abort the evaluation.
    if 'split' in eval_dataset.columns:
        logging.info(f"Splits in dataset: {eval_dataset['split'].unique()}")

    # The source texts are identical for every target language: select them once.
    eng_texts = eval_dataset[eval_dataset['language'] == english_code]['text'].tolist()

    for lang in target_languages:
        try:
            lang_texts = eval_dataset[eval_dataset['language'] == lang]['text'].tolist()

            if not eng_texts or not lang_texts:
                logging.warning(f"No data found for {english_code} to {lang} translation. Skipping.")
                continue

            logging.info(f"Translating {len(eng_texts)} sentences from {english_code} to {lang}")

            inputs = tokenizer(eng_texts, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model.device)
            with torch.no_grad():
                generated = model.generate(**inputs, max_length=128)
            translations = tokenizer.batch_decode(generated, skip_special_tokens=True)

            scores = evaluator.evaluate(eng_texts, translations, lang_texts)
            results['translations'][f'{english_code}_to_{lang}'] = scores

            logging.info(f"Translation scores for {english_code} to {lang}: {scores}")

        except Exception as e:
            logging.error(f"Error during translation evaluation for language {lang}: {str(e)}")
            results['translations'][f'{english_code}_to_{lang}'] = {'error': str(e)}

    # Calculate average score over the languages that produced one
    valid_scores = [score['average_score'] for score in results['translations'].values()
                    if isinstance(score, dict) and 'average_score' in score]

    if valid_scores:
        results['average_score'] = np.mean(valid_scores)
        logging.info(f"Overall average translation score: {results['average_score']}")
    else:
        results['average_score'] = np.nan
        logging.warning("No valid scores for translations")

    return results

In [15]:
def evaluate_generation(model, prompt_texts, evaluator):
    """
    Generate text for the given prompts and score the generations.

    Memory is cleared before the run and again once the scores have been
    logged.

    Args:
        model: Object whose ``generate`` method is called with ``prompt_texts``.
        prompt_texts (list): Prompts handed to the model for generation.
        evaluator: Object whose ``evaluate(prompts, generations)`` result is
            returned; that result must expose an ``'average_perplexity'`` entry.

    Returns:
        dict: The evaluation results produced by ``evaluator.evaluate``.
    """
    clear_memory()
    logging.info("Evaluating text generation performance")

    # Generate first, then hand prompts and generations to the evaluator together.
    scores = evaluator.evaluate(prompt_texts, model.generate(prompt_texts))

    mean_perplexity = scores['average_perplexity']
    logging.info(f"Average perplexity: {mean_perplexity}")

    clear_memory()
    return scores

In [16]:
def evaluate_zero_shot(model, tokenizer, dataset, evaluator):
    """
    Evaluate zero-shot performance.

    The source texts are tokenized, passed through ``model.generate`` and the
    decoded generations are scored against the target texts.

    Args:
        model (torch.nn.Module): The model to evaluate; must expose ``device``
            and ``generate``.
        tokenizer: The tokenizer for the model.
        dataset (Dataset): Data with ``'source_text'`` and ``'target_text'``
            columns, each supporting ``.tolist()``.
        evaluator: Object whose ``evaluate(sources, generations, targets)``
            result is returned.

    Returns:
        dict: A dictionary containing evaluation results.
    """
    clear_memory()
    logging.info("Evaluating zero-shot performance")

    # Read both columns up front so a missing column fails before the
    # expensive generation step rather than after it.
    source_texts = dataset['source_text'].tolist()
    target_texts = dataset['target_text'].tolist()

    # The tokenizer returns CPU tensors; they must be moved to the model's
    # device or generation fails with a device mismatch on GPU.
    inputs = tokenizer(source_texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs)

    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    results = evaluator.evaluate(source_texts, generated_texts, target_texts)

    logging.info(f"Zero-shot performance: {results}")
    clear_memory()
    return results

In [17]:
def evaluate_code_switch(model, tokenizer, dataset, evaluator):
    """
    Evaluate code-switch performance.

    The code-switched texts are tokenized and run through a forward pass; the
    arg-max over the last logits dimension is used as the prediction and
    compared with the labels by the evaluator.

    Args:
        model (torch.nn.Module): The model to evaluate; must expose ``device``
            and return an output with a ``logits`` attribute.
        tokenizer: The tokenizer for the model.
        dataset (Dataset): Data with ``'code_switched_text'`` and ``'label'``
            columns, each supporting ``.tolist()``.
        evaluator: Object whose ``evaluate(predictions, labels)`` result is
            returned.

    Returns:
        dict: A dictionary containing evaluation results.
    """
    clear_memory()
    logging.info("Evaluating code-switch performance")

    texts = dataset['code_switched_text'].tolist()
    labels = dataset['label'].tolist()

    # The tokenizer returns CPU tensors; they must be moved to the model's
    # device or the forward pass fails with a device mismatch on GPU.
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits

    predictions = torch.argmax(logits, dim=-1).tolist()

    results = evaluator.evaluate(predictions, labels)

    logging.info(f"Code-switch performance: {results}")
    clear_memory()
    return results

In [18]:
def summarize_results(results, config):
    """
    Summarize evaluation results.

    Builds one headline metric per task for every model listed in
    ``config['model']['names']`` and saves the summary (CSV) plus the raw
    results (text) to ``config['model']['output_dir']``.

    A metric is ``None`` when the task was not run, the model has no entry for
    that task, or the expected key is absent, so a partial evaluation no longer
    raises ``KeyError``.

    Args:
        results (dict): A dictionary containing results from all evaluation tasks,
            shaped as ``results[task][model_name][...]``.
        config (dict): A dictionary containing configuration information.

    Returns:
        pd.DataFrame: A summary of evaluation results (rows: tasks, columns: models).
    """
    logging.info("Summarizing results...")

    def _metric(task, model_name, *path):
        """Return results[task][model_name][path...] or None if any level is missing."""
        node = results.get(task) or {}
        for key in (model_name, *path):
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        return node

    summary = {
        model_name: {
            'classification': _metric('classification', model_name, 'classification_report', 'accuracy'),
            'translation': _metric('translation', model_name, 'average_score'),
            'generation': _metric('generation', model_name, 'average_perplexity'),
            'zero_shot': _metric('zero_shot', model_name, 'accuracy'),
            'code_switch': _metric('code_switch', model_name, 'accuracy'),
        }
        for model_name in config['model']['names']
    }

    summary_df = pd.DataFrame(summary)
    logging.info("Evaluation Results Summary:")
    logging.info(summary_df)

    # Save results; the output directory may not exist yet on a fresh run.
    output_dir = config['model']['output_dir']
    os.makedirs(output_dir, exist_ok=True)
    summary_df.to_csv(f"{output_dir}/evaluation_results_summary.csv")

    with open(f"{output_dir}/all_results.txt", 'w', encoding='utf-8') as f:
        f.write(str(results))

    return summary_df

In [19]:
def plot_results(results, config):
    """
    Plot evaluation results.

    For every model in ``config['model']['names']`` this saves, when the data
    is present, a confusion-matrix heatmap, a bar chart of per-language-pair
    translation scores and a perplexity histogram. Finally an overall
    task-by-model heatmap is written to ``overall_performance_heatmap.png``.

    Missing or non-numeric metrics appear as 0 in the overall heatmap; the
    generation column shows the inverse of the average perplexity (0 when the
    perplexity is missing or not positive).

    Args:
        results (dict): A dictionary containing results from all evaluation tasks.
        config (dict): A dictionary containing configuration information.
    """
    output_dir = config['model']['output_dir']

    def _number(value, default=0.0):
        """Return value as a float, or default when it is missing or non-numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    def _save_and_close(path):
        """Write the current figure to path (creating parent folders) and close it."""
        # Model names such as "org/model" put a path separator into the file
        # name, so the parent folder has to exist before saving.
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        plt.tight_layout()
        plt.savefig(path)
        plt.close()

    def _task_entry(task, model):
        """Return the result dict of a model for a task, or {} when unavailable."""
        entry = results[task].get(model, {})
        return entry if isinstance(entry, dict) else {}

    for model_name in config['model']['names']:
        # Each plot opens its own figure so nothing leaks when a task is
        # missing and every chart gets the same size.
        if model_name in results['classification']:
            plt.figure(figsize=(12, 8))
            sns.heatmap(results['classification'][model_name]['confusion_matrix'], annot=True, fmt='d', cmap='Blues')
            plt.title(f"Confusion Matrix - {model_name}")
            _save_and_close(f"{output_dir}/{model_name}_confusion_matrix.png")

        if model_name in results['translation']:
            translation = results['translation'][model_name]
            # evaluate_translation nests the per-pair scores under
            # 'translations'; each entry is a score dict or an {'error': ...}
            # dict, so only numeric average scores can be drawn as bars.
            per_pair = translation.get('translations', translation)
            pair_scores = {}
            for pair, score in per_pair.items():
                raw = score.get('average_score') if isinstance(score, dict) else score
                value = _number(raw, default=None)
                if value is not None:
                    pair_scores[pair] = value

            if pair_scores:
                plt.figure(figsize=(12, 8))
                plt.bar(list(pair_scores.keys()), list(pair_scores.values()))
                plt.title(f"Translation Scores - {model_name}")
                _save_and_close(f"{output_dir}/{model_name}_translation_scores.png")
            else:
                logging.warning(f"No numeric translation scores to plot for {model_name}")

        if model_name in results['generation']:
            plt.figure(figsize=(12, 8))
            plt.hist(results['generation'][model_name]['perplexities'], bins=20)
            plt.title(f"Perplexity Distribution - {model_name}")
            plt.xlabel("Perplexity")
            plt.ylabel("Frequency")
            _save_and_close(f"{output_dir}/{model_name}_perplexity_distribution.png")

    # Plot overall performance comparison
    performance_data = {}
    for model in config['model']['names']:
        classification = _task_entry('classification', model)
        # summarize_results reads accuracy from the classification report, so
        # look there first and fall back to a top-level 'accuracy' entry.
        report = classification.get('classification_report')
        accuracy = report.get('accuracy') if isinstance(report, dict) else None
        if accuracy is None:
            accuracy = classification.get('accuracy')

        perplexity = _number(_task_entry('generation', model).get('average_perplexity'))

        performance_data[model] = {
            'Classification': _number(accuracy),
            'Translation': _number(_task_entry('translation', model).get('average_score')),
            # Inverse of perplexity; a missing value must not look like a perfect score.
            'Generation': 1.0 / perplexity if perplexity > 0 else 0.0,
            'Zero-shot': _number(_task_entry('zero_shot', model).get('accuracy')),
            'Code-switch': _number(_task_entry('code_switch', model).get('accuracy')),
        }

    plt.figure(figsize=(12, 6))
    df = pd.DataFrame(performance_data).T
    sns.heatmap(df, annot=True, cmap='YlGnBu')
    plt.title("Model Performance Across Tasks")
    _save_and_close(f"{output_dir}/overall_performance_heatmap.png")

In [20]:
def log_results_to_mlflow(results, config, best_params):
    """
    Log results to MLflow.

    Inside a single MLflow run this logs the best hyperparameters, the model
    names, the per-model summary metrics and the result files found in
    ``config['model']['output_dir']``.

    Args:
        results (dict): A dictionary containing results from all evaluation tasks.
            Metrics are read from ``results['summary']``, expected as
            ``{model_name: {metric: value}}`` (a DataFrame with models as
            columns is converted with ``to_dict()``). When the key is absent
            a warning is logged and no metrics are recorded.
        config (dict): A dictionary containing configuration information.
        best_params (dict): A dictionary of the best hyperparameters found during
            optimization, keyed by model/study name.
    """
    output_dir = config['model']['output_dir']

    def _log_artifact_if_present(path):
        """Upload path to MLflow, or warn when the file was never produced."""
        if os.path.isfile(path):
            mlflow.log_artifact(path)
        else:
            logging.warning(f"Artifact not found, skipping MLflow upload: {path}")

    with mlflow.start_run():
        # Log hyperparameters for each model
        for model_name, params in best_params.items():
            for param, value in params.items():
                mlflow.log_param(f"{model_name}_{param}", value)

        # Log model information
        for model_name in config['model']['names']:
            mlflow.log_param(f"{model_name}_model", model_name)

        # Log metrics
        summary = results.get('summary')
        if summary is None:
            logging.warning("results has no 'summary' entry; no metrics logged to MLflow")
            summary = {}
        elif hasattr(summary, 'to_dict'):
            summary = summary.to_dict()

        for model_name, metrics in summary.items():
            for metric, value in metrics.items():
                if value is not None:
                    mlflow.log_metric(f"{model_name}_{metric}", value)

        # Log artifacts
        _log_artifact_if_present(f"{output_dir}/evaluation_results_summary.csv")
        _log_artifact_if_present(f"{output_dir}/all_results.txt")
        _log_artifact_if_present(f"{output_dir}/overall_performance_heatmap.png")

        # Log hyperparameter optimization plots. The plots are saved under the
        # study keys, which are also the keys of best_params and may differ
        # from the configured model names, so both sets are tried.
        plot_owners = list(dict.fromkeys([*config['model']['names'], *best_params]))
        for model_name in plot_owners:
            for suffix in ("hyperparameter_importance", "optimization_history",
                           "parallel_coordinate", "sensitivity_analysis"):
                _log_artifact_if_present(f"{output_dir}/{model_name}_{suffix}.png")

In [21]:
def print_results_summary(results_summary, best_params):
    """
    Print a summary of the evaluation results and best hyperparameters.

    Numeric values are printed with four decimals; ``None`` is printed as
    ``N/A`` and any other non-numeric value with ``str()`` instead of raising.

    Args:
        results_summary (dict | pd.DataFrame): Either a dictionary of
            summarized results keyed by task, or the DataFrame returned by
            ``summarize_results`` (rows: tasks, columns: models), which is
            printed model by model.
        best_params (dict): A dictionary of the best hyperparameters found during optimization.
    """
    def _fmt(value):
        """Format a metric with four decimals, tolerating missing/non-numeric values."""
        try:
            return f"{value:.4f}"
        except (TypeError, ValueError):
            return "N/A" if value is None else str(value)

    print("\n===== EVALUATION RESULTS SUMMARY =====")

    if isinstance(results_summary, pd.DataFrame):
        # `in` on a DataFrame tests column labels (model names), so the
        # task-keyed sections below would never match; print per model instead.
        for model_name in results_summary.columns:
            print(f"\n{model_name}:")
            for task, value in results_summary[model_name].items():
                print(f"  {task}: {_fmt(value)}")
    else:
        if 'classification' in results_summary:
            print("\nClassification Results:")
            for dataset, metrics in results_summary['classification'].items():
                print(f"\n{dataset}:")
                for metric, value in metrics.items():
                    print(f"  {metric}: {_fmt(value)}")

        if 'translation' in results_summary:
            print("\nTranslation Results:")
            translation = results_summary['translation']
            for label, key in (("A to B", "a_to_b"), ("B to A", "b_to_a")):
                score = (translation.get(key) or {}).get('average_score')
                print(f"  FLORES-200 Average AfriCOMET Score ({label}): {_fmt(score)}")

        if 'generation' in results_summary:
            print("\nGeneration Results:")
            print(f"  FLORES-200 Average Perplexity: {_fmt(results_summary['generation'].get('average_perplexity'))}")

        if 'zero_shot' in results_summary:
            print("\nZero-shot Results:")
            print(f"  Accuracy: {_fmt(results_summary['zero_shot'].get('accuracy'))}")

        if 'code_switch' in results_summary:
            print("\nCode-switch Results:")
            print(f"  Accuracy: {_fmt(results_summary['code_switch'].get('accuracy'))}")

    print("\nBest Hyperparameters:")
    for param, value in best_params.items():
        print(f"  {param}: {value}")

    print("\n======================================")

In [None]:
# Load configuration
config = load_config('../py/config.yaml')
# Set the device dynamically based on availability
config['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'  # Update device setting
# Optional authentication token; None when the config has no "auth_token" entry
auth_token = config.get("auth_token")
# Absolute path of the cache directory named in the config
cache_dir = os.path.abspath(config['cache']['dir'])
logger = setup_logging(config)
# Seed from the config so runs are repeatable
set_seed(config['seed'])
device = get_device(config['device'])
logger.info(f"Using device: {device}")

In [23]:
# Ensure the cache directory exists (no error if it is already there)
os.makedirs(cache_dir, exist_ok=True)

In [None]:
# Load and prepare datasets
# The loader is reused by later cells (integrity check, preprocessing, FLORES loading).
data_loader = DatasetLoader(config)
stratified_datasets = data_loader.prepare_stratified_datasets()

In [None]:
# Report the number of entries in each stratified split
for split, dataset in stratified_datasets.items():
    print(f"{split} dataset size: {len(dataset)}")

In [None]:
# Verify data integrity
# Stop the run with exit status 1 when the loader's check reports a failure.
if not data_loader.verify_data_integrity(stratified_datasets):
    logger.error("Data integrity check failed. Please review the datasets.")
    sys.exit(1)

In [None]:
# Print dataset information for the stratified splits
data_loader.print_dataset_info(stratified_datasets)

In [None]:
# Preprocess datasets
# Each split is preprocessed independently and stored under the same key.
preprocessed_datasets = {
    key: data_loader.preprocess_dataset(dataset)
    for key, dataset in stratified_datasets.items()
}

In [None]:
# Environment diagnostics: interpreter version, working directory and the
# contents of the current and parent directories.
print(f"Python version: {sys.version}")
print(f"Current working directory: {os.getcwd()}")
print(f"Directory contents: {os.listdir()}")
print(f"Parent directory contents: {os.listdir('..')}")

In [None]:
# Check if the models directory is in the Python path
# The path is resolved relative to the current working directory and is
# appended only once, so re-running this cell does not duplicate it.
models_dir = os.path.abspath(os.path.join('..', 'py', 'models'))
if models_dir not in sys.path:
    sys.path.append(models_dir)
    print(f"Added {models_dir} to Python path")

In [None]:
# Print contents of the models directory (raises if the directory does not exist)
print(f"Models directory contents: {os.listdir(models_dir)}")

In [32]:
# Initialize models
# Empty registries, filled per model name by the initialization loop below.
models, tokenizers = {}, {}
# Number of distinct labels in the training split; passed to the model constructors.
num_labels = len(preprocessed_datasets['train']['label'].unique())

In [None]:
# Initialize models with the configured parameters and authentication token
for model_name in config['model']['names']:
    print(f"Initializing model: {model_name}")
    print(f"Cache directory: {cache_dir}")
    # Never echo any part of the token: notebook outputs are saved and shared.
    # Reporting only its presence also avoids slicing None when no token is set.
    print(f"Auth token configured: {bool(auth_token)}")
    try:
        if model_name == "meta-llama/Llama-2-7b-hf":
            llama_model = Llama2Decoder(model_name, auth_token=auth_token, cache_dir=cache_dir)
            models[model_name] = llama_model.get_model()   # The model is already on the appropriate device
            tokenizers[model_name] = llama_model.get_tokenizer()
        elif model_name == "ernie-m-large":
            ernie_model = ErnieM(num_labels)
            # Use the device selected during configuration instead of a
            # hard-coded 'cuda', so the notebook also runs on CPU-only machines.
            models[model_name] = ernie_model.get_model().to(device)
            tokenizers[model_name] = ernie_model.get_tokenizer()
        else:
            model, tokenizer = get_model(model_name, num_labels=num_labels, auth_token=auth_token, cache_dir=cache_dir)
            models[model_name] = model.to(device)
            tokenizers[model_name] = tokenizer

        if models[model_name] is not None and tokenizers[model_name] is not None:
            print(f"Successfully initialized {model_name}")
        else:
            print(f"Failed to initialize {model_name}")
    except Exception as e:
        # Report which model failed, then re-raise: later cells need every model.
        print(f"Error initializing {model_name}: {str(e)}")
        raise

In [None]:
# Show which model and tokenizer keys were registered by the initialization loop
print("Available models:", list(models.keys()))
print("Available tokenizers:", list(tokenizers.keys()))

In [None]:
# Create custom datasets for PyTorch
# `datasets` maps a model key to its 'train' / 'eval' / 'benchmark' CustomDataset objects.
datasets = {}

# The same model type is passed to every CustomDataset built in this cell
model_type = 'encoder_decoder'  # New model type for the combined model

# For the combined Afro-XLMR and LLaMA model
# Requires 'afro-xlmr-large' to have been initialized above (KeyError otherwise)
combined_tokenizer = tokenizers['afro-xlmr-large']  # Assuming you're using the Afro-XLMR tokenizer for the combined model

logging.info(f"Creating datasets for combined Afro-XLMR and LLaMA model with model type: {model_type}")

# Use the key 'combined_afro_xlmr_llama'
datasets['combined_afro_xlmr_llama'] = {
    'train': CustomDataset(preprocessed_datasets['train'], combined_tokenizer, model_type=model_type),
    'eval': CustomDataset(preprocessed_datasets['eval'], combined_tokenizer, model_type=model_type),
    'benchmark': CustomDataset(preprocessed_datasets['benchmark'], combined_tokenizer, model_type=model_type)
}

# For ERNIE-M
# Requires 'ernie-m-large' to have been initialized above (KeyError otherwise)
ernie_m_tokenizer = tokenizers['ernie-m-large']

logging.info(f"Creating datasets for ERNIE-M with model type: {model_type}")

# Same three splits as above, tokenized with the ERNIE-M tokenizer
datasets['ernie-m-large'] = {
    'train': CustomDataset(preprocessed_datasets['train'], ernie_m_tokenizer, model_type=model_type),
    'eval': CustomDataset(preprocessed_datasets['eval'], ernie_m_tokenizer, model_type=model_type),
    'benchmark': CustomDataset(preprocessed_datasets['benchmark'], ernie_m_tokenizer, model_type=model_type)
}


In [None]:
# Debug output: dump the whole datasets dictionary
print(datasets)

In [None]:
# Debug output: model keys for which datasets were created
print("Available datasets:", datasets.keys())

In [None]:
# Debug output: split names available for the combined model
print("Keys in 'combined_afro_xlmr_llama':", datasets['combined_afro_xlmr_llama'].keys())


In [None]:
# Debug output: the train and eval dataset objects of the combined model
print("Train dataset:", datasets['combined_afro_xlmr_llama']['train'])
print("Eval dataset:", datasets['combined_afro_xlmr_llama']['eval'])


In [None]:
# Initialize evaluators
# One AfriCOMETEvaluator per dataset key; the keys match those of `datasets`,
# not the keys of `models`.
evaluators = {
    'combined_afro_xlmr_llama': AfriCOMETEvaluator(
        model=None,  # AfriCOMETEvaluator doesn't use the model directly
        tokenizer=tokenizers['afro-xlmr-large']
    ),
    'ernie-m-large': AfriCOMETEvaluator(
        model=None,  # AfriCOMETEvaluator doesn't use the model directly
        tokenizer=tokenizers['ernie-m-large']
    )
}

In [None]:
# Run optimization for combined Afro-XLMR and LLaMA
# The returned study is consumed by the cell that collects `studies`.
combined_study = run_combined_optimization(
    models['afro-xlmr-large'],  # Encoder
    models['meta-llama/Llama-2-7b-hf'],  # Decoder
    tokenizers['afro-xlmr-large'],  
    datasets['combined_afro_xlmr_llama'],  
    config,
    evaluators['combined_afro_xlmr_llama'] 
)

In [None]:
# Run optimization for ERNIE-M
# The returned study is consumed by the cell that collects `studies`.
ernie_study = run_ernie_optimization(
    models['ernie-m-large'], 
    tokenizers['ernie-m-large'], 
    datasets['ernie-m-large'], 
    config, 
    evaluators['ernie-m-large']
)

In [None]:
# Collect the finished optimization studies and extract their best hyperparameters
try:
    # Consolidate the results into the studies variable.
    # The keys must match the ones used by `datasets`, `evaluators` and the
    # trainer setup, which reads studies['combined_afro_xlmr_llama'].
    studies = {
        'combined_afro_xlmr_llama': combined_study,
        'ernie-m-large': ernie_study
    }
    # Extract best parameters
    best_params = {model_name: study.best_params for model_name, study in studies.items()}

    # Log best parameters and store them in the training section of the config
    for model_name, params in best_params.items():
        logger.info(f"Best hyperparameters for {model_name}: {params}")
        config['training'][model_name] = params

except Exception as e:
    logger.error(f"Error during or after hyperparameter optimization: {str(e)}")
    logger.exception("Exception details:")
    studies = {}  # Initialize an empty dictionary if optimization failed
    # Later cells pass best_params to the MLflow logging and summary helpers,
    # so it must be defined on the failure path as well.
    best_params = {}

In [None]:
# Hyperparameter analysis: announce the start of the plotting step in the log
logger.info("Performing hyperparameter analysis...")

In [None]:
# For every study, save four analysis plots to the output directory. The files
# are named after the study key: "<output_dir>/<key>_<plot name>.png".
for model_name, study in studies.items():
    # Plot hyperparameter importance
    importance_fig = plot_hyperparameter_importance(study)
    importance_fig.write_image(f"{config['model']['output_dir']}/{model_name}_hyperparameter_importance.png")

    # Plot optimization history
    history_fig = plot_optimization_history(study)
    history_fig.write_image(f"{config['model']['output_dir']}/{model_name}_optimization_history.png")

    # Plot parallel coordinate
    parallel_fig = plot_parallel_coordinate(study)
    parallel_fig.write_image(f"{config['model']['output_dir']}/{model_name}_parallel_coordinate.png")

    # Analyze and plot sensitivity
    sensitivity = analyze_hyperparameter_sensitivity(study)
    sensitivity_fig = plot_sensitivity_analysis(sensitivity)
    sensitivity_fig.write_image(f"{config['model']['output_dir']}/{model_name}_sensitivity_analysis.png")

    # Print sensitivity analysis results
    # NOTE(review): the loop unpacks (param, value) pairs, so `sensitivity` is
    # presumably a sequence of 2-tuples rather than a dict; confirm.
    print(f"\nHyperparameter Sensitivity Analysis for {model_name}:")
    for param, sens in sensitivity:
        print(f"{param}: {sens:.4f}")

In [None]:
# Record in the log that the analysis plots have been written
logger.info("Hyperparameter analysis complete. Plots saved in output directory.")

In [None]:
# Initialize trainers with the best hyperparameters from the studies
# Each trainer config is the global config with the study's best parameters
# merged on top; on a key clash the best parameter value wins.
# NOTE(review): this reads studies['combined_afro_xlmr_llama'] and
# studies['ernie-m-large'], so `studies` must contain exactly these keys.
trainers = {
    # Using CombinedEncoderDecoderTrainer for the encoder-decoder setup
    'combined_afro_xlmr_llama': CombinedEncoderDecoderTrainer(
        encoder=models['afro-xlmr-large'],  # Afro-XLMR as encoder
        decoder=models['meta-llama/Llama-2-7b-hf'],  # LLaMA as decoder
        tokenizer=tokenizers['afro-xlmr-large'],  # Using afro-xlmr tokenizer for both (or adjust as needed)
        config={**config, **studies['combined_afro_xlmr_llama'].best_params}  # Best hyperparameters from the tuning study
    ),
    'ernie-m-large': EncoderDecoderTrainer(
        model=models['ernie-m-large'],  # Single model setup (if it's used as an encoder-decoder internally)
        tokenizer=tokenizers['ernie-m-large'],
        config={**config, **studies['ernie-m-large'].best_params}  # Best hyperparameters for ERNIE-M
    )
}

In [None]:
# Train models with the best hyperparameters
# Trainer keys double as dataset keys, so each trainer gets its own splits.
for model_name, trainer in trainers.items():
    logger.info(f"Training model: {model_name}")
    train_dataset = datasets[model_name]['train']
    eval_dataset = datasets[model_name]['eval']
    
    try:
        trainer.train(train_dataset, eval_dataset)
        logger.info(f"Training completed for {model_name}")
    except Exception as e:
        # A failed training run is logged (message only) and does not stop the others
        logger.error(f"Error during training of {model_name}: {str(e)}")
        continue  # Move to the next model if there's an error

In [None]:
# Perform evaluations
# Result container: one dict per task, filled per model name by the evaluation
# loop; 'classification' and 'generation' start empty as well.
results = {
    'classification': {},
    'translation': {},
    'generation': {},
    'zero_shot': {},
    'code_switch': {},
    'hyperparameter_studies': studies
}

In [None]:
# Evaluation for all models using AfriCOMETEvaluator
# The FLORES-200 benchmark does not depend on the model, so load it once
# instead of once per model.
try:
    flores_data = data_loader.load_flores_200_benchmark()
except Exception as e:
    logger.error(f"Could not load FLORES-200 benchmark: {str(e)}")
    flores_data = None

# A bare truth test is ambiguous for DataFrame-like objects, so check for
# None and emptiness explicitly.
if flores_data is None:
    has_flores_data = False
else:
    try:
        has_flores_data = len(flores_data) > 0
    except TypeError:
        has_flores_data = True  # Object without a length: assume it holds data

for model_name, model in models.items():
    # `evaluators` is keyed differently from `models`; skip models without a
    # matching evaluator/tokenizer instead of aborting the cell with a KeyError.
    tokenizer = tokenizers.get(model_name)
    evaluator = evaluators.get(model_name)
    if tokenizer is None or evaluator is None:
        logger.warning(f"No tokenizer/evaluator registered for {model_name}; skipping evaluation")
        continue

    logger.info(f"Evaluating model: {model_name}")

    try:
        if has_flores_data:
            results['translation'][model_name] = evaluate_translation(model, tokenizer, flores_data, evaluator)
        else:
            logger.warning(f"FLORES data not available for model: {model_name}")

        # Zero-shot and code-switch evaluations
        # The evaluation helpers tokenize raw text columns themselves, so they
        # need the preprocessed benchmark table rather than the CustomDataset
        # wrapper stored in `datasets`.
        # NOTE(review): assumes preprocessed_datasets['benchmark'] is a
        # DataFrame with a 'split' column; confirm against DatasetLoader.
        benchmark_data = preprocessed_datasets['benchmark']

        zero_shot_data = benchmark_data[benchmark_data['split'] == 'zero_shot']
        if not zero_shot_data.empty:
            results['zero_shot'][model_name] = evaluate_zero_shot(model, tokenizer, zero_shot_data, evaluator)
        else:
            logger.info(f"No zero-shot data for model: {model_name}")

        code_switch_data = benchmark_data[benchmark_data['split'] == 'code_switch']
        if not code_switch_data.empty:
            results['code_switch'][model_name] = evaluate_code_switch(model, tokenizer, code_switch_data, evaluator)
        else:
            logger.info(f"No code-switch data for model: {model_name}")

        logger.info(f"Completed evaluation for {model_name}")
    except Exception as e:
        logger.error(f"Error during evaluation of {model_name}: {str(e)}")
        continue  # Move to the next model if there's an error


In [None]:
# Summarize and log results
results_summary = summarize_results(results, config)
# log_results_to_mlflow reads its metrics from results['summary'] as
# {model_name: {metric: value}}; the summary frame has models as columns, so
# to_dict() yields exactly that layout.
results['summary'] = results_summary.to_dict()
plot_results(results, config)
log_results_to_mlflow(results, config, best_params)

In [None]:
# Announce completion both in the log and in the notebook output
logger.info("Evaluation complete!")
print("Evaluation completed successfully. Results and visualizations have been saved and logged to MLflow.")

In [None]:
# Display results and visualizations
# Shows the overall heatmap image saved by plot_results in the output directory.
display(Image(filename=f"{config['model']['output_dir']}/overall_performance_heatmap.png"))

In [None]:
# Display hyperparameter optimization results
# The plots were written per study key, which is not the same set of names as
# config['model']['names'], so iterate over the studies and skip missing files
# instead of failing on the first one.
for model_name in studies:
    print(f"\nHyperparameter Optimization Results for {model_name}:")
    for plot_name in ("hyperparameter_importance", "optimization_history",
                      "parallel_coordinate", "sensitivity_analysis"):
        plot_path = f"{config['model']['output_dir']}/{model_name}_{plot_name}.png"
        if os.path.exists(plot_path):
            display(Image(filename=plot_path))
        else:
            print(f"  Plot not found: {plot_path}")

In [None]:
# Print the final summary; results_summary is the value returned by summarize_results
print_results_summary(results_summary, best_params)

In [None]:
# Final marker so a Run All shows that the last cell was reached
print("Evaluation notebook execution complete.")