In [2]:
import os
import pandas as pd
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer
from dotenv import load_dotenv

# Import Bedrock libraries (assumed SDK available)
from boto3 import client, session
import json
load_dotenv()

True

In [3]:
class LLMComparisonFramework:
    def __init__(self, models, dataset, max_summary_length=100):
        print("Initializing the framework with models and dataset.")
        self.models = models  # List of model names
        self.dataset = dataset  # Dataset as a DataFrame with 'text' and 'reference_summary'
        self.max_summary_length = max_summary_length
        self.results = pd.DataFrame(columns=['Model', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'])
        self.bedrock_client = self.initialize_bedrock_client()

    def initialize_bedrock_client(self):
        print("Initializing AWS Bedrock client.")
        try:
            aws_session = session.Session()
            client = aws_session.client(
                'bedrock-runtime',
                region_name=os.getenv("AWS_REGION", "us-east-1"),
                aws_access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
                aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY")
            )
            print("AWS Bedrock client initialized successfully.")
            return client
        except Exception as e:
            print(f"Failed to initialize AWS Bedrock client: {e}")
            raise

    def load_model(self, model_name):
        print(f"Loading model: {model_name}")
        if model_name.startswith("bedrock:"):
            return None, model_name  # Bedrock models do not use tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return tokenizer, model

    def generate_summary(self, model, tokenizer, text):
        print("Generating summary for a given text.")
        if isinstance(model, str) and model.startswith("bedrock:"):
            return self.generate_bedrock_summary(model, text)

        inputs = tokenizer.encode(text, return_tensors="pt", truncation=True)
        outputs = model.generate(inputs, max_length=self.max_summary_length, num_beams=4)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)

    def client(self, service_name):
        print(f"Creating client for service: {service_name}")
        import boto3
        return boto3.client(service_name)

    def generate_bedrock_summary(self, model_name, text):
        print(f"Generating summary using Amazon Bedrock model: {model_name}")
        model_id = model_name.split(":")[1]  # Extract Bedrock model ID
        payload = {
            "inputText": text,
                    "textGenerationConfig": {
                        "maxTokenCount": 4096,
                        "stopSequences": [],
                        "temperature": 0,
                        "topP": 1
                    }
                    }
        try:
            response = self.bedrock_client.invoke_model(
            modelId=model_id,
            contentType="application/json",
            accept="application/json",
            body=json.dumps(payload)
        )
            print("Successfully generated summary using Bedrock.")
            return response["body"].read().decode("utf-8")
        except Exception as e:
            print(f"Error generating summary with Bedrock model {model_name}: {e}")
            raise

    def calculate_rouge(self, predicted, reference):
        print("Calculating ROUGE scores.")
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(reference, predicted)
        return {
            'ROUGE-1': scores['rouge1'].fmeasure,
            'ROUGE-2': scores['rouge2'].fmeasure,
            'ROUGE-L': scores['rougeL'].fmeasure
        }

    def evaluate_model(self, model_name):
        print(f"Evaluating model: {model_name}")
        tokenizer, model = self.load_model(model_name)
        rouge1_scores, rouge2_scores, rougeL_scores = [], [], []

        for _, row in self.dataset.iterrows():
            text = row['text']
            reference_summary = row['reference_summary']

            predicted_summary = self.generate_summary(model, tokenizer, text)
            rouge_scores = self.calculate_rouge(predicted_summary, reference_summary)

            rouge1_scores.append(rouge_scores['ROUGE-1'])
            rouge2_scores.append(rouge_scores['ROUGE-2'])
            rougeL_scores.append(rouge_scores['ROUGE-L'])

        new_result = {
            'Model': model_name,
            'ROUGE-1': sum(rouge1_scores) / len(rouge1_scores),
            'ROUGE-2': sum(rouge2_scores) / len(rouge2_scores),
            'ROUGE-L': sum(rougeL_scores) / len(rougeL_scores)
        }
        self.results = pd.concat([self.results, pd.DataFrame([new_result])], ignore_index=True)

    def evaluate_all_models(self):
        print("Evaluating all models on the dataset.")
        for model_name in self.models:
            print(f"Evaluating {model_name}...")
            self.evaluate_model(model_name)

    def visualize_results(self):
        print("Visualizing ROUGE scores for all models.")
        print("Visualization is not available as matplotlib is not installed.")
        print(self.results)

    def deploy_best_model(self):
        print("Selecting and deploying the best model based on ROUGE-L score.")
        best_model_name = self.results.loc[self.results['ROUGE-L'].idxmax(), 'Model']
        print(f"Deploying best model: {best_model_name}")
        return self.load_model(best_model_name)

# Example Usage
if __name__ == "__main__":
    # Example dataset with real-world text and summaries (e.g., news articles)
    print("Creating example dataset.")
    data = {
        'text': [
            "OpenAI has announced a new AI model that performs natural language processing tasks more efficiently.",
            "NASA's Perseverance rover has collected new samples that provide insights into the possibility of past life on Mars."
        ],
        'reference_summary': [
            "OpenAI introduces a new efficient AI model for NLP.",
            "NASA's rover collects samples hinting at past life on Mars."
        ]
    }
    dataset = pd.DataFrame(data)

    # Models to compare
    print("Defining models to compare.")
    models = ["facebook/bart-large-cnn", "google/pegasus-xsum", "bedrock:amazon.titan-text-lite-v1"]

    # Initialize framework
    print("Initializing LLMComparisonFramework.")
    framework = LLMComparisonFramework(models, dataset)

    # Evaluate models
    print("Starting evaluation of all models.")
    framework.evaluate_all_models()

    # Visualize results
    print("Visualizing evaluation results.")
    framework.visualize_results()

    # Deploy the best model
    # print("Deploying the best model.")
    # framework.deploy_best_model()

Creating example dataset.
Defining models to compare.
Initializing LLMComparisonFramework.
Initializing the framework with models and dataset.
Initializing AWS Bedrock client.
AWS Bedrock client initialized successfully.
Starting evaluation of all models.
Evaluating all models on the dataset.
Evaluating facebook/bart-large-cnn...
Evaluating model: facebook/bart-large-cnn
Loading model: facebook/bart-large-cnn


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Generating summary for a given text.
Calculating ROUGE scores.
Generating summary for a given text.
Calculating ROUGE scores.
Evaluating google/pegasus-xsum...
Evaluating model: google/pegasus-xsum
Loading model: google/pegasus-xsum


  self.results = pd.concat([self.results, pd.DataFrame([new_result])], ignore_index=True)
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating summary for a given text.
Calculating ROUGE scores.
Generating summary for a given text.
Calculating ROUGE scores.
Evaluating bedrock:amazon.titan-text-lite-v1...
Evaluating model: bedrock:amazon.titan-text-lite-v1
Loading model: bedrock:amazon.titan-text-lite-v1
Generating summary for a given text.
Generating summary using Amazon Bedrock model: bedrock:amazon.titan-text-lite-v1
Successfully generated summary using Bedrock.
Calculating ROUGE scores.
Generating summary for a given text.
Generating summary using Amazon Bedrock model: bedrock:amazon.titan-text-lite-v1
Successfully generated summary using Bedrock.
Calculating ROUGE scores.
Visualizing evaluation results.
Visualizing ROUGE scores for all models.
Visualization is not available as matplotlib is not installed.
                               Model   ROUGE-1   ROUGE-2   ROUGE-L
0            facebook/bart-large-cnn  0.268231  0.086562  0.234596
1                google/pegasus-xsum  0.253968  0.000000  0.216931
2  bedro