# Generate document-level assessment

In [3]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.57.4
    Uninstalling openai-1.57.4:
      Successfully uninstalled openai-1.57.4
Successfully installed openai-0.28.0


In [2]:
import openai
import os
import json
import pandas as pd

In [5]:
!pip show openai

Name: openai
Version: 0.28.0
Summary: Python client library for the OpenAI API
Home-page: https://github.com/openai/openai-python
Author: OpenAI
Author-email: support@openai.com
License: 
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, requests, tqdm
Required-by: 
^C


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Set up your OpenAI API key
api_key = "example_key"
openai.api_key = api_key

# Define the evaluation prompt
evaluation_prompt = """
Você deve avaliar a resposta de um modelo para a tarefa 1 demandada e fornecer uma pontuação de 0 a 10, junto com uma explicação para a pontuação.
Considere:

1. A aderência ao pedido no prompt original.
2. Os temas serem os mais relevantes.
3. A aderência ao formato solicitado. Seja em escrita e quantidade de tópicos.

Tarefa original:
{original_prompt}

Texto original:
{context}

Resposta do Modelo:
{response}

Qual é a pontuação (0 a 10) e a explicação? Forneça no formato:
{{"score": <pontuação>, "explanation": "<explicação>"}}
"""

def evaluate_response_with_gpt(original_prompt, context, response, model="gpt-4-turbo"):
    """Evaluate a single response using the ChatGPT API."""
    try:
        evaluation_question = evaluation_prompt.format(
            original_prompt=original_prompt,
            context=context,
            response=response
        )
        api_response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "Você é um avaliador expert"},
                {"role": "user", "content": evaluation_question}
            ]
        )
        evaluation_result = api_response['choices'][0]['message']['content'].strip()
        result = json.loads(evaluation_result)  # Validate JSON format
        if "score" in result and "explanation" in result:
            return result
        else:
            return {"score": None, "explanation": "Invalid response format"}
    except Exception as e:
        print(f"Error evaluating response: {e}")
        return {"score": None, "explanation": "Error during evaluation"}

def read_file(file_path):
    """Read and return the contents of a file."""
    with open(file_path, 'r') as file:
        return file.read()

# Original prompt that was given to the models
original_prompt = """
      Queria pedir para você realizar duas tarefas sequencialmente:

      Tarefa 1) Apresentar os tópicos mais importantes desse texto. Limite máximo de 10 tópicos. Os tópicos devem ser de no máximo 5 palavras e devem ser assuntos, não o detalhamento do que foi falado. Liste os tópicos de em tópicos com '-'.
      Tarefa 2) Avaliar pelas perguntas do público se o público teve uma percepção positiva do apresentado. A resposta deve ter 1 palavra: positivo ou negativo.

      Para todas as respostas deve-se começar pelo texto: 'Tarefa x:' e usar tópicos usando '-'
      Não deve-se usar *
    """

# Paths in Colab (use mounted Google Drive or local paths)
original_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Divided_text/qna/"
qwen_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/qwen/unsupervised/"
llama_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/llama/unsupervised/"
output_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/contextualized/"
error_file = os.path.join(output_folder, "error.txt")

os.makedirs(output_folder, exist_ok=True)  # Ensure the output folder exists

# Example workflow: Processing all files
files = [f for f in os.listdir(original_folder) if f.endswith('.txt')]
#files = files[:5]

num_files = len(files)
cont = 0
for filename in files:
    name_file = filename.split(".")[0]
    print(f"Processing file {name_file}")
    cont += 1
    print(f"{cont}/{num_files}")

    try:
        # Read the original text and model responses
        original_text = read_file(os.path.join(original_folder, filename))
        qwen_response = read_file(os.path.join(qwen_folder, f"{name_file}.txt_output.txt"))
        llama_response = read_file(os.path.join(llama_folder, f"{name_file}.txt_output.txt"))

        # Evaluate responses
        qwen_result = evaluate_response_with_gpt(original_prompt, original_text, qwen_response)
        llama_result = evaluate_response_with_gpt(original_prompt, original_text, llama_response)

        # Save results
        evaluation_data = {
            "llama": llama_result,
            "qwen": qwen_result
        }
        output_file = os.path.join(output_folder, f"{name_file}_evaluation.json")
        with open(output_file, 'w') as f:
            json.dump(evaluation_data, f, indent=4, ensure_ascii=False)

        print(f"Evaluation for {filename} completed. Results saved.")

    except Exception as e:
        # Log errors
        with open(error_file, 'a') as ef:
            ef.write(f"{filename}\n")
        print(f"Evaluation for {filename} failed. Error logged.")

Processing file prbc-2012-1
1/375
Evaluation for prbc-2012-1.txt completed. Results saved.
Processing file bmgb-2022-2
2/375
Evaluation for bmgb-2022-2.txt completed. Results saved.
Processing file brsr-2013-4
3/375
Evaluation for brsr-2013-4.txt completed. Results saved.
Processing file abcb-2022-3
4/375
Evaluation for abcb-2022-3.txt completed. Results saved.
Processing file brsr-2007-4
5/375
Evaluation for brsr-2007-4.txt completed. Results saved.
Processing file abcb-2010-2
6/375
Evaluation for abcb-2010-2.txt completed. Results saved.
Processing file prbc-2010-2
7/375
Evaluation for prbc-2010-2.txt completed. Results saved.
Processing file bbas-2006-4
8/375
Evaluation for bbas-2006-4.txt completed. Results saved.
Processing file bbas-2019-2
9/375
Evaluation for bbas-2019-2.txt completed. Results saved.
Processing file sanb-2015-3
10/375
Evaluation for sanb-2015-3.txt completed. Results saved.
Processing file itub-2018-4
11/375
Evaluation for itub-2018-4.txt completed. Results save

# Generate model-level assessment

## New

In [None]:
api_key = "example_key"

In [5]:
import os
import json
from collections import defaultdict

def aggregate_model_evaluations(model_folder):
    """Aggregate evaluations for a specific model."""
    aggregated_explanations = []
    aggregated_scores = []

    # Collect all evaluation files for the model
    evaluation_files = [f for f in os.listdir(model_folder) if f.endswith('_evaluation.json')]

    for eval_file in evaluation_files:
        with open(os.path.join(model_folder, eval_file), 'r') as f:
            eval_data = json.load(f)
            if "score" in eval_data and eval_data["score"] is not None:
                aggregated_scores.append(eval_data["score"])
                aggregated_explanations.append(eval_data["explanation"])

    return aggregated_scores, aggregated_explanations


def generate_full_assessment(aggregated_scores, aggregated_explanations, model_name, api_key):
    """Generate a full model assessment using the ChatGPT API."""
    # Prepare aggregated data
    explanations_text = "\n\n".join(aggregated_explanations)
    average_score = sum(aggregated_scores) / len(aggregated_scores)

    # Create evaluation prompt
    assessment_prompt = f"""
    Você deve fornecer uma avaliação geral para o modelo '{model_name}' com base nos dados a seguir:

    - Pontuação média: {average_score:.2f}
    - Explicações agregadas:
    {explanations_text}

    Avalie os seguintes aspectos:
    1. Os pontos mais fortes do modelo.
    2. Os pontos mais fracos do modelo.
    3. Recomendações para melhoria.
    4. Um resumo geral da performance.

    Forneça a resposta no seguinte formato:
    {{
        "strengths": ["<forte1>", "<forte2>", ...],
        "weaknesses": ["<fraqueza1>", "<fraqueza2>", ...],
        "recommendations": ["<recomendação1>", "<recomendação2>", ...],
        "summary": "<resumo>"
    }}
    """

    # Call ChatGPT API
    try:
        import openai
        openai.api_key = api_key
        api_response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "Você é um avaliador expert em modelos de linguagem."},
                {"role": "user", "content": assessment_prompt}
            ]
        )
        assessment_result = api_response['choices'][0]['message']['content'].strip()
        return json.loads(assessment_result)
    except Exception as e:
        print(f"Error generating assessment for {model_name}: {e}")
        return None


# Main Workflow
api_key = "example_key"
models_folders = {
    "llama": "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_llama/contextualized/",
    "qwen":  "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_qwen/contextualized/"
}
output_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/full_model_assessments/"

os.makedirs(output_folder, exist_ok=True)

# Process each model
for model_name, model_folder in models_folders.items():
    print(f"Processing model: {model_name}")

    # Aggregate evaluations for the model
    aggregated_scores, aggregated_explanations = aggregate_model_evaluations(model_folder)

    # Generate the full assessment
    if aggregated_scores and aggregated_explanations:
        final_assessment = generate_full_assessment(aggregated_scores, aggregated_explanations, model_name, api_key)

        # Save the assessment to a separate file
        if final_assessment:
            assessment_file = os.path.join(output_folder, f"{model_name}_assessment.json")
            with open(assessment_file, 'w') as f:
                json.dump(final_assessment, f, indent=4, ensure_ascii=False)

            print(f"Assessment for {model_name} saved to {assessment_file}.")
        else:
            print(f"Failed to generate assessment for {model_name}.")
    else:
        print(f"No data available for model: {model_name}.")

Processing model: llama
No data available for model: llama.
Processing model: qwen
No data available for model: qwen.


In [22]:
import os
import json
from collections import defaultdict

def aggregate_evaluations(evaluation_folder):
    """Aggregate evaluations from multiple JSON files into model-specific data."""
    aggregated_scores = defaultdict(list)
    aggregated_explanations = defaultdict(list)

    # Iterate through all JSON evaluation files
    evaluation_files = [f for f in os.listdir(evaluation_folder) if f.endswith('.json')]
    for eval_file in evaluation_files:
        file_path = os.path.join(evaluation_folder, eval_file)
        with open(file_path, 'r') as f:
            eval_data = json.load(f)

            # Aggregate data for each model
            for model, evaluation in eval_data.items():
                if "score" in evaluation and evaluation["score"] is not None:
                    aggregated_scores[model].append(evaluation["score"])
                    aggregated_explanations[model].append(evaluation["explanation"])

    return aggregated_scores, aggregated_explanations


def generate_full_assessment(aggregated_scores, aggregated_explanations, model_name, api_key):
    """Generate a full model assessment using the ChatGPT API."""
    explanations_text = "\n\n".join(aggregated_explanations[model_name])
    average_score = sum(aggregated_scores[model_name]) / len(aggregated_scores[model_name])

    # Create evaluation prompt
    assessment_prompt = f"""
    Você deve fornecer uma avaliação geral para o modelo '{model_name}' com base nos dados a seguir:

    - Pontuação média: {average_score:.2f}
    - Explicações agregadas:
    {explanations_text}

    Avalie os seguintes aspectos:
    1. Os pontos mais fortes do modelo.
    2. Os pontos mais fracos do modelo.
    3. Recomendações para melhoria.
    4. Um resumo geral da performance.

    Forneça a resposta no seguinte formato:
    {{
        "strengths": ["<forte1>", "<forte2>", ...],
        "weaknesses": ["<fraqueza1>", "<fraqueza2>", ...],
        "recommendations": ["<recomendação1>", "<recomendação2>", ...],
        "summary": "<resumo>"
    }}
    """

    # Call ChatGPT API
    try:
        import openai
        openai.api_key = api_key
        api_response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "Você é um avaliador expert em modelos de linguagem."},
                {"role": "user", "content": assessment_prompt}
            ]
        )
        assessment_result = api_response['choices'][0]['message']['content'].strip()
        return json.loads(assessment_result)
    except Exception as e:
        print(f"Error generating assessment for {model_name}: {e}")
        return None


# Main Workflow
evaluation_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/contextualized/"
output_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/full_model_assessments/"
api_key = "example_key"

os.makedirs(output_folder, exist_ok=True)

# Aggregate evaluations from all JSON files
aggregated_scores, aggregated_explanations = aggregate_evaluations(evaluation_folder)

# Generate assessments for each model
for model_name in aggregated_scores.keys():
    print(f"Generating full assessment for model: {model_name}")

    # Generate the full assessment
    if aggregated_scores[model_name] and aggregated_explanations[model_name]:
        final_assessment = generate_full_assessment(aggregated_scores, aggregated_explanations, model_name, api_key)

        # Save the assessment to a separate file
        if final_assessment:
            assessment_file = os.path.join(output_folder, f"{model_name}_assessment.json")
            with open(assessment_file, 'w') as f:
                json.dump(final_assessment, f, indent=4, ensure_ascii=False)

            print(f"Assessment for {model_name} saved to {assessment_file}.")
        else:
            print(f"Failed to generate assessment for {model_name}.")
    else:
        print(f"No data available for model: {model_name}.")


Generating full assessment for model: llama
Assessment for llama saved to /content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/full_model_assessments/llama_assessment.json.
Generating full assessment for model: qwen
Assessment for qwen saved to /content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/full_model_assessments/qwen_assessment.json.


In [23]:
file_path = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/full_model_assessments/qwen_assessment.json"
with open(file_path, 'r') as f:
  assessment = json.load(f)
assessment

{'strengths': ["O modelo consegue, em alguns casos, utilizar o formato de listagem com '-' conforme solicitado.",
  'A capacidade de sintetizar informações em tópicos, embora inconsistente, é evidente em algumas das respostas adequadas.',
  'Quando adere ao formato solicitado, o modelo apresenta respostas concisas e diretas.'],
 'weaknesses': ['Inconsistência na precisão e relevância dos tópicos listados em relação ao texto original.',
  'Frequentemente, o modelo excede o limite de cinco palavras por tópico e não segue a estrutura de listagem com hífens de forma consistente.',
  'Os tópicos listados muitas vezes são vagos, genéricos ou não capturam os elementos essenciais do texto original, levando a uma compreensão superficial ou imprecisa do conteúdo discutido.',
  'Falha em adequar-se estritamente às instruções detalhadas, especialmente em manter cada tópico dentro do limite de palavras e em formatar corretamente conforme o pedido.'],
 'recommendations': ['Melhorar o mecanismo de in

## Old

In [None]:
import os
import json
import openai

# Path to the evaluation results folder
output_folder = "/content/drive/MyDrive/Portfolio Projects/Mestrado/Outputs/results/judge_chatgpt/"

# Function to read and aggregate explanations
def aggregate_explanations(folder):
    llama_explanations = []
    qwen_explanations = []

    files = [f for f in os.listdir(folder) if f.endswith('_evaluation.json')]

    for file in files:
        with open(os.path.join(folder, file), 'r') as f:
            data = json.load(f)
            if "llama" in data and "explanation" in data["llama"]:
                llama_explanations.append(data["llama"]["explanation"])
            if "qwen" in data and "explanation" in data["qwen"]:
                qwen_explanations.append(data["qwen"]["explanation"])

    return llama_explanations, qwen_explanations

# Function to summarize explanations
def summarize_explanations(explanations, model_name, api_key):
    openai.api_key = api_key

    prompt = f"""
    You are an expert in evaluating machine learning models. Summarize the following evaluations for the {model_name} model.

    Evaluations:
    {explanations}

    Provide a detailed analysis of the model's characteristics, positive points, and negative points. Be concise but thorough.
    """

    try:
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are an AI model evaluation expert."},
                {"role": "user", "content": prompt}
            ]
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error summarizing explanations: {e}")
        return None

# Aggregate explanations
llama_explanations, qwen_explanations = aggregate_explanations(output_folder)

# Summarize using the API
api_key = "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"
llama_summary = summarize_explanations(" ".join(llama_explanations), "llama", api_key)
qwen_summary = summarize_explanations(" ".join(qwen_explanations), "qwen", api_key)

# Save summaries to file
with open(os.path.join(output_folder, "summary_llama.txt"), 'w') as f:
    f.write(llama_summary)

with open(os.path.join(output_folder, "summary_qwen.txt"), 'w') as f:
    f.write(qwen_summary)

print("Summaries generated and saved successfully.")


Summaries generated and saved successfully.


In [None]:
llama_summary

"### Analysis of the Llama Model's Evaluation Responses\n\n#### Positive Points:\n1. **Clarity in Enumerating Topics:** The model is often clear in listing relevant topics discussed during the teleconferences or events, indicating an ability to identify key themes from the text.\n2. **Structural Organization:** When it works well, the model's responses are organized in a format that separates topics neatly, which could help in scanning and locating information quickly.\n3. **Comprehension of General Content:** The model can grasp general information about the content, such as recognizing the nature of the document being a financial teleconference or an official business presentation.\n\n#### Negative Points:\n1. **Lack of Depth and Detail:** The model's responses are often superficial and fail to delve into the specifics and intricacies of financial discussions, missing out on critical details that were part of the original transcripts.\n2. **Failure to Adhere to Format Specifications:

In [None]:
# Paths to the summary files
llama_summary_path = os.path.join(output_folder, "summary_llama.txt")
qwen_summary_path = os.path.join(output_folder, "summary_qwen.txt")

# Read and print summaries
with open(llama_summary_path, 'r') as f:
    llama_summary = f.read()
    print("Llama Summary:\n")
    print(llama_summary)
    print("\n" + "-" * 50 + "\n")

with open(qwen_summary_path, 'r') as f:
    qwen_summary = f.read()
    print("Qwen Summary:\n")
    print(qwen_summary)
    print("\n" + "-" * 50 + "\n")

In [None]:
print(llama_summary)

### Analysis of the Llama Model's Evaluation Responses

#### Positive Points:
1. **Clarity in Enumerating Topics:** The model is often clear in listing relevant topics discussed during the teleconferences or events, indicating an ability to identify key themes from the text.
2. **Structural Organization:** When it works well, the model's responses are organized in a format that separates topics neatly, which could help in scanning and locating information quickly.
3. **Comprehension of General Content:** The model can grasp general information about the content, such as recognizing the nature of the document being a financial teleconference or an official business presentation.

#### Negative Points:
1. **Lack of Depth and Detail:** The model's responses are often superficial and fail to delve into the specifics and intricacies of financial discussions, missing out on critical details that were part of the original transcripts.
2. **Failure to Adhere to Format Specifications:** The mod

In [None]:
print(qwen_summary)

**Análise Detalhada da Resposta do Modelo:**

**Características Gerais:**
- A resposta do modelo tenta abordar uma série de tópicos genericamente relacionados a uma teleconferência de resultados financeiros.
- A estrutura da resposta é fragmentada, apresentando listagem de tópicos sem desenvolvimento substancial ou contextualização.
- Falta de especificidade e profundidade analítica nos tópicos mencionados.

**Pontos Positivos:**
- O modelo consegue identificar e listar alguns elementos comuns discutidos em teleconferências de resultados como estratégias de negócios, discussões financeiras e Q&A (perguntas e respostas).
- Mantém um nível básico de clareza ao enunciar categorias ou temas genéricos que poderiam ser discutidos durante uma conferência.

**Pontos Negativos:**
- **Falta de Profundidade:** A resposta não entra em detalhes sobre as discussões financeiras específicas, estratégias, dados econômicos ou respostas específicas às perguntas feitas durante a teleconferência. Não há an