# GPT-4o Evaluation




Hardware - CPU

# Installations


In [None]:
!pip install openai
!pip install tiktoken



# 00 - Google Drive Mount



In [None]:
# Mount Google Drive under ./gdrive; force_remount=True re-mounts even if a
# previous mount is still active in this Colab session.
from google.colab import drive
drive.mount('./gdrive', force_remount=True)

Mounted at ./gdrive


# 01 - Imports



In [None]:
import os
import glob
import re
import numpy as np
import pandas as pd
import json
import openai
import tiktoken

from openai import OpenAI
from google.colab import userdata
from pathlib import Path
from time import sleep

# 02 - Constants


In [None]:
# Tag embedded in the result file names (see RESULT_DATASETS below).
# NOTE(review): spelled "gtp-4o", not "gpt-4o" — looks like a typo, but renaming
# it changes which result files are matched/written; confirm before fixing.
MODEL_NAME = "gtp-4o"

# Root directories, read from the Colab user secrets.
PREPROCESSED_DATA_ROOT_PATH = userdata.get('IA_DATA_PREPROCESSED')
STAGING_AREA_ROOT_PATH = userdata.get('IA_DATA_STAGING')

# Dataset name (file stem before "_test") -> {"df": loaded CSV, "path": parent directory}.
TEST_DATASETS = {Path(dataset).stem.split("_test")[0]: {"df": pd.read_csv(dataset), "path":Path(dataset).parent} for dataset in glob.glob(f'{PREPROCESSED_DATA_ROOT_PATH}/*/*_test.csv')}
# Same layout for the "*_demo.csv" files.
DEMO_DATASETS = {Path(dataset).stem.split("_demo")[0]: {"df": pd.read_csv(dataset), "path":Path(dataset).parent} for dataset in glob.glob(f'{PREPROCESSED_DATA_ROOT_PATH}/*/*_demo.csv')}
# Names of datasets that already have a "*_{MODEL_NAME}_result_v2.csv" file on disk.
RESULT_DATASETS = [Path(dataset).stem.split(f"_{MODEL_NAME}_result_v2")[0] for dataset in glob.glob(f'{PREPROCESSED_DATA_ROOT_PATH}/*/*_{MODEL_NAME}_result_v2.csv')]

# OpenAI credentials, read from the Colab user secrets.
OPENAI_AUTH = userdata.get('OPENAI_TOKEN')
ORGANIZATION_ID = userdata.get("OPENAI_ORGANIZATION_ID")
PROJECT_ID = userdata.get("OPENAI_PROJECT_ID")

# Prompt preamble (Brazilian Portuguese): asks for binary sentiment classification
# with labels 1 / -1 and a JSON answer following the embedded schema; the text ends
# by announcing the examples that follow.
# NOTE(review): the schema text has no opening quote before "Objeto de saída" —
# left untouched here because editing the prompt changes what is sent to the model.
BASE_INSTRUCTION = """Você deverá realizar a tarefa de Classificação de Sentimento Binária em relação a polaridade de textos escritos no idioma português brasileiro considerando dois possíveis rótulos de saída: 1 para o sentimentos positivos ou -1 para negativos. A saída produzida deverá ser em formato JSON, seguindo o esquema definido entre os marcadores ```.```
{'type': 'object','description': Objeto de saída fornecido pelo classificador após a classificação de sentimento do texto de entrada.', 'properties': {'polaridade': {'type': 'integer','description': 'Polaridade em relação ao sentimento expressado no texto de entrada. Pode assumir 2 valores: [-1, 1]','enum': [-1,1]}},
  'required': ['polaridade']}```Considere os seguintes exemplos para realizar a predição:"""

# 03 - Functions


In [None]:
def generate_sorted_examples(dataframe:pd.DataFrame)->str:
    """
    Build the few-shot example section of the prompt from a demo DataFrame.

    Rows are emitted in the order 0, 3, 1, 4, 2, 5 (by position), i.e. each of
    the first three rows is followed by the row three positions after it.

    Args:
        dataframe (pd.DataFrame): The DataFrame containing the examples.
                                  It must have the columns 'text' and 'label'.
                                  An empty DataFrame yields an empty string;
                                  otherwise row ``i + 3`` must exist for each of
                                  the first three rows.

    Returns:
        str: The formatted examples (input text and its polarity), concatenated.

    Raises:
        IndexError: If the DataFrame is non-empty but too short to provide the
                    paired row for one of its first three rows.
    """
    texts = dataframe['text']
    labels = dataframe['label']

    # Rows are picked by position (.iloc) so the result does not depend on the
    # DataFrame carrying a default 0..n-1 index.
    formatted = []
    for first in range(min(3, len(dataframe))):
        for position in (first, first + 3):
            formatted.append(
                "\nExemplo:\n"
                f"'entrada': '{texts.iloc[position]}'\n"
                "'saida':{'polaridade': " + f"{labels.iloc[position]}" + "}"
            )
    return "".join(formatted)


def generate_classification_text(text:str)->str:
    """
    Build the prompt fragment that asks the model to classify one text.

    Args:
        text (str): The input text to be classified.

    Returns:
        str: The classification header, the quoted input text and the open
             'saida' slot, joined without any separator between them.
    """
    header = "\nClassificação de Sentimento:"
    quoted_input = f"'entrada': '{text}'"
    output_slot = "'saida':"
    return "".join([header, quoted_input, output_slot])


# OpenAI Batch management

def upload_batch_file(open_ia_client:openai.OpenAI, file_path:str|Path)->openai.types.file_object.FileObject|None:
    """
    Upload a batch input file through the OpenAI Files API.

    Args:
        open_ia_client (openai.OpenAI): The OpenAI client instance.
        file_path (str | Path): The path to the file to be uploaded.

    Returns:
        openai.types.file_object.FileObject | None: The uploaded file object,
        or None when ``file_path`` does not exist.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        return None
    # The context manager closes the handle once the upload call returns;
    # the original left it open for the rest of the session.
    with open(file_path, "rb") as batch_file:
        return open_ia_client.files.create(file=batch_file, purpose="batch")

def create_batch(open_ia_client:openai.OpenAI, batch_input_file:openai.types.file_object.FileObject)->openai.types.batch.Batch|None:
    """
    Create a chat-completions batch from an uploaded file.

    Args:
        open_ia_client (openai.OpenAI): The OpenAI client instance.
        batch_input_file (openai.types.file_object.FileObject): The uploaded file object.

    Returns:
        openai.types.batch.Batch | None: The created batch object, or None when
        ``batch_input_file`` is not a FileObject (e.g. the upload returned None).
    """
    if not isinstance(batch_input_file, openai.types.file_object.FileObject):
        return None
    # Use the client passed in by the caller rather than the notebook-level global.
    return open_ia_client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h")

def retrive_batch(open_ia_client:openai.OpenAI, batch: openai.types.batch.Batch)->openai.types.batch.Batch|None:
    """
    Fetch the current state of a batch from the API.

    The function name keeps its original spelling because other cells call it.

    Args:
        open_ia_client (openai.OpenAI): The OpenAI client instance.
        batch (openai.types.batch.Batch): The batch object to refresh.

    Returns:
        openai.types.batch.Batch | None: The freshly retrieved batch, or None
        when ``batch`` is not a Batch instance.
    """
    if not isinstance(batch, openai.types.batch.Batch):
        return None
    return open_ia_client.batches.retrieve(batch.id)

def get_batches(open_ia_client:openai.OpenAI, limit:int=15)->list:
    """
    Retrieve every batch visible to the client, paging ``limit`` items at a time.

    Args:
        open_ia_client (openai.OpenAI): The OpenAI client instance.
        limit (int, optional): The page size used for each list request. Defaults to 15.

    Returns:
        list: The batch objects from all pages, in the order the API returned them.
    """
    # Use the client passed in by the caller rather than the notebook-level global.
    page = open_ia_client.batches.list(limit=limit)
    # Copy so that extending the result does not mutate the first page object.
    batches_data = list(page.data)
    while page.has_more:
        page = open_ia_client.batches.list(limit=limit, after=page.last_id)
        batches_data.extend(page.data)
    return batches_data

def get_batch_status(batch:openai.types.batch.Batch)->str:
    """
    Read the status field of a batch.

    Args:
        batch (openai.types.batch.Batch): The batch object.

    Returns:
        str: The batch status; None when ``batch`` is not a Batch instance.
    """
    if not isinstance(batch, openai.types.batch.Batch):
        return None
    return batch.status

def get_batch_output_file_id(batch:openai.types.batch.Batch)->str:
    """
    Read the output file ID of a batch.

    Args:
        batch (openai.types.batch.Batch): The batch object.

    Returns:
        str: The batch's output file ID; None when ``batch`` is not a Batch instance.
    """
    if not isinstance(batch, openai.types.batch.Batch):
        return None
    return batch.output_file_id

def get_file_content(file_id:str, open_ia_client:openai.OpenAI)->list:
    """
    Download a JSONL file by ID and parse it into one object per line.

    Args:
        file_id (str): The ID of the file.
        open_ia_client (openai.OpenAI): The OpenAI client instance.

    Returns:
        list: The parsed JSON value of every non-blank line, in file order.
    """
    raw_text = open_ia_client.files.content(file_id).text
    # Split on "\n" only: str.splitlines() would also break on separators such
    # as U+2028 that may legally appear inside a JSON string.
    # Whitespace-only lines (e.g. a stray "\r") are skipped instead of being
    # handed to json.loads, which would raise on them.
    return [json.loads(line) for line in raw_text.split("\n") if line.strip()]

def get_response_text(response:json)->str:
    """
    Get the response text from a response dictionary.

    Args:
        response (dict): The response dictionary.

    Returns:
        str: The response text.
    """
    if isinstance(response, dict):
        return response['response']['body']['choices'][0]['message']['content']
    else:
        return None

def get_response_usage(response:json)->str:
    """
    Get the response usage from a response dictionary.

    Args:
        response (dict): The response dictionary.

    Returns:
        str: The response usage.
    """
    if isinstance(response, dict):
        return response['response']['body']['usage']
    else:
        return None

# 04 - Execution

In [None]:
# OpenAI client used by the execution cells below, authenticated with the
# organization, project and API key loaded in the constants section.
client = OpenAI(api_key=OPENAI_AUTH, organization=ORGANIZATION_ID, project=PROJECT_ID)

## Batches

In order to reduce the cost of API calls, we will use [OpenAI's batch API](https://platform.openai.com/docs/guides/batch). The first stage consists of creating the batch files for each dataset.

### Generating batch files

In [None]:
# Model sent in every request and used to pick the tokenizer. It is spelled out
# here because MODEL_NAME is only used to build result file names.
request_model = "gpt-4o"
# Upper bound (exclusive) on the summed prompt tokens of one batch file.
token_budget = 80_000
# Resolved once: the tokenizer does not change between requests.
encoding = tiktoken.encoding_for_model(request_model)

for key, dataset in TEST_DATASETS.items():
    # Datasets that already have a result file are not submitted again.
    if key in RESULT_DATASETS:
        continue

    instruction = BASE_INSTRUCTION + generate_sorted_examples(DEMO_DATASETS[key]['df'])

    # Greedy split: requests are appended to the current chunk in dataset order
    # until adding the next one would reach the token budget.
    chunks = [[]]
    chunk_tokens = 0
    for index, item in enumerate(dataset['df']['text']):
        classification_text = generate_classification_text(item)
        request = {"custom_id":f"request_{key}_{index}",
                   "method": "POST",
                   "url": "/v1/chat/completions",
                   "body": {
                        "model": request_model,
                        "messages": [
                            {"role": "system", "content": instruction},
                            {"role":"user", "content":classification_text}
                            ],
                        "max_tokens":20,
                        "n":1,
                        "seed":4,
                        "temperature":0,
                        "response_format":{"type": "json_object"}
                        }
                   }
        request_tokens = len(encoding.encode(instruction + classification_text))

        # A request that alone exceeds the budget still gets its own chunk
        # (the `chunks[-1]` guard), so the split always makes progress.
        if chunks[-1] and chunk_tokens + request_tokens >= token_budget:
            chunks.append([])
            chunk_tokens = 0
        chunks[-1].append(request)
        chunk_tokens += request_tokens

    # Every dataset, small or large, is written to the same directory with the
    # same naming scheme, so the upload step finds all files in one place.
    output_dir = Path('content/batches') / key
    output_dir.mkdir(parents=True, exist_ok=True)
    for batch_number, chunk in enumerate(chunks, start=1):
        if not chunk:
            # Only happens for an empty dataset; no file is written for it.
            continue
        with open(output_dir / f'{key}_batch-{batch_number:02d}_list.jsonl', 'w') as f:
            f.writelines(json.dumps(request) + "\n" for request in chunk)

After all files have been created, they are uploaded through the API.

### Upload batch files

In [None]:
# Group the generated .jsonl files by dataset key (file stem before "_batch").
# NOTE(review): this absolute path matches the relative 'content/batches'
# directory used when generating the files only if the working directory
# is /content — confirm.
jsonl_paths = {}
for path in Path('/content/content/batches').rglob('*.jsonl'):
    key = path.stem.split("_batch")[0]
    jsonl_paths.setdefault(key, []).append(path)

# Sorted so the files of a dataset are submitted in batch-number order.
jsonl_paths = {key: sorted(paths) for key, paths in jsonl_paths.items()}


def _wait_until_settled(batch):
    """Poll the batch once a minute until it leaves the in-flight statuses."""
    while batch.status in ['validating', 'in_progress', 'finalizing']:
        sleep(60)
        batch = retrive_batch(client, batch)
    return batch


# Files are submitted one at a time: the next file is only uploaded after the
# previous batch has settled.
for item in jsonl_paths['imdb-pt']:
    batch_input_file = upload_batch_file(client, item)
    batch = _wait_until_settled(create_batch(client, batch_input_file))
    if batch.status != 'completed':
        # One retry with the already uploaded file, after a short pause.
        sleep(120)
        batch = _wait_until_settled(create_batch(client, batch_input_file))
        if batch.status != 'completed':
            # Surface the failure; otherwise the missing rows are only noticed
            # later, when the results are persisted.
            print(f'Batch for {item.name} ended with status {batch.status!r} after one retry')

Then, the already processed batches are retrieved and sorted according to their original positions.

### Retrieve batch results

In [None]:
def _parse_custom_id(custom_id):
    """Split a 'request_{key}_{index}' custom_id into (key, int(index))."""
    # The index is taken from the last "_" so that dataset keys which
    # themselves contain underscores are kept whole.
    dataset_key, _, row_position = custom_id.removeprefix("request_").rpartition("_")
    return dataset_key, int(row_position)


batches = get_batches(open_ia_client=client, limit=100)
# NOTE(review): a completed batch presumably has no output file when every
# request in it failed — such batches are skipped; confirm against the API docs.
batches_completed = [
    (get_batch_status(batch), get_batch_output_file_id(batch))
    for batch in batches
    if get_batch_status(batch) == "completed" and get_batch_output_file_id(batch) is not None
]
batch_contents = [get_file_content(output_file_id, client) for _, output_file_id in batches_completed]

# results[key] holds three parallel lists: the raw prediction text, the token
# usage and the original row position of every response for that dataset.
results = {}
for responses in batch_contents:
    for response in responses:
        # Each response is filed under its own key, so an output file does not
        # have to contain a single dataset (or any response at all).
        key, row_position = _parse_custom_id(response['custom_id'])
        collected = results.setdefault(key, {'text': [], 'usage': [], 'id': []})
        collected['text'].append(get_response_text(response))
        collected['usage'].append(get_response_usage(response))
        collected['id'].append(row_position)

Lastly, the results are organized and persisted as CSV files.

### Persist batch results

In [None]:
# Refresh the list of datasets that already have a result file on disk.
# NOTE(review): this looks for "*_result_v2.csv" while the loop below writes
# "*_result.csv", so files written here are never matched — confirm which
# name is intended.
RESULT_DATASETS = [Path(dataset).stem.split(f"_{MODEL_NAME}_result_v2")[0] for dataset in glob.glob(f'{PREPROCESSED_DATA_ROOT_PATH}/*/*_{MODEL_NAME}_result_v2.csv')]

for key, collected in results.items():
    if key in RESULT_DATASETS:
        continue
    print(f'Processing key: {key}')

    # A problem with one dataset skips that dataset only; the remaining ones
    # are still persisted.
    if key not in TEST_DATASETS:
        print(f'No test dataset found for key: {key}')
        continue
    if not collected.get('id'):
        print(f'No responses collected for key: {key}')
        continue

    # Index by original row position, drop repeated rows (e.g. from a
    # re-submitted batch) and restore the dataset order.
    df = pd.DataFrame(collected, index=collected['id'])
    df = df.drop_duplicates(['id']).sort_index()

    test_df = TEST_DATASETS[key]['df']
    if test_df.shape[0] != df.shape[0]:
        print('Datasets do not have same shape')
        continue

    # Column assignment aligns on the index, i.e. on the original row position.
    test_df['predictions'] = df['text']
    test_df['usage'] = df['usage']
    test_df.to_csv(f'{str(TEST_DATASETS[key]["path"])}/{key}_{MODEL_NAME}_result.csv', index=False)
    print(f'{key} Processed with success')


Processing key: imdb-pt
imdb-pt Processed with success
