In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Installs

In [20]:
%%capture
!pip install deep_translator
!pip install tiktoken

# Imports

In [21]:
import pandas as pd
import json
import re
import itertools
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
import warnings
warnings.filterwarnings("ignore")

# Load Data

## Original

In [22]:
df_o = pd.read_json('https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json')
df_o = df_o.astype(str)
df_o = df_o.reset_index(names='id')
df_o.shape

(52002, 4)

In [23]:
df_inst = df_o[['id','instruction']].rename(columns={'instruction':'instance'})
df_inst['class_type'] = 'ins'
df_inp = df_o[['id', 'input']].rename(columns={'input':'instance'})
df_inp['class_type'] = 'inp'
df_out = df_o[['id', 'output']].rename(columns={'output':'instance'})
df_out['class_type'] = 'out'
df_instances = pd.concat([df_inst, df_inp, df_out])
df_inst.shape, df_inp.shape, df_out.shape, 'Intances Complete:', df_instances.shape

((52002, 3), (52002, 3), (52002, 3), 'Intances Complete:', (156006, 3))

In [24]:
df_instances['id'] = df_instances['id'].astype(str)+"_"+df_instances['class_type']
df_instances.head()

Unnamed: 0,id,instance,class_type
0,0_ins,Give three tips for staying healthy.,ins
1,1_ins,What are the three primary colors?,ins
2,2_ins,Describe the structure of an atom.,ins
3,3_ins,How can we reduce air pollution?,ins
4,4_ins,Describe a time when you had to make a difficult decision.,ins


In [25]:
52002*3 == df_instances.shape[0]

True

## Translated

In [26]:
df_t = pd.read_json('/content/drive/MyDrive/Colab Notebooks/DATASETS/alpaca_data_translated.json')
df_t = df_t.rename(columns={'original_index':'id'})
df_t = df_t.astype({'instruction':str, 'input':str, 'output':str})
df_t.shape

(52002, 7)

#Functions

In [27]:
def code_detector(text: str) -> bool:
  # palavras e opredores mais comuns que encotrei nas linguagens de programação(se colocar mais, da muita sauda que não é código em si)
    # \b significa que essa palavra deve estar sozinha, ou seja, " def " retorna true e " define " retorna false.
    # O sinal | funciona como "OU", ele vai retornar true caso encontre algum dos padrões, para melhorar os resultados, pode ser preciso tirar termos como if, void, function, try, return, entre outras.
    code_pattern = r"""(
        \bdef\b|
        \bclass\b|
        \belse\b|
        \breturn\b|
        \bprint\b|
        \bimport\b|
        \blambda\b|
        \belif\b|
        \basync\b|
        \bfunction\b|
        \bconst\b|
        \bvar\b|
        \bstatic\b|
        \bvoid\b|
        \bloop\b|
        \benum\b|
        \bimpl\b|
        \bcout|
        \bcin|
        \bnamespace\b|
        std|
        printf|
        System\.Out\.Print|
        System\.Out\.Println|
        \++|
        \--|
        \+=|
        \-=|
        \*=|
        \/=|
        \%=|
        \==|
        \===|
        \!=|
        \!==|
        \>=|
        \<=|
        \&&|
        \||
    )"""

    return bool(re.search(code_pattern, str(text)))

def contains_image_or_link(text):
    image_link_pattern = r"\b\w+\.(png|jpg|jpeg|com)\b"
    return bool(re.search(image_link_pattern, text, re.IGNORECASE))

def number_detector(text:str):
    padrao_numerico = r"""
    (
      \[\s*(\d+(\.\d+)?)(,\s*(\d+(\.\d+)?))+\s*\]                          # Listas com dois ou mais números, ex: [1,2,3, ....]
      |
      \(\d+(,\d+)+\)                                                       # Tuplas com dois ou mais números, ex: (1,2,3, ....)
      |
      \[\s*\[\d+(,\s*\d+)*\](,\s*\[\d+(,\s*\d+)*\])*\s*\]                  # Matrizes numéricas, ex: [[1,2],[3,4], ....]
      |
      \d+[\+\*\=]\d+                                                       # Operações matemáticas sem divisões e subtrações, ex: 1+2=3 (sem / e -)
      |
      \d+[xXyY](\s*[\+\*/=\-]\s*\d+[xXyY]*)*                               # Expressões com variáveis, ex: 5x + 2y
      |
      \b[xXyY]\s*=\s*\d+                                                   # Variáveis como x = 343 ou y = 4
      |
      ^\d+(,\s*\d+)+$                                                      # Sequências de números separados por vírgula, ex: 10, 20, 30, ....
    )
    """
    # re.search pesquisa a string para encontrar qualquer um dos padrões numéricos especificados.
    # re.VERBOSE permite comentários e espaçamento na expressão regular para maior legibilidade.
    if re.search(padrao_numerico, str(text), re.VERBOSE):
        # Verifica se não contem imagens ou link
        if not contains_image_or_link(text):
            # caso não tenha imagem ou link, retorna true
            return True
    # caso tenha imagem ou link, retorna false
    return False

def is_all_numeric(text):
    """
    Verifica se uma string contém apenas números, operadores matemáticos (+, -, *, /, etc.),
    variáveis (letras) ou se é uma lista de expressões válidas.

    Parâmetros:
    text (str): A string a ser verificada.

    Retorna:
    bool: True se a string contém apenas números, operadores matemáticos, variáveis ou listas.
    """
    # Remover espaços em branco extras
    text = text.strip()

    # Verificar se é uma lista (começa com [ e termina com ])
    if text.startswith('[') and text.endswith(']'):
        try:
            # Avaliar se a string é uma lista válida
            eval_list = eval(text)
            if isinstance(eval_list, list):
                # Verificar se todos os elementos da lista são números ou expressões matemáticas válidas
                for item in eval_list:
                    if not is_all_numeric(str(item)):
                        return False
                return True
        except:
            return False

    # Verificar se a string contém palavras (indicando linguagem natural)
    if re.search(r'[a-zA-Z]{2,}', text):
        # Se encontrar palavras com mais de uma letra, é considerado linguagem natural
        return False

    # Verificar se é uma expressão matemática que contém números, variáveis e operadores
    # Permite números, variáveis (letras isoladas), operadores matemáticos, parênteses, igual e espaços
    math_variable_pattern = r'^[\d\s\+\-\*/\(\)\.,=a-zA-Z]*$'

    if re.match(math_variable_pattern, text):
        return True

    return False

# Verify Numbers

In [28]:
df_instances['number_instance'] = df_instances['instance'].apply(lambda x: number_detector(x) if x is not None and pd.notna(x) and len(x) != 0 else False)
df_instances['all_number_instance'] = df_instances['instance'].apply(lambda x: is_all_numeric(x) if x is not None and pd.notna(x) and len(x) != 0 else False)
df_instances.head()

Unnamed: 0,id,instance,class_type,number_instance,all_number_instance
0,0_ins,Give three tips for staying healthy.,ins,False,False
1,1_ins,What are the three primary colors?,ins,False,False
2,2_ins,Describe the structure of an atom.,ins,False,False
3,3_ins,How can we reduce air pollution?,ins,False,False
4,4_ins,Describe a time when you had to make a difficult decision.,ins,False,False


In [29]:
df_instances.groupby("class_type")['all_number_instance'].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
class_type,all_number_instance,Unnamed: 2_level_1
inp,False,51086
inp,True,916
ins,False,52002
out,False,50894
out,True,1108


In [30]:
# all numbers
ins = 0
inp = 517+399
out = 709+399
ins, inp, out

(0, 916, 1108)

In [31]:
# inst 181
# inpu 737
# out 747
df_instances.groupby("class_type")['number_instance'].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
class_type,number_instance,Unnamed: 2_level_1
inp,False,51265
inp,True,737
ins,False,51821
ins,True,181
out,False,51255
out,True,747


# Verify Code

In [32]:
df_instances['contains_code'] = df_instances['instance'].apply(lambda x: code_detector(x) if x is not None and pd.notna(x) and len(x) != 0 else False)
df_instances['contains_code'].value_counts()

Unnamed: 0_level_0,count
contains_code,Unnamed: 1_level_1
False,154997
True,1009


In [33]:
df_instances.groupby("class_type")['contains_code'].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
class_type,contains_code,Unnamed: 2_level_1
inp,False,51849
inp,True,153
ins,False,52000
ins,True,2
out,False,51148
out,True,854


# Emojis and special caracteres

In [34]:
def contains_only_specials_or_emojis(text):
    # Regex para verificar se o texto contém apenas caracteres especiais, espaços ou emojis
    return bool(re.match(r'^[^\w\s]+$', text))

df_instances['only_special_characters'] = df_instances['instance'].apply(contains_only_specials_or_emojis)
df_instances['only_special_characters'].value_counts()

Unnamed: 0_level_0,count
only_special_characters,Unnamed: 1_level_1
False,155995
True,11


In [35]:
df_instances[df_instances['only_special_characters']]['instance'].value_counts()

Unnamed: 0_level_0,count
instance,Unnamed: 1_level_1
+,1
>,1
★★★★★,1
🐥,1
😆,1
😈,1
💃🏻🌃🕺🏼,1
:-),1
.........................,1
···––––···,1


# Verify Images

In [36]:
df_instances.head()

Unnamed: 0,id,instance,class_type,number_instance,all_number_instance,contains_code,only_special_characters
0,0_ins,Give three tips for staying healthy.,ins,False,False,False,False
1,1_ins,What are the three primary colors?,ins,False,False,False,False
2,2_ins,Describe the structure of an atom.,ins,False,False,False,False
3,3_ins,How can we reduce air pollution?,ins,False,False,False,False
4,4_ins,Describe a time when you had to make a difficult decision.,ins,False,False,False,False


In [43]:
def contains_image_or_blob(message):
    # Verifica se "Blob" ou "Image" estão presentes na string
    return any(keyword in message for keyword in ['IPython.display.Image'])#['Blob', 'Image'])

# Criar uma nova coluna 'contains_images' que indica a presença de Blob ou Image
df_instances['contains_images'] = df_instances['instance'].apply(contains_image_or_blob)

In [44]:
df_instances['contains_images'].value_counts()

Unnamed: 0_level_0,count
contains_images,Unnamed: 1_level_1
False,156006


In [40]:
df_instances[df_instances['contains_images']]

Unnamed: 0,id,instance,class_type,number_instance,all_number_instance,contains_code,only_special_characters,contains_images
23566,23566_ins,Imagery - Describe the sound of the waves crashing against the rocks.,ins,False,False,False,False,True
2319,2319_inp,[Image provided],inp,False,False,False,False,True
3763,3763_inp,[Image of a new product],inp,False,False,False,False,True
6407,6407_inp,[Image],inp,False,False,False,False,True
6479,6479_inp,Image: A man working in an office,inp,False,False,False,False,True
...,...,...,...,...,...,...,...,...
45129,45129_out,"<!DOCTYPE html>\n<html>\n <head>\n <title>Product Page</title>\n <style>\n body {\n font-family: Arial, sans-serif;\n }\n .product-page {\n display: flex;\n flex-direction: column;\n align-items: center;\n }\n .product-image img {\n width: 400px;\n height: 400px;\n }\n .product-description {\n text-align: center;\n font-size: 1.4em;\n }\n .product-price {\n font-weight: bold;\n font-size: 1.6em;\n }\n </style>\n </head>\n <body>\n <div class=""product-page"">\n <h1>Product Name</h1>\n <div class=""product-image"">\n <img src=""product.jpg"" alt=""Product Image"">\n </div>\n <div class=""product-description"">\n Product description\n </div>\n <div class=""product-price"">\n $99.99\n </div>\n </div>\n </body>\n</html>",out,False,False,True,False,True
46757,46757_out,Image viewer and editor software,out,False,False,False,False,True
47171,47171_out,[Image of structure],out,False,False,False,False,True
48352,48352_out,"1. Imagery and Metaphors: Animation can help bring visual metaphors to life, allowing audience members to more easily connect with and remember information. \n2. Creating Connections: Animations can be used to bridge dissimilar topics or stories, leading to a more unified presentation. \n3. Engaging Audience: Animation is a great way to capture and retain the attention of an audience, as it helps to make content more captivating and enjoyable to consume.",out,False,False,False,False,True


# Errors in Input and Output

Treatment some expressions like:

[ "No input|Noinput|<no input>|No Input|noinput"]

In [None]:
df_instances.shape[0] == (52002*3)

True

In [None]:
pattern = r"No instruction|Noinstruction|<no instruction>|No Instruction|noinstruction|No input|Noinput|<no input>|No Input|noinput|No output|Nooutput|<no output>|No output|nooutput"

print("Errors:", df_instances[df_instances['instance'].str.contains(pattern, na=False)].shape)

df_instances['instance'] = df_instances['instance'].replace(pattern, "", regex=True)

print("After replacement:", df_instances[df_instances['instance'].str.contains(pattern, na=False)].shape)

Errors: (735, 7)
After replacement: (0, 7)


In [None]:
df_instances.shape[0] == (52002*3)

True

# Verify errors in translator

In [None]:
from deep_translator import GoogleTranslator

def translate_to_portuguese(text):
    try:
        translated_text = GoogleTranslator(source='en', target='pt').translate(text)
        return translated_text, "ok"
    except Exception as e:
        return "error", "error"

def fix_translator(df, coluna_texto, coluna_status):
    """
    Função para corrigir traduções em uma DataFrame.

    Parâmetros:
    df: DataFrame - O dataframe contendo as colunas de texto e status.
    coluna_texto: str - Nome da coluna que contém o texto para ser corrigido.
    coluna_status: str - Nome da coluna que contém o status ('error' ou outro).

    Retorna:
    df - DataFrame com as traduções corrigidas e o status atualizado.
    """

    erros = df[df[coluna_status] == 'error']

    for idx, row in erros.iterrows():
        original_text = row[coluna_texto]
        translated_text, status = translate_to_portuguese(original_text)
        df.at[idx, coluna_texto] = translated_text
        df.at[idx, coluna_status] = status

    return df

df_t['status_instruction'].value_counts(), df_t['status_input'].value_counts(), df_t['status_output'].value_counts()

(status_instruction
 ok       51999
 error        3
 Name: count, dtype: int64,
 status_input
 skip     31323
 ok       20673
 error        6
 Name: count, dtype: int64,
 status_output
 ok       51971
 skip        28
 error        3
 Name: count, dtype: int64)

In [None]:
df_t = fix_translator(df_t, 'instruction', 'status_instruction')
df_t = fix_translator(df_t, 'input', 'status_input')
df_t = fix_translator(df_t, 'output', 'status_output')

df_t[['status_instruction', 'status_input', 'status_output']].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
status_instruction,status_input,status_output,Unnamed: 3_level_1
ok,skip,ok,31311
ok,ok,ok,20663
ok,ok,skip,16
ok,skip,skip,12


In [None]:
df_translated = df_t[['id', 'instruction', 'input', 'output']].copy()

df_inst_t = df_translated[['id','instruction']].rename(columns={'instruction':'instance'})
df_inst_t['class_type'] = 'ins'

df_inp_t = df_translated[['id', 'input']].rename(columns={'input':'instance'})
df_inp_t['class_type'] = 'inp'

df_out_t = df_translated[['id', 'output']].rename(columns={'output':'instance'})
df_out_t['class_type'] = 'out'

df_instances_translated = pd.concat([df_inst_t, df_inp_t, df_out_t])

df_instances.shape[0] == df_instances_translated.shape[0], df_inst_t.shape, df_inp_t.shape, df_out_t.shape, 'Intances Complete:', df_instances_translated.shape,

(True, (52002, 3), (52002, 3), (52002, 3), 'Intances Complete:', (156006, 3))

In [None]:
"Original:", df_instances.shape[0] == (52002*3),"Translated:",df_instances_translated.shape[0] == (52002*3)

('Original:', True, 'Translated:', True)

In [None]:
df_instances_translated['id'] = df_instances_translated['id'].astype(str)+"_"+df_instances_translated['class_type']
df_instances_translated.head()

Unnamed: 0,id,instance,class_type
0,0_ins,Dê três dicas para se manter saudável.,ins
1,1_ins,Quais são as três cores primárias?,ins
2,2_ins,Descreva a estrutura de um átomo.,ins
3,3_ins,Como podemos reduzir a poluição do ar?,ins
4,4_ins,Descreva uma ocasião em que você teve que tomar uma decisão difícil.,ins


# Organize to GEMINI

- excluir os all_numbers
- excluir os code
- exluir os vazios
- excluir os duplicados
- excluir os que só tem caracteres especiais e emojis

In [None]:
df_instances.head()

Unnamed: 0,id,instance,class_type,number_instance,all_number_instance,contains_code,only_special_characters
0,0_ins,Give three tips for staying healthy.,ins,False,False,False,False
1,1_ins,What are the three primary colors?,ins,False,False,False,False
2,2_ins,Describe the structure of an atom.,ins,False,False,False,False
3,3_ins,How can we reduce air pollution?,ins,False,False,False,False
4,4_ins,Describe a time when you had to make a difficult decision.,ins,False,False,False,False


In [None]:
print("All numbers:", df_instances[df_instances['all_number_instance']].shape)
print("Code:", df_instances[df_instances['contains_code']].shape)
print("Void:", df_instances[
    (df_instances['instance'] == "") |
    df_instances['instance'].isnull() |
    (df_instances['instance'].str.strip() == "")
].shape)

print("Duplicated:", df_instances[df_instances['instance'].duplicated(keep=False)].shape)
print("Emojis and caracteres:",  df_instances[df_instances['only_special_characters']].shape)

All numbers: (2024, 7)
Code: (1009, 7)
Void: (31779, 7)
Duplicated: (35228, 7)
Emojis and caracteres: (11, 7)


In [None]:
df_voids = df_instances[
    (df_instances['instance'] == "") |
    df_instances['instance'].isnull() |
    (df_instances['instance'].str.strip() == "")]
df_all_numbers = df_instances[df_instances['all_number_instance']]
df_code = df_instances[df_instances['contains_code']]
df_duplicated = df_instances[df_instances['instance'].duplicated(keep=False)]
df_specials = df_instances[df_instances['only_special_characters']]


output_folder = '/content/drive/MyDrive/Colab Notebooks/DATASETS/ALPACA_FILTERED/'
import os
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

df_voids.to_csv(os.path.join(output_folder, 'df_voids.csv'), index=False)
df_all_numbers.to_csv(os.path.join(output_folder, 'df_all_numbers.csv'), index=False)
df_code.to_csv(os.path.join(output_folder, 'df_code.csv'), index=False)
df_duplicated.to_csv(os.path.join(output_folder, 'df_duplicated.csv'), index=False)
df_specials.to_csv(os.path.join(output_folder,'df_specials.csv'), index=False)

In [None]:
"Original:", df_instances.shape[0] == (52002*3),"Translated:",df_instances_translated.shape[0] == (52002*3)

('Original:', True, 'Translated:', True)

Lembrar que essas categorias vão se sobrepor, então tem que fazer os comuns para poder fazer as contas e bater certinho!

In [None]:
print("Shape vs id únicos code: ", df_code.shape[0] == df_code.id.nunique())
print("Shape vs id únicos all_numbers: ", df_all_numbers.shape[0] == df_all_numbers.id.nunique())
print("Shape vs id únicos duplicated: ", df_duplicated.shape[0] == df_duplicated.id.nunique())
print("Shape vs id únicos void: ", df_voids.shape[0] == df_voids.id.nunique())
print("Shape vs id únicos specials", df_specials.shape[0] == df_specials.id.nunique())

Shape vs id únicos code:  True
Shape vs id únicos all_numbers:  True
Shape vs id únicos duplicated:  True
Shape vs id únicos void:  True
Shape vs id únicos specials True


In [None]:
"Sum of shape: ", (df_code.shape[0] + df_all_numbers.shape[0] + df_duplicated.shape[0] + df_voids.shape[0] + df_specials.shape[0]) == (52002*3)
"Ids uniques to not translate:", len(set(df_code['id']) | set(df_all_numbers['id']) | set(df_duplicated['id']) | set(df_voids['id']) | set(df_specials['id']))

('Ids uniques to not translate:', 37588)

In [None]:
ids = df_code['id'].to_list() + df_all_numbers['id'].to_list() + df_duplicated['id'].to_list() + df_voids['id'].to_list() + df_specials['id'].to_list()
len(ids), len(set(ids))

(70051, 37588)

In [None]:
# IDs presentes em cada DataFrame
ids_voids = df_voids['id'].to_list()
ids_all_numbers = df_all_numbers['id'].to_list()
ids_code = df_code['id'].to_list()
ids_duplicated = df_duplicated['id'].to_list()
ids_specials = df_specials['id'].to_list()

#
# Criar um DataFrame para armazenar os resultados das combinações
import itertools

# Lista com todos os conjuntos
sets = {
    'voids': set(ids_voids),
    'all_numbers': set(ids_all_numbers),
    'code': set(ids_code),
    'duplicated': set(ids_duplicated)
}

# Inicializando a estrutura de resultados
combinations_results = []

# Gerar todas as combinações possíveis (1 a n)
for n in range(1, len(sets) + 1):
    for combination in itertools.combinations(sets.keys(), n):
        # Pegar a interseção entre os conjuntos da combinação
        intersection = set.intersection(*(sets[key] for key in combination))
        # Adicionar o resultado da combinação e o tamanho da interseção
        combinations_results.append({
            'combination': ' & '.join(combination),
            'count': len(intersection)
        })

# Criar DataFrame para visualizar o resultado
df_combinations = pd.DataFrame(combinations_results)

# Ordenar pelos maiores resultados de interseção
df_combinations.sort_values(by='count', ascending=False)

Unnamed: 0,combination,count
3,duplicated,35228
0,voids,31779
6,voids & duplicated,31779
1,all_numbers,2024
2,code,1009
8,all_numbers & duplicated,664
9,code & duplicated,18
4,voids & all_numbers,0
5,voids & code,0
7,all_numbers & code,0


In [None]:
ids_to_ignore = ids_voids + ids_all_numbers + ids_code + ids_duplicated + ids_specials
ids_to_ignore = list(set(ids_to_ignore))
len(ids_to_ignore)

37588

In [None]:
df_instances_final = df_instances[~df_instances['id'].isin(ids_to_ignore)]
df_instances_final.shape

(118418, 7)

In [None]:
(118418+37588) == (52002*3)

True

# Filter to no translate to GLOSA

In [None]:
df_instances_final.loc[40298]

Unnamed: 0,id,instance,class_type,number_instance,all_number_instance,contains_code,only_special_characters
40298,40298_ins,Convert this text written in morse code to English text.,ins,False,False,False,False
40298,40298_inp,\-- \-\ \-\ \-- \.- \. \... \-\. \-- \-.,inp,False,False,False,False
40298,40298_out,"This message decodes to ""DOG"".",out,False,False,False,False


In [None]:
df_instances_translated.shape[0] == df_instances.shape[0]

True

In [None]:
df_instances_to_translate = df_instances_translated[~df_instances_translated['id'].isin(ids_to_ignore)]
df_instances_to_translate.shape

(118418, 3)

In [None]:
df_instances_to_translate[df_instances_to_translate.id.isin(['38726_inp',
 '40298_inp',
 '51186_inp',
 '51912_inp',
 '7165_out',
 '9894_out',
 '14160_out',
 '14550_out',
 '49453_out',
 '51186_out'])]

Unnamed: 0,id,instance,class_type
40298,40298_inp,,inp


In [None]:
# excluir esse indice teimoso na mão
df_instances = df_instances_to_translate.drop(index=40298)
df_instances.shape

(118415, 3)

# Estimate Tokens

In [None]:
from transformers import AutoTokenizer
import pandas as pd

# Carregar o tokenizer do modelo GPT-2
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Inicializar uma lista para armazenar erros
error_records = []

# Função para calcular o número de tokens
def count_tokens(text, index):
    if isinstance(text, str):  # Verifica se o texto é uma string
        return len(tokenizer.encode(text, truncation=True))
    else:
        error_records.append({'index': index, 'instance': text})
        return 0

df_instances_to_translate['tokens_est_instance'] = df_instances_to_translate.apply(lambda row: count_tokens(row['instance'], row.name), axis=1)

df_instances_to_translate['tokens_est_id'] = df_instances_to_translate.apply(lambda row: count_tokens(row['id'], row.name), axis=1)

df_instances_to_translate_errors = pd.DataFrame(error_records)

print(df_instances_to_translate_errors)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Empty DataFrame
Columns: []
Index: []


In [None]:
df_instances_to_translate.to_csv('/content/drive/MyDrive/Colab Notebooks/DATASETS/alpaca_to_translate_glosa.csv', index=False)

In [None]:
df_instances_to_translate[['tokens_est_instance', 'tokens_est_id']].sum()

Unnamed: 0,0
tokens_est_instance,7077232
tokens_est_id,491433


In [None]:
df_instances_to_translate[['tokens_est_instance', 'tokens_est_id']].sum().sum()

7568665

# Pricing

In [None]:
price_per_1000_tokens = 0.60
saida = (7568665 / 1000000) * price_per_1000_tokens

price_per_1000_tokens = 0.03
entrada = (7568665 / 1000000) * price_per_1000_tokens

"USD Entrada", entrada, "Saída:", saida, "Cache:"

('USD Entrada', 0.22705995, 'Saída:', 4.541199, 'Cache:')

In [None]:
((306 + 764 ) / 10000000) * (0.0375)

4.0125e-06

In [None]:
(((306 + 764 ) / 10000000) * (0.0375) + entrada + saida ) * 2

9.536525925