In [1]:
# Install the third-party packages imported below (datasets, python-dotenv)
# into the environment of the running kernel.
%pip install datasets python-dotenv



# Libraries

In [4]:
from datasets import load_dataset, Dataset, DatasetDict
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import os

# Configs and constants

In [6]:
# Load environment variables via python-dotenv before reading them below.
load_dotenv()

# Hugging Face access token taken from the environment; None if the variable is unset.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
# Hub repository id the processed dataset is pushed to and later reloaded from.
DATASET_NAME = "felipeoes/cocoruta-evaluation"
# Name of the input column that holds the raw model output to be cleaned.
MODEL_COL = "felipeoes/boto-7b"

# Load and preprocess data

In [5]:
def read_csv_files(folder_path: Path) -> list[Dataset]:
    """Read every CSV file in a folder into a list of datasets.

    Files are visited in sorted path order so the result does not depend on
    the order in which the file system happens to list the directory.

    Args:
        folder_path: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        One ``Dataset`` per CSV file. Each dataset gets a ``metadata``
        attribute assigned here, of the form ``{"file_name": <tag>}``, where
        ``<tag>`` is the second ``_``-separated field of the file stem
        (e.g. ``"128"`` for ``boto-7b_128_tokens_train.csv``).

    Raises:
        IndexError: If the stem of a CSV file contains no underscore.
    """
    datasets = []
    # Path.glob gives no ordering guarantee; sort for reproducible output.
    for file in sorted(folder_path.glob("*.csv")):
        frame = pd.read_csv(file)
        dataset = Dataset.from_pandas(frame)

        # Despite the key name, this stores the token count parsed from the
        # file name; concat_datasets reads it to build the split name.
        dataset.metadata = {"file_name": file.stem.split("_")[1]}
        datasets.append(dataset)

    return datasets

def concat_datasets(datasets: list[Dataset]) -> DatasetDict:
    """Bundle a list of datasets into one ``DatasetDict``, one split per dataset.

    Each split is named ``"<tag>_tokens"``, where ``<tag>`` is the value stored
    under ``metadata["file_name"]`` on the dataset. If two datasets produce the
    same split name, the later one in the list replaces the earlier one.

    Example:
        boto-7b_128_tokens_train.csv becomes the split "128_tokens"
    """
    splits = {
        f"{dataset.metadata['file_name']}_tokens": dataset
        for dataset in datasets
    }
    return DatasetDict(splits)

def preprocess_text(text: str) -> str:
    """Extract the answer portion of a raw model generation.

    Returns the text that follows the first ``"### Resposta:"`` marker, cut at
    the next ``"### Resposta:"`` or ``"### Fim"`` marker (whichever comes
    first) and stripped of surrounding whitespace. When the start marker is
    absent, everything before the first ``"### Fim"`` is returned instead.

    Args:
        text: Raw generated text. ``None`` (a missing value) is tolerated.

    Returns:
        The cleaned answer text, or ``""`` when ``text`` is ``None``.
    """
    START_RESPONSE = "### Resposta:"
    END_TOKEN = "### Fim"

    # A missing value (e.g. an empty cell in the input data) may arrive as
    # None; treat it as an empty generation instead of raising AttributeError.
    if text is None:
        return ""

    # splits[1] is the segment after the first start marker and before any
    # repeated start marker, which drops repeated prompt/answer blocks.
    splits = text.split(START_RESPONSE)
    answer = splits[1] if len(splits) > 1 else text

    return answer.split(END_TOKEN)[0].strip()

# Read the CSV files from the local "data" folder and group them into one split per file.
raw_splits = concat_datasets(read_csv_files(Path("data")))

# Add a "generated_text" column holding the cleaned model output of every example.
data = raw_splits.map(
    lambda example: {"generated_text": preprocess_text(example[MODEL_COL])}
)
data


Map: 100%|██████████| 15964/15964 [00:01<00:00, 12254.97 examples/s]
Map: 100%|██████████| 15964/15964 [00:01<00:00, 14838.21 examples/s]
Map: 100%|██████████| 15964/15964 [00:01<00:00, 12724.85 examples/s]
Map: 100%|██████████| 15964/15964 [00:01<00:00, 13468.94 examples/s]


DatasetDict({
    1024_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'felipeoes/boto-7b', 'generated_text'],
        num_rows: 15964
    })
    128_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'felipeoes/boto-7b', 'generated_text'],
        num_rows: 15964
    })
    256_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'felipeoes/boto-7b', 'generated_text'],
        num_rows: 15964
    })
    512_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'felipeoes/boto-7b', 'generated_text'],
        num_rows: 15964
    })
})

In [6]:
# Sanity check: inspect the first example of the "128_tokens" split.
data["128_tokens"][0]

{'file_name': 'Decisao_CONAMA_No_005-2006.txt',
 'context': 'MINISTÉRIO DO MEIO AMBIENTE \nCONSELHO NACIONAL DO MEIO AMBIENTE \nDECISÃO No 005, DE 27 DE DEZEMBRO DE 2006 \nO CONSELHO NACIONAL DO MEIO AMBIENTE-CONAMA, no uso das \ncompetências que lhe são conferidas pela Lei no 6.938, de 31 de agosto de 1981, regulamentada pelo \nDecreto no 99.274, de 6 de junho de 1990, e tendo em vista o disposto em seu Regimento Interno, anexo à \nPortaria no 168, de 10 de junho de 2005, e o que consta do Processo no 02000.004695/2006-28, e \nConsiderando o disposto no inciso III do art. 8o da Lei no 6.938, de 1981, que prevê a \ncompetência do Conselho Nacional do Meio Ambiente-CONAMA para decidir, como última instância \nadministrativa em grau de recurso, mediante depósito prévio, sobre os recursos administrativos \ninterpostos aos processos de multas aplicadas pelo Instituto Brasileiro de Meio Ambiente e Recursos \nNaturais Renováveis-IBAMA, decide: \nArt. 1o Homologar de acordo com a decisão da 8

In [7]:
# Drop the raw model-output column (MODEL_COL) and upload the remaining
# columns to the Hugging Face Hub repository DATASET_NAME.
# The result is bound to its own name so `data` keeps the column and this
# cell can be re-run without tripping over an already-removed column.
data_to_upload = data.remove_columns(MODEL_COL)
data_to_upload.push_to_hub(DATASET_NAME, token=HUGGINGFACE_TOKEN)

Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 85.24ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.23s/it]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 88.70ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.00s/it]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 96.48ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.70s/it]
Creating parquet from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 87.56ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.11s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/felipeoes/cocoruta-training/commit/12e93b763b3407443357fc2dad8c6fb6e42403cc', commit_message='Upload dataset', commit_description='', oid='12e93b763b3407443357fc2dad8c6fb6e42403cc', pr_url=None, pr_revision=None, pr_num=None)

In [8]:
# Sanity check: load the dataset back by its repository id and inspect the
# first example of the "128_tokens" split.
new_data = load_dataset(DATASET_NAME)
new_data["128_tokens"][0]

Downloading readme: 100%|██████████| 810/810 [00:00<00:00, 1.04kB/s]
Downloading data: 100%|██████████| 17.4M/17.4M [00:07<00:00, 2.27MB/s]
Downloading data: 100%|██████████| 16.4M/16.4M [00:07<00:00, 2.27MB/s]
Downloading data: 100%|██████████| 17.0M/17.0M [00:06<00:00, 2.44MB/s]
Downloading data: 100%|██████████| 17.3M/17.3M [00:09<00:00, 1.79MB/s]
Generating 1024_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 122911.25 examples/s]
Generating 128_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 185623.35 examples/s]
Generating 256_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 172038.86 examples/s]
Generating 512_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 171953.59 examples/s]


{'file_name': 'Decisao_CONAMA_No_005-2006.txt',
 'context': 'MINISTÉRIO DO MEIO AMBIENTE \nCONSELHO NACIONAL DO MEIO AMBIENTE \nDECISÃO No 005, DE 27 DE DEZEMBRO DE 2006 \nO CONSELHO NACIONAL DO MEIO AMBIENTE-CONAMA, no uso das \ncompetências que lhe são conferidas pela Lei no 6.938, de 31 de agosto de 1981, regulamentada pelo \nDecreto no 99.274, de 6 de junho de 1990, e tendo em vista o disposto em seu Regimento Interno, anexo à \nPortaria no 168, de 10 de junho de 2005, e o que consta do Processo no 02000.004695/2006-28, e \nConsiderando o disposto no inciso III do art. 8o da Lei no 6.938, de 1981, que prevê a \ncompetência do Conselho Nacional do Meio Ambiente-CONAMA para decidir, como última instância \nadministrativa em grau de recurso, mediante depósito prévio, sobre os recursos administrativos \ninterpostos aos processos de multas aplicadas pelo Instituto Brasileiro de Meio Ambiente e Recursos \nNaturais Renováveis-IBAMA, decide: \nArt. 1o Homologar de acordo com a decisão da 8

: 

In [7]:
# Temporary step: load the dataset (by DATASET_NAME) so that its splits can
# be exported to CSV files in the next cell.
dataset = load_dataset(DATASET_NAME)
dataset

Generating 1024_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 110246.66 examples/s]
Generating 128_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 126582.56 examples/s]
Generating 256_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 126004.42 examples/s]
Generating 512_tokens split: 100%|██████████| 15964/15964 [00:00<00:00, 128092.89 examples/s]


DatasetDict({
    1024_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'generated_text'],
        num_rows: 15964
    })
    128_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'generated_text'],
        num_rows: 15964
    })
    256_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'generated_text'],
        num_rows: 15964
    })
    512_tokens: Dataset({
        features: ['file_name', 'context', 'question', 'answer', 'generated_text'],
        num_rows: 15964
    })
})

In [8]:
# Write every split to its own CSV file, "<split>.csv", in the working directory.
for split, split_dataset in dataset.items():
    split_dataset.to_csv(f"{split}.csv", index=False)
    print(f"Saved {split}.csv")

Creating CSV from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 17.91ba/s]


Saved 1024_tokens.csv


Creating CSV from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 18.33ba/s]


Saved 128_tokens.csv


Creating CSV from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 19.06ba/s]


Saved 256_tokens.csv


Creating CSV from Arrow format: 100%|██████████| 16/16 [00:00<00:00, 17.42ba/s]

Saved 512_tokens.csv



