In [2]:
import json
import os

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset, load_from_disk, concatenate_datasets
from tqdm import tqdm

from MinHashDeduplication import Deduplicate
from RuleBasedFilter import RuleBasedFilter
from TinyBERTFilter import TinyBERTFilter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Cleaning configuration, keyed by Hugging Face dataset id.
# Each value is a positional list that run_filter reads by index:
#   [0] split name passed to load_dataset
#   [1] column name handed to Deduplicate
#   [2] column name handed to RuleBasedFilter (None for most datasets)
#   [3] True to load the dataset in streaming mode, False to load it in full
raw_datasets = {
    "meta-math/MetaMathQA": ["train", "query", None, False],
    "LDJnr/Capybara": ["train", "conversation", "conversation", False],
    "iamtarun/python_code_instructions_18k_alpaca": ["train", "instruction", None, False],
    "open-r1/OpenR1-Math-220k": ["train", "problem", None, False],
    "hkust-nlp/CodeIO-PyEdu-Reasoning": ["train", "prompt", None, True],
}

In [None]:
def run_filter(raw_datasets, max_stream_samples=100000):
    """Deduplicate and rule-filter each configured dataset, then save it to disk.

    Args:
        raw_datasets: Mapping of Hugging Face dataset id to a positional list
            ``[split, dedup_column, rule_filter_column, streaming]``.
        max_stream_samples: Maximum number of rows collected from a dataset
            that is loaded in streaming mode. Non-streaming datasets are
            loaded in full and ignore this value.

    Side effects:
        Writes each cleaned dataset to ``./datasets/{name}_cleaned`` and prints
        a completion message per dataset.
    """
    for name, spec in tqdm(raw_datasets.items()):
        # Name the positional config fields once instead of indexing throughout.
        split, dedup_column, rule_column, streaming = spec[:4]

        if not streaming:
            ds = load_dataset(name, split=split)
        else:
            # Streamed datasets are materialised into a plain list of rows,
            # capped so an arbitrarily large source cannot exhaust memory.
            ds = []
            ds_stream = load_dataset(name, split=split, streaming=True)
            for i, sample in enumerate(ds_stream):
                if i >= max_stream_samples:
                    break
                ds.append(sample)

        # Deduplicate and RuleBasedFilter are project helpers; their selection
        # criteria live in their own modules.
        ds_dd = Deduplicate(ds, dedup_column).run()
        ds_rb = RuleBasedFilter(ds_dd, rule_column).run()

        # Dataset.from_list needs a list of row dicts, so the rule-based
        # filter's result is expected to have that shape.
        ds_cleaned = Dataset.from_list(ds_rb)
        ds_cleaned.save_to_disk(f'./datasets/{name}_cleaned')
        print(f'Dataset {name} cleaning completed. \n')

# Clean every dataset listed in raw_datasets; each result is written to
# ./datasets/<dataset id>_cleaned by run_filter.
run_filter(raw_datasets)


  0%|          | 0/5 [00:00<?, ?it/s]

Original dataset size: 395000
Deduplicated dataset size: 50443
Original dataset size: 50443
Filtered dataset size: 50229


Saving the dataset (1/1 shards): 100%|██████████| 50229/50229 [00:00<00:00, 1183790.79 examples/s]
 20%|██        | 1/5 [05:14<20:59, 314.80s/it]

Dataset meta-math/MetaMathQA cleaning completed. 

Original dataset size: 16006
Deduplicated dataset size: 15056
Original dataset size: 15056
Filtered dataset size: 11803


Saving the dataset (1/1 shards): 100%|██████████| 11803/11803 [00:00<00:00, 383215.96 examples/s]
 40%|████      | 2/5 [05:44<07:20, 146.83s/it]

Dataset LDJnr/Capybara cleaning completed. 

Original dataset size: 18612
Deduplicated dataset size: 17520
Original dataset size: 17520
Filtered dataset size: 17468


Saving the dataset (1/1 shards): 100%|██████████| 17468/17468 [00:00<00:00, 881460.34 examples/s]
 60%|██████    | 3/5 [06:04<02:58, 89.14s/it] 

Dataset iamtarun/python_code_instructions_18k_alpaca cleaning completed. 

Original dataset size: 93733
Deduplicated dataset size: 85158
Original dataset size: 85158
Filtered dataset size: 83834


Saving the dataset (9/9 shards): 100%|██████████| 83834/83834 [00:08<00:00, 10154.19 examples/s]
 80%|████████  | 4/5 [12:54<03:35, 215.64s/it]

Dataset open-r1/OpenR1-Math-220k cleaning completed. 

Original dataset size: 100000
Deduplicated dataset size: 62670
Original dataset size: 62670
Filtered dataset size: 62659


Saving the dataset (1/1 shards): 100%|██████████| 62659/62659 [00:00<00:00, 377993.43 examples/s]
100%|██████████| 5/5 [23:42<00:00, 284.40s/it]


Dataset hkust-nlp/CodeIO-PyEdu-Reasoning cleaning completed. 



In [6]:
def dataset_concat(raw_datasets, samp=False, samp_size=2000):
    """Load the cleaned datasets from disk and concatenate them into one dataset.

    Args:
        raw_datasets: Mapping whose keys are the dataset ids used when the
            cleaned datasets were saved under ``./datasets/{name}_cleaned``.
        samp: When True, keep only the leading rows of each cleaned dataset
            instead of all of them.
        samp_size: Number of leading rows kept per dataset when ``samp`` is
            True. Ignored otherwise.

    Returns:
        The concatenated dataset with an added ``index`` column holding each
        row's position (0..len-1) in the concatenation.
    """
    ds_list = []
    for name in tqdm(raw_datasets.keys(), desc='Concatnating'):
        ds = load_from_disk(f'./datasets/{name}_cleaned')
        if samp:
            # Clamp to the dataset length: select() raises on out-of-range
            # indices, which would happen for any cleaned dataset shorter
            # than samp_size.
            ds = ds.select(range(min(samp_size, len(ds))))
        ds_list.append(ds)

    ds_concat = concatenate_datasets(ds_list)
    # NOTE(review): the feature list printed for the saved all_data dataset shows
    # 'index' among the first dataset's columns; confirm the cleaned datasets do
    # not already carry an 'index' column that would collide with this one.
    ds_concat = ds_concat.add_column('index', range(len(ds_concat)))

    return ds_concat

# Build a small subset made of the leading rows of every cleaned dataset
# (dataset_concat with samp=True) and persist it as ./datasets/sample_to_label.
samp_to_label = dataset_concat(raw_datasets, samp=True)
samp_to_label.save_to_disk('./datasets/sample_to_label')



Concatnating: 100%|██████████| 5/5 [00:00<00:00, 30.79it/s]
Saving the dataset (1/1 shards): 100%|██████████| 10000/10000 [00:00<00:00, 101271.08 examples/s]


In [5]:
def add_text_or_dataset(sample, add_text=True):
    """Tag a concatenated row with its source dataset and, optionally, a flat text.

    The source is inferred from the first non-empty marker column, checked in
    this order: ``query`` (metamath), ``conversation`` (capybara),
    ``instruction`` (code18k), ``problem`` (openmath), ``turn_1`` (codeio).

    Args:
        sample: Row mapping containing every column of the concatenated
            dataset; columns that do not belong to the row's source are
            expected to be empty/None.
        add_text: When True the result also carries the flattened ``text``.

    Returns:
        ``{'text': str, 'dataset': str}`` when ``add_text`` is True, otherwise
        ``{'dataset': str}``. Values are string-formatted, so a row matching
        no source yields the literal string ``'None'``.
    """
    if sample['query']:
        text = sample['query'] + sample['response']
        dataset = 'metamath'
    elif sample['conversation']:
        # Each turn is a dict of strings; flatten all turns into one line.
        text = " ".join(" ".join(t.values()) for t in sample['conversation'])
        dataset = 'capybara'
    elif sample['instruction']:
        text = sample['prompt']
        dataset = 'code18k'
    elif sample['problem']:
        text = sample['problem'] + sample['solution'] + sample['answer']
        dataset = 'openmath'
    elif sample['turn_1']:
        first_round = sample['prompt'] + sample['turn_1'] + sample['feedback_1']
        try:
            text = first_round + sample['turn_2'] + sample['feedback_2']
        except (TypeError, KeyError):
            # Second round absent (None or missing column): keep the first
            # round only. Narrowed from a bare `except` so unrelated failures
            # and interrupts are no longer silently swallowed.
            text = first_round
        dataset = 'codeio'
    else:
        text = None
        dataset = None

    if add_text:
        return {'text': f'{text}',
                'dataset': f'{dataset}'}
    return {'dataset': f'{dataset}'}

In [7]:
def _tag_dataset(row):
    """Return only the source-dataset label for a row (no flattened text)."""
    return add_text_or_dataset(sample=row, add_text=False)


# Concatenate every cleaned dataset, label each row with its source, and
# persist the result for the later cells.
all_data = dataset_concat(raw_datasets).map(_tag_dataset)
all_data.save_to_disk('./datasets/all_data')
all_data


Concatnating: 100%|██████████| 5/5 [00:00<00:00, 26.48it/s]
Saving the dataset (11/11 shards): 100%|██████████| 225993/225993 [00:04<00:00, 48749.14 examples/s]


Dataset({
    features: ['type', 'query', 'original_question', 'response', 'index', 'source', 'conversation', 'instruction', 'input', 'output', 'prompt', 'problem', 'solution', 'answer', 'problem_type', 'question_type', 'uuid', 'is_reasoning_complete', 'generations', 'correctness_math_verify', 'correctness_llama', 'finish_reasons', 'correctness_count', 'messages', 'turn_1', 'feedback_1', 'turn_2', 'feedback_2', 'dataset'],
    num_rows: 225993
})

In [None]:
# Shuffle with a fixed seed, attach the flattened text to every row, and keep
# only the columns written to the JSONL file.
all_data_shuffled6 = all_data.shuffle(seed=6)
texts = all_data_shuffled6.map(add_text_or_dataset)
keep_columns = {'text', 'dataset', 'index'}
texts_extracted = texts.remove_columns(
    [col for col in texts.column_names if col not in keep_columns]
)

# open() does not create missing parent directories, so make sure the output
# folder exists before writing (a fresh checkout has no ./jsonl).
os.makedirs('./jsonl', exist_ok=True)
with open('./jsonl/all_data_to_bert.jsonl', 'w', encoding='utf-8') as f:
    # One JSON object per line.
    for record in tqdm(texts_extracted):
        f.write(json.dumps(record) + "\n")


100%|██████████| 225993/225993 [00:07<00:00, 28820.20it/s]


In [None]:
# Run the TinyBERT-based filter over the exported JSONL file.
# NOTE(review): start_label() and save_labelled_data() are defined in
# TinyBERTFilter.py, which is not shown here. A later cell loads
# './datasets/labelled_data_filtered', presumably written by
# save_labelled_data() -- confirm against that module.
tb_filter = TinyBERTFilter(model_path='./tinybert-filter',
                           data_to_label='./jsonl/all_data_to_bert.jsonl')
tb_filter.start_label()
tb_filter.save_labelled_data()

In [3]:
# Map the rows kept by the filter back onto the full-column dataset.
ds_origin = load_from_disk('./datasets/all_data')
filtered_data = load_from_disk('./datasets/labelled_data_filtered')

# 'index' is expected to hold each row's position in all_data (it is assigned
# by dataset_concat before any shuffling), so it can be passed to select().
indices = filtered_data['index']
ds_filtered = ds_origin.select(indices)

# Only the last expression of a cell is displayed; a bare `ds_filtered` here
# would be silently discarded, so print it explicitly.
print(ds_filtered)
ds_origin

Dataset({
    features: ['type', 'query', 'original_question', 'response', 'index', 'source', 'conversation', 'instruction', 'input', 'output', 'prompt', 'problem', 'solution', 'answer', 'problem_type', 'question_type', 'uuid', 'is_reasoning_complete', 'generations', 'correctness_math_verify', 'correctness_llama', 'finish_reasons', 'correctness_count', 'messages', 'turn_1', 'feedback_1', 'turn_2', 'feedback_2', 'dataset'],
    num_rows: 225993
})

In [5]:
# Fixed preamble placed before the instruction of every code18k sample.
_ALPACA_PREAMBLE = 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n'


def _as_turns(*utterances):
    """Label alternating utterances as 'human', 'gpt', 'human', ... turns."""
    speakers = ('human', 'gpt')
    return [{'from': speakers[pos % 2], 'value': utterance}
            for pos, utterance in enumerate(utterances)]


def _format_sample(sample):
    """Convert one filtered row into ``{'data_type', 'conversation'}``.

    The source is detected from the first non-empty marker column (query,
    conversation, instruction, problem, turn_1). Returns None when none of
    them is set, in which case the row is skipped.
    """
    if sample['query']:
        turns = _as_turns(sample['query'], sample['response'])
    elif sample['conversation']:
        turns = []
        for dialogue in sample['conversation']:
            turns.extend(_as_turns(dialogue['input'], dialogue['output']))
    elif sample['instruction']:
        request = (_ALPACA_PREAMBLE + "###Instruction:\n" + sample['instruction']
                   + "###Input:\n" + sample['input'])
        turns = _as_turns(request, sample['output'])
    elif sample['problem']:
        turns = _as_turns(sample['problem'], sample['solution'])
    elif sample['turn_1']:
        if sample['turn_2']:
            # Fix: the final reply must be the sample's second turn; the
            # previous code emitted the literal string 'turn_2' instead.
            turns = _as_turns(sample['prompt'], sample['turn_1'],
                              sample['feedback_1'], sample['turn_2'])
        else:
            turns = _as_turns(sample['prompt'], sample['turn_1'])
    else:
        return None

    return {'data_type': sample['dataset'], 'conversation': turns}


# Format every kept row, then persist the result as the SFT dataset.
formatted_records = []
for sample in tqdm(ds_filtered, desc='Formatting'):
    record = _format_sample(sample)
    if record is not None:
        formatted_records.append(record)

ds_formated = Dataset.from_list(formatted_records)
ds_formated.save_to_disk('SFTdata')
ds_formated

Formatting: 100%|██████████| 129311/129311 [00:22<00:00, 5774.90it/s]
Saving the dataset (1/1 shards): 100%|██████████| 129311/129311 [00:00<00:00, 1049814.07 examples/s]


Dataset({
    features: ['data_type', 'conversation'],
    num_rows: 129311
})

In [37]:
# Count how many SFT samples come from each source dataset.
sftdata = load_from_disk('SFTdata')
dtype = np.array(sftdata['data_type'])
columns = ['metamath', 'openmath', 'capybara', 'code18k', 'codeio']

counts = {}
for col in columns:
    # The boolean mask marks rows of this source; its non-zero count is the tally.
    counts[col] = int(np.count_nonzero(dtype == col))

# Single-row table: one column per source, row labelled 'num'.
sftdtype = pd.DataFrame(counts, index=['num'])
sftdtype

Unnamed: 0,metamath,openmath,capybara,code18k,codeio
num,48133,46123,9850,14582,10623
