In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset, Dataset, load_from_disk
from tqdm import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
model_path = './tinybert-filter'
tokenizer = BertTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
model = BertForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

with open('./jsonl/all_data_to_bert.jsonl', 'r', encoding='utf-8') as f:
    input_texts = [json.loads(line) for line in f]

scored_data = []
for row in tqdm(input_texts, desc="Labelling"):
    text = row['text']
    index = row['index']
    dataset = row['dataset']
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        output = model(**inputs)
        label = output.logits.argmax(axis=-1).item()
        scored_data.append({
            'index': index,
            'label': label,
            'dataset': dataset,
            'text': text,
        })
    


Labelling: 100%|██████████| 225993/225993 [37:21<00:00, 100.81it/s]


In [13]:
filtered_data = []
with open('./jsonl/labelled_data_unfiltered.jsonl', 'w', encoding='utf-8') as f:
    for line in scored_data:
        if line['label'] == 1:
            filtered_data.append(line)
        f.write(json.dumps(line) + "\n")

filtered_data = Dataset.from_list(filtered_data)
filtered_data.save_to_disk('labelled_data_(filtered)')
filtered_data

Saving the dataset (1/1 shards): 100%|██████████| 129311/129311 [00:00<00:00, 828105.94 examples/s]


Dataset({
    features: ['index', 'label', 'dataset', 'text'],
    num_rows: 129311
})

In [14]:
ds_origin = load_from_disk('all_data')
indices = filtered_data['index']
ds_filtered = ds_origin.select(indices)
ds_filtered

Dataset({
    features: ['type', 'query', 'original_question', 'response', 'index', 'source', 'conversation', 'instruction', 'input', 'output', 'prompt', 'problem', 'solution', 'answer', 'problem_type', 'question_type', 'uuid', 'is_reasoning_complete', 'generations', 'correctness_math_verify', 'correctness_llama', 'finish_reasons', 'correctness_count', 'messages', 'turn_1', 'feedback_1', 'turn_2', 'feedback_2'],
    num_rows: 129311
})

In [None]:
def format(sample):
    if sample['query']:
        text = sample['query'] + sample['response']
        dataset = 'metamath'
    elif sample['conversation']:
        text = []
        for t in sample['conversation']:
            text.append(" ".join(t.values()))
        text = " ".join(text)
        dataset = 'capybara'
    elif sample['instruction']:
        text = sample['prompt']
        dataset = 'code18k'
    elif sample['problem']:
        text = sample['problem'] + sample['solution'] + sample['answer']
        dataset = 'openmath'
    elif sample['turn_1']:
        try:
            text = sample['prompt'] + sample['turn_1'] + sample['feedback_1'] + sample['turn_2'] + sample['feedback_2']
        except:
            text = sample['prompt'] + sample['turn_1'] + sample['feedback_1']
        dataset = 'codeio'
    else:
        text = None
        dataset = None
    return {'text': f'{text}',
            'dataset': f'{dataset}'}