In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset, Dataset, load_from_disk
from tqdm import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
model_path = './tinybert-filter'
tokenizer = BertTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")
model = BertForSequenceClassification.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

with open('all_data_to_bert.jsonl', 'r', encoding='utf-8') as f:
    input_texts = [json.loads(line) for line in f]

scored_data = []
for row in tqdm(input_texts, desc="Labelling"):
    text = row['text']
    dataset = row['dataset']
    inputs = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    ).to(device)
    with torch.no_grad():
        output = model(**inputs)
        label = output.logits.argmax(axis=-1).item()
        scored_data.append({
            'label': label,
            'dataset': dataset,
            'text': text,
        })
    


Labelling: 100%|██████████| 226032/226032 [34:12<00:00, 110.13it/s]


In [17]:
filtered_data = []
with open('labelled_data.jsonl', 'w', encoding='utf-8') as f:
    for line in scored_data:
        if line['label'] == 1:
            filtered_data.append(line)
            f.write(json.dumps(line) + "\n")

filtered_data = Dataset.from_list(filtered_data)
filtered_data.save_to_disk('filtered_data')

Saving the dataset (1/1 shards): 100%|██████████| 129325/129325 [00:00<00:00, 1237799.20 examples/s]
