In [1]:
import os

# Process-wide settings, applied before the ML libraries below are imported:
#   CUDA_VISIBLE_DEVICES='0'  -> expose only GPU index 0 to this process
#   WANDB_MODE='disabled'     -> turn off Weights & Biases logging
os.environ.update({
    'CUDA_VISIBLE_DEVICES': '0',
    'WANDB_MODE': 'disabled',
})

In [2]:
from datasets import load_dataset

# Step 1: load the train split of the 'hate' config of cardiffnlp/tweet_eval,
# keep only the 'text' column, and rename that column to 'sentence'.
ds = (
    load_dataset('cardiffnlp/tweet_eval', 'hate', split='train')
    .select_columns(['text'])
    .rename_column('text', 'sentence')
)

In [3]:
from keybert import KeyBERT

# Step 2: label every sentence with (up to) two keywords. KeyBERT is used here,
# but other keyword extractors (TF-IDF, LLMs, etc.) can be used instead.
keybert = KeyBERT()


def _attach_keyword_label(example):
    """Lower-case the sentence and derive its keyword label.

    Returns a dict with:
      'labels'   - the first element of each of the first two entries returned
                   by ``keybert.extract_keywords``, joined by a single space;
      'sentence' - the lower-cased sentence text.
    """
    lowered = example['sentence'].lower()
    extracted = keybert.extract_keywords(lowered)
    leading_keywords = [entry[0] for entry in extracted[:2]]
    return {'labels': " ".join(leading_keywords), 'sentence': lowered}


ds = ds.map(_attach_keyword_label)


In [4]:
# Sanity check: print only the first example of ds (nothing is printed when
# ds yields no items).
_no_example = object()
first_example = next(iter(ds), _no_example)
if first_example is not _no_example:
    print(first_example)


{'sentence': '@user nice new signage. are you not concerned by beatlemania -style hysterical crowds crongregating on you…', 'labels': 'signage crowds'}


In [5]:
from gen_dedup import GenDedup

# Build the GenDedup object from the 'google/flan-t5-small' identifier and fit
# it on the labelled dataset; './hate-dedup' is passed as fit's output_dir.
base_model = 'google/flan-t5-small'
fit_output_dir = './hate-dedup'

gd = GenDedup(base_model)
gd.fit(ds, output_dir=fit_output_dir)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,7.1333
20,5.3067
30,4.604
40,3.9115
50,3.6192
60,3.5651
70,3.1161
80,2.8041
90,2.9148
100,2.5582


In [6]:
# Run deduplication against the 'hate-dedup' directory with max_label_words=2
# (presumably matching the two keywords per label built earlier; confirm
# against the gen_dedup documentation).
dedup_dir = 'hate-dedup'
gd.deduplicate(dedup_dir, max_label_words=2)

100%|██████████| 9000/9000 [07:55<00:00, 18.91it/s]


100 duplicate text detected!
8900 nonduplicate text detected!
Duplicate text has been saved to hate-dedup/duplicate.jsonl
Nonduplicate text has been saved to hate-dedup/nonduplicate.jsonl
