In [None]:
import datasets
import textstat
import re
import pandas as pd
import numpy as np
from snorkel.labeling import labeling_function
from snorkel.preprocess import preprocessor
from snorkel.labeling import LabelingFunction
from snorkel.labeling import PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel
from snorkel.analysis import metric_score
from snorkel.utils import probs_to_preds
from sklearn.model_selection import train_test_split
from lexical_diversity import lex_div as ld
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level VADER analyzer shared by the sentiment preprocessor further down.
# The lexicon is loaded from an explicit path ('./vader_lexicon.txt').
vader = SentimentIntensityAnalyzer(lexicon_file='./vader_lexicon.txt')


# Dataset

In [None]:
# Load the HH-RLHF preference dataset and keep its 'train' split as a DataFrame.
hh_rlhf_splits = datasets.load_dataset('Anthropic/hh-rlhf')
df = pd.DataFrame(hh_rlhf_splits['train'])

## Preparation

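Change the format: reduce `chosen` and `rejected` to the last Assistant utterance only. The preceding dialogue (taken from `chosen`) is kept in a new `conversation` column.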

In [None]:
def _last_assistant_offset(text):
    """Return the index 11 characters past the start of the last 'Assistant:' marker.

    'Assistant:' is 10 characters long, so the offset also skips the single
    character that follows the marker.
    """
    return text.rfind('Assistant:') + 11


# Everything up to and including the final marker is the shared dialogue;
# what follows it is the answer itself.
df['conversation'] = df['chosen'].apply(lambda text: text[:_last_assistant_offset(text)])
df['chosen'] = df['chosen'].apply(lambda text: text[_last_assistant_offset(text):])
df['rejected'] = df['rejected'].apply(lambda text: text[_last_assistant_offset(text):])

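Rename `chosen`/`rejected` to `answer_0`/`answer_1`, randomly swap the two answers in about half of the rows, and record the position of the preferred answer in `label`.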

In [None]:
# Seed NumPy's global RNG so the random swap below is reproducible.
np.random.seed(123)
# From here on the two answers are identified by position; `label` says which
# position holds the preferred answer.
df.rename(columns={'chosen': 'answer_0', 'rejected': 'answer_1'}, inplace=True)
# Before any swapping the preferred (formerly "chosen") answer is in slot 0.
df['label'] = 0
# One True/False draw per row: True means "swap the two answers in this row".
swap_mask = np.random.choice([True, False], size=df.shape[0])
# `.values` strips the column labels so the right-hand side is assigned by
# position; presumably this is what makes the swap take effect, since pandas
# would otherwise align the columns by name.
df.loc[swap_mask, ['answer_0', 'answer_1']] = df.loc[swap_mask, ['answer_1', 'answer_0']].values
# In swapped rows the preferred answer now sits in slot 1.
df.loc[swap_mask, 'label'] = 1
# NOTE(review): this cell mutates `df` in place. Running it a second time without
# reloading `df` draws the same mask, swaps the same rows back to their original
# order, and still sets their label to 1 - the labels would then be wrong.

## Split

In [None]:
# Share of the non-test data that keeps its original labels (baseline set).
BASELINE_SPLIT = 0.1

# First hold out 10% of all rows as a test set ...
df_remaining, df_test = train_test_split(
    df,
    test_size=0.1,
    random_state=123,
)
# ... then split the rest: `df_train` receives the BASELINE_SPLIT share,
# `df_weak` the remainder.
df_weak, df_train = train_test_split(
    df_remaining,
    test_size=BASELINE_SPLIT,
    random_state=123,
)

tuple(frame.shape for frame in (df_weak, df_train, df_test))

# Labeling Functions

In [None]:
# Return values used by every labeling function below: either abstain or vote
# for the answer slot (0 or 1) that the heuristic prefers.
ABSTAIN = -1
PREFER_ANSWER_0 = 0
PREFER_ANSWER_1 = 1

Length

In [None]:
# Minimum difference in character count before the length heuristic votes.
LENGTH_DIFFERENCE_LOWER_THRESHOLD = 20


@labeling_function()
def length(x):
    """Vote for the longer answer, abstaining when the lengths are too close.

    Lengths are measured in characters via `len`.
    """
    size_gap = len(x.answer_1) - len(x.answer_0)
    if abs(size_gap) < LENGTH_DIFFERENCE_LOWER_THRESHOLD:
        return ABSTAIN
    if size_gap > 0:
        return PREFER_ANSWER_1
    return PREFER_ANSWER_0

Reading ease

In [None]:
# The reading-ease heuristic only votes when the score gap lies within
# [LOWER, UPPER]; smaller or larger gaps make it abstain.
READING_EASE_DIFFERENCE_LOWER_THRESHOLD = 5
READING_EASE_DIFFERENCE_UPPER_THRESHOLD = 50


@preprocessor(memoize=True)
def reading_ease_processor(x):
    """Attach `textstat.flesch_reading_ease` of both answers to the data point."""
    for slot in ("answer_0", "answer_1"):
        setattr(x, f"{slot}_reading_ease", textstat.flesch_reading_ease(getattr(x, slot)))
    return x


@labeling_function(pre=[reading_ease_processor])
def reading_ease(x):
    """Vote for the answer with the lower reading-ease score.

    Abstains when the absolute score difference is below the lower threshold
    or above the upper threshold.
    """
    score_gap = abs(x.answer_0_reading_ease - x.answer_1_reading_ease)
    too_close = score_gap < READING_EASE_DIFFERENCE_LOWER_THRESHOLD
    too_far = score_gap > READING_EASE_DIFFERENCE_UPPER_THRESHOLD
    if too_far or too_close:
        return ABSTAIN
    if x.answer_1_reading_ease < x.answer_0_reading_ease:
        return PREFER_ANSWER_1
    return PREFER_ANSWER_0

Lexical diversity

In [None]:
# Minimum difference in the diversity score before the heuristic votes.
LEXICAL_DIVERSITY_DIFFERENCE_LOWER_THRESHOLD = 0.1


@preprocessor(memoize=True)
def lexical_diversity_processor(x):
    """Attach the `ld.ttr` score of each `ld.flemmatize`d answer to the data point."""
    for slot in ("answer_0", "answer_1"):
        lemmas = ld.flemmatize(getattr(x, slot))
        setattr(x, f"{slot}_lexical_diversity", ld.ttr(lemmas))
    return x


@labeling_function(pre=[lexical_diversity_processor])
def lexical_diversity(x):
    """Vote for the answer with the lower diversity score.

    Abstains when the two scores differ by less than the lower threshold.
    """
    diversity_gap = x.answer_0_lexical_diversity - x.answer_1_lexical_diversity
    if abs(diversity_gap) < LEXICAL_DIVERSITY_DIFFERENCE_LOWER_THRESHOLD:
        return ABSTAIN
    if x.answer_1_lexical_diversity < x.answer_0_lexical_diversity:
        return PREFER_ANSWER_1
    return PREFER_ANSWER_0

Vader polarity

In [None]:
# Minimum difference in the compound polarity score before the heuristic votes.
VADER_POLARITY_DIFFERENCE_LOWER_THRESHOLD = 0.05


@preprocessor(memoize=True)
def sentiment_processor(x):
    """Attach the 'compound' entry of `vader.polarity_scores` for both answers."""
    for slot in ("answer_0", "answer_1"):
        scores = vader.polarity_scores(getattr(x, slot))
        setattr(x, f"{slot}_polarity", scores['compound'])
    return x


@labeling_function(pre=[sentiment_processor])
def vader_polarity(x):
    """Vote for the answer with the higher compound polarity.

    Abstains when the two scores differ by less than the lower threshold.
    """
    polarity_gap = x.answer_0_polarity - x.answer_1_polarity
    if abs(polarity_gap) < VADER_POLARITY_DIFFERENCE_LOWER_THRESHOLD:
        return ABSTAIN
    if x.answer_1_polarity > x.answer_0_polarity:
        return PREFER_ANSWER_1
    return PREFER_ANSWER_0

Amount of numbers

In [None]:
# Minimum difference in the count of numbers before the heuristic votes.
AMOUNT_OF_NUMBERS_DIFFERENCE_LOWER_THRESHOLD = 3
# The original name was misspelled; it is kept as an alias so that any existing
# reference to it keeps working.
AMOUTN_OF_NUMBERS_DIFFERENCE_LOWER_THRESHOLD = AMOUNT_OF_NUMBERS_DIFFERENCE_LOWER_THRESHOLD

# Standalone runs of digits delimited by word boundaries; compiled once instead
# of being re-parsed for every answer.
_NUMBER_PATTERN = re.compile(r'\b\d+\b')


@preprocessor(memoize=True)
def amount_of_numbers_processor(x):
    """Attach the count of standalone digit runs in each answer to the data point."""
    x.answer_0_amount_of_numbers = len(_NUMBER_PATTERN.findall(x.answer_0))
    x.answer_1_amount_of_numbers = len(_NUMBER_PATTERN.findall(x.answer_1))
    return x


@labeling_function(pre=[amount_of_numbers_processor])
def amount_of_numbers(x):
    """Vote for the answer that contains more numbers.

    Abstains when the two counts differ by less than the lower threshold.
    """
    count_gap = x.answer_1_amount_of_numbers - x.answer_0_amount_of_numbers
    if abs(count_gap) < AMOUNT_OF_NUMBERS_DIFFERENCE_LOWER_THRESHOLD:
        return ABSTAIN
    return PREFER_ANSWER_1 if count_gap > 0 else PREFER_ANSWER_0

In [None]:
# Patterns whose presence counts in favour of an answer. All patterns are raw
# strings and are used with `re.search`, i.e. they may match anywhere in the text.
good_regular_expressions = [
    r'\b(?:long|extensive) (?:history|background)\b',
    r'\b(?:various|several|many) (?:options|choices)\b',
    r'.*same.*again.*',
    r'Can you clarify.*',
    r'You’re welcome',
    r'According to.*',
    r'\b(\w+)\s+\1\b',
    r"(?:it could be|it's possible that) \b\w+\b",
    r'\b(?:hard|tricky) to (?:say|determine|decide)\b',
    r'(?:(?:for example|such as) \b\w+\b)',
    r"\b(?:pros and cons|advantages and disadvantages)\b",
    r"be (kind|compassionate|understanding)",
    r"(?:overview|summary) of",
    r"(understand|comprehend|grasp)\b.*\b(your|the) (question|point|concern)",
]

# Patterns whose presence counts against an answer.
bad_regular_expressions = [
    r'\b(?:I guess|maybe|perhaps)\b.*',
    r'I see.*',
    # Must be a raw string: in a normal string literal '\1' is the control
    # character \x01, not a backreference, so "the same character repeated four
    # or more times" would never be detected.
    r'(.)\1{3,}',
    r'\b(?:best|most effective) (?:practice|approach)\b',
    r'\b(?:not sure|unclear) (?:about|regarding)\b',
    r"(?:I'm not sure|I don't know) (?:about|if)",
    r"\b(how|what|why|where|when|who)\b.{0,95}[\?\!]",
]

@preprocessor(memoize=True)
def regular_expressions_processor(x):
    """Count, per answer, how many good and how many bad patterns match.

    Each pattern contributes at most 1 to a count, however often it matches.
    """
    def matching_patterns(patterns, text):
        return sum(1 for pattern in patterns if re.search(pattern, text))

    x.answer_0_good_regular_expressions = matching_patterns(good_regular_expressions, x.answer_0)
    x.answer_1_good_regular_expressions = matching_patterns(good_regular_expressions, x.answer_1)
    x.answer_0_bad_regular_expressions = matching_patterns(bad_regular_expressions, x.answer_0)
    x.answer_1_bad_regular_expressions = matching_patterns(bad_regular_expressions, x.answer_1)
    return x


@labeling_function(pre=[regular_expressions_processor])
def regex(x):
    """Vote for the answer with the higher (good matches - bad matches) score.

    Abstains when both answers reach the same score.
    """
    score_0 = x.answer_0_good_regular_expressions - x.answer_0_bad_regular_expressions
    score_1 = x.answer_1_good_regular_expressions - x.answer_1_bad_regular_expressions
    if score_0 == score_1:
        return ABSTAIN
    if score_1 > score_0:
        return PREFER_ANSWER_1
    return PREFER_ANSWER_0

In [None]:
def keyword_lookup(x, keywords):
    """Vote for the answer that contains fewer of the given keywords.

    A keyword counts once per answer if it occurs as a space-delimited token,
    or, for multi-word keywords, as a run of consecutive space-delimited
    tokens. Matching is case-sensitive and punctuation is not stripped.

    Args:
        x: data point with string attributes `answer_0` and `answer_1`.
        keywords: iterable of keyword strings (single words or phrases).

    Returns:
        ABSTAIN if both answers contain the same number of keywords, otherwise
        the label of the answer with fewer keyword hits.
    """
    def count_hits(text):
        # Padding both sides with a space lets " keyword " match only on
        # space boundaries. For single words this is the same as testing
        # membership in text.split(" "); unlike that test it also finds
        # phrases such as "blow job", which a single token can never equal.
        padded = f" {text} "
        return sum(1 for keyword in keywords if f" {keyword} " in padded)

    # The counts are stored on the data point, as the original implementation did.
    x.answer_0_amount_of_word = count_hits(x.answer_0)
    x.answer_1_amount_of_word = count_hits(x.answer_1)
    if x.answer_0_amount_of_word == x.answer_1_amount_of_word:
        return ABSTAIN
    # More hits in answer_0 means answer_1 is the preferred one.
    return PREFER_ANSWER_1 if x.answer_0_amount_of_word > x.answer_1_amount_of_word else PREFER_ANSWER_0


def make_keyword_lf(keywords, name):
    """Build a labeling function named ``keyword_<name>`` around `keyword_lookup`.

    The keyword list is handed to `keyword_lookup` through the labeling
    function's `resources`.
    """
    lf_name = f"keyword_{name}"
    lf_resources = {"keywords": keywords}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)

# lists are from the internet
# Entries are matched case-sensitively by the keyword labeling functions.
swear_words = [
    'anal', 'anus', 'arse', 'ass', 'stupid', 'ballsack', 'balls', 'bastard', 'bitch', 'biatch',
    'bloody', 'blowjob', 'blow job', 'bollock', 'bollok', 'boner', 'boob', 'bugger', 'bum', 'butt',
    'buttplug', 'clitoris', 'cock', 'coon', 'crap', 'cunt', 'damn', 'dick', 'dildo', 'dyke',
    'fag', 'feck', 'fellate', 'fellatio', 'felching', 'fuck', 'f u c k', 'fudgepacker', 'fudge packer', 'flange',
    'Goddamn', 'God damn', 'hell', 'homo', 'jerk', 'jizz', 'knobend', 'knob end', 'labia', 'lmao',
    'lmfao', 'muff', 'nigger', 'nigga', 'omg', 'penis', 'piss', 'poop', 'prick', 'pube',
    'pussy', 'queer', 'scrotum', 'sex', 'shit', 's hit', 'sh1t', 'slut', 'smegma', 'spunk',
    'tit', 'tosser', 'turd', 'twat', 'vagina', 'wank', 'whore', 'wtf',
]
bad_words = [
    'raging boner', 'bukkake', 'gun', 'one guy one jar', 'fake', 'muff diver', 'coprolagnia', 'quim',
    'babeland', 'yellow showers', 'suicide girls', 'panty', 'gangbang', 'jiggerboo', 'paedophile', 'girl on top',
    'handjob', 'livesex', 'asshole', 'futanari', 'juggs', 'lolita', 'sexual', 'tranny',
    'panties', 'deepthroat', 'jack off', 'big knockers', 'hooker', 'camgirl', 'dp action', 'big breasts',
    'tea bagging', 'jail bait', 'viagra', 'doggie style', 'missionary position', 'erotism', 'fecal', 'felch',
    'creampie', 'ejaculation', 'birdlock', 'escort', 'g-spot', 'camel toe', 'tight white', 'nympho',
    'muffdiving', 'bareback', 'tribadism', 'eunuch', 'bimbos', 'pedobear', 'fucktards', 'splooge',
    'blonde on blonde action', 'brown showers', 'rusty trombone', 'jiggaboo', 'nsfw images', 'sultry women', 'humping', 'skeet',
    'carpetmuncher', 'dingleberries', 'mr hands', '2 girls 1 cup', 'vulva', 'rimjob', 'domination', 'strip club',
    'beaver cleaver', 'violet wand', 'wrinkled starfish', 'wetback', 'strap on', 'semen', 'mound of venus', 'phone sex',
    'voyuer', 'bangbros', 'swinger', 'cumshots', 'masturbation', 'dry hump', 'spread legs', 'pthc',
    'shemale', 'dingleberry', 'leather straight jacket', 'coons', 'bangbus', 'arsehole', 'shitblimp', 'big tits',
    'snowballing', 'alabama hot pocket', 'group sex', 'rape', 'date rape', 'nawashi', 'cornhole', 'pubes',
    'poopchute', 'blue waffle', 'horny', 'masturbate', 'donkey punch', 'kike', 'booty call', 'hentai',
    'ball sack', 'hot chick', 'coprophilia', 'ball sucking', 'giant cock', 'butthole', 'dirty sanchez', 'beaners',
    'cumshot', 'hot carl', 'sucks', 'nambla', 'goregasm', 'ecchi', 'sexuality', 'santorum',
    'poof', 'baby juice', 'femdom', 'one cup two girls', 'ball gravy', 'piece of shit', 'shitty', '2g1c',
    'snatch', 'pornography', 'venus mound', 'poop chute', 'fisting', 'nutten', 'undressing', 'milf',
    'rosy palm', 'male squirting', 'orgasm', 'big black', 'autoerotic', 'zoophilia', 'hardcore', 'nude',
    'xxx', 'lovemaking', 'pisspig',
    'yiffy', 'playboy', 'goodpoop', 'rimming', 'queaf', 'female squirting', 'shibari', 'bondage',
    'titty', 'dvda', 'erotic', 'dominatrix', 'girl on', 'honkey', 'boobs', 'pegging',
    'topless', 'sexo', 'raghead', 'baby batter', 'dommes', 'swastika', 'camwhore', 'nipples',
    'sadism', 'urophilia', 'vibrator', 'dirty pillows', 'doggystyle', 'buttcheeks', 'mong', 'omorashi',
    'scissoring', 'foot fetish', 'footjob', 'spic', 'golden shower', 'bung hole', 'shaved beaver', 'white power',
    'porn', 'dog style', 'nsfw', 'prince albert piercing', 'black cock', 'blonde action', 'chocolate rosebuds', 'kinbaku',
    'pissing', 'rapist', 'reverse cowgirl', 'bulldyke', 'poontang', 'raping', 'tubgirl', 'double dong',
    'god damn', 'upskirt', 'xx', 'taste my', 'fuck buttons', 'tub girl', 'poon', 'nipple',
    'knobbing', 'leather restraint', 'goatse', 'kinky', 'daterape', 'threesome', 'rectum', 'twinkie',
    'bullet vibe', 'fuckin', 'cumming', 'bbw', 'hard core', 'blumpkin', 'paki', 'nimphomania',
    'feltch', 'thumbzilla', 'gay sex', 'bitches', 'menage a trois', 'beaver lips', 'fingering', 'how to kill',
    'queef', 'throating', 'gokkun', 'auto erotic', 'beaner', 'twink', 'clit', 'frotting',
    'eat my ass', 'huge fat', 'tits', 'worldsex', 'make me come', 'anilingus', 'pleasure chest', 'goo girl',
    'masturbating', 'pikey', 'style doggy', 'sexually', 'girls gone wild', 'nymphomania', 'hand job', 'dolcett',
    'nudity', 'bastinado', 'piss pig', 'cialis', 'jailbait', 'lemon party', 'bollocks', 'grope',
    's&m', 'vorarephilia', 'barenaked', 'how to murder', 'slanteye', 'cum', 'tongue in a', 'towelhead',
    'voyeur', 'spooge', 'busty', 'voyeurweb', 'jelly donut', 'doggiestyle', 'blow your load', 'beastiality',
    'sodomize', 'gang bang', 'neonazi', 'double penetration', 'sodomy', 'spastic', 'nig nog', 'brunette action',
    'splooge moose', 'urethra play', 'barely legal', 'pole smoker', 'tied up', 'bastardo', 'bunghole', 'two girls one cup',
    'orgy', 'doggy style', 'smut', 'genitals', 'guro', 'carpet muncher', 'shaved pussy',
    # This entry was previously split across two physical lines inside the
    # string literal, which is a syntax error.
    'rosy palm and her 5 sisters',
    'porno', 'scat', 'figging', 'ball licking', 'acrotomophilia', 'clusterfuck', 'motherfucker', 'apeshit',
    'ponyplay', 'schlong', 'deep throat', 'assmunch', 'dendrophilia', 'titties', 'punany', 'suck',
    'sexy', 'shrimping', 'faggot', 'tushy', 'incest', 'octopussy', 'wrapping men', 'wet dream',
    'kinkster', 'jigaboo', 'fingerbang', 'strapon', 'bestiality', 'cleveland steamer', 'sexcam', 'ball gag',
    'fucking', 'negro', 'goatcx', 'cunnilingus', 'homoerotic', 'strappado', 'circlejerk', 'bullshit',
    'cocks', 'bdsm', 'tainted love', 'alaskan pipeline', 'ball kicking', 'yaoi', 'shota', 'pedophile',
    'darkie', 'intercourse', 'jerk off', 'clover clamps', 'camslut',
]
harmful_words = [
    "steal", "theft", "victim", "rob", "shoot", "burglarize", "embezzle", "fraud",
    "scam", "phish", "hack", "malware", "virus", "trojan", "assault", "battery",
    "fight", "attack", "drugs", "meth", "cocaine", "heroin", "kidnap", "abduct",
    "murder", "kill", "homicide", "bribe", "extort", "smuggle", "trafficking", "forge",
    "counterfeit", "vandalize", "deface", "graffiti", "trespass", "break-in", "stalk", "harass",
]
# One keyword labeling function per word list; make_keyword_lf names each one
# "keyword_<suffix>" using the second argument.
keywords_offensive = make_keyword_lf(swear_words, 'offensive')
keywords_bad = make_keyword_lf(bad_words, 'bad')
keywords_harmful = make_keyword_lf(harmful_words, 'harmful')

## Label Model

In [None]:
# All labeling functions that are applied to build the label matrices below.
lfs = [
    length,
    reading_ease,
    lexical_diversity,
    vader_polarity,
    amount_of_numbers,
    regex,
    keywords_offensive,
    keywords_harmful,
    keywords_bad
]

Apply and evaluate LFs

In [None]:
# Build the label matrix for the baseline (gold-labelled) rows and summarise
# each labeling function against the known labels.
applier = PandasLFApplier(lfs=lfs)
L_calibration = applier.apply(df=df_train)

calibration_analysis = LFAnalysis(L=L_calibration, lfs=lfs)
calibration_analysis.lf_summary(df_train.label.values)

In [None]:
# Fit the label model on the calibration label matrix, assuming both classes
# are equally likely.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(
    L_train=L_calibration,
    class_balance=[0.5,0.5],
    n_epochs=100,
    seed=123,
    l2=0.5,
    optimizer='adam',
)

# Score the fitted model against the gold labels of the same calibration rows.
probs_calibration = label_model.predict_proba(L=L_calibration)
preds_calibration = probs_to_preds(probs=probs_calibration)
accuracy = metric_score(
    golds=df_train.label.values,
    preds=preds_calibration,
    metric="accuracy",
)
print(f"Label Model accuracy on baseline calibration set: {accuracy * 100:.2f}%")

Applying to unlabeled data

In [None]:
# Apply the same labeling functions to the rows that will receive weak labels.
L_weak = applier.apply(df=df_weak)

## Confidence

In [None]:
# Rows whose confidence is not strictly above this value are dropped.
CONFIDENCE_THRESHOLD = 0.0
# The original name was misspelled; it is kept as an alias so that any existing
# reference to it keeps working.
CONFIDENCE_TRESHOLD = CONFIDENCE_THRESHOLD

# Obtain predictions and class probabilities in a single pass instead of
# running the label model twice over the same matrix.
weak_preds, weak_probs = label_model.predict(
    L=L_weak, return_probs=True, tie_break_policy="random"
)

# `assign` returns a new frame, so no columns are written onto the slice that
# train_test_split returned (avoids pandas' SettingWithCopyWarning).
# Confidence is the probability of the predicted class.
df_weak = df_weak.assign(
    weak_label=weak_preds,
    confidence=weak_probs.max(axis=1),
)

# `.copy()` makes the filtered frame independent, so later cells can add
# columns to it without warnings.
df_weak = df_weak[df_weak.confidence > CONFIDENCE_THRESHOLD].copy()

Preparation of weakly labeled set

In [None]:
# Rows where the label model voted for slot 1 take answer_1 as the chosen text;
# every other row keeps answer_0 as chosen.
weak_prefers_1 = df_weak.weak_label == 1
weak_chosen = df_weak['answer_1'].where(weak_prefers_1, df_weak['answer_0'])
weak_rejected = df_weak['answer_0'].where(weak_prefers_1, df_weak['answer_1'])

# Re-attach the dialogue prefix and keep only the two columns of the
# chosen/rejected format.
df_weak = pd.DataFrame({
    'chosen': df_weak['conversation'] + weak_chosen,
    'rejected': df_weak['conversation'] + weak_rejected,
})

Preparation of baseline set

In [None]:
# Rows whose gold label is 1 take answer_1 as the chosen text; every other row
# keeps answer_0 as chosen.
gold_prefers_1 = df_train.label == 1
baseline_chosen = df_train['answer_1'].where(gold_prefers_1, df_train['answer_0'])
baseline_rejected = df_train['answer_0'].where(gold_prefers_1, df_train['answer_1'])

# Re-attach the dialogue prefix and keep only the two columns of the
# chosen/rejected format.
df_train = pd.DataFrame({
    'chosen': df_train['conversation'] + baseline_chosen,
    'rejected': df_train['conversation'] + baseline_rejected,
})

Combination

In [None]:
# Stack the weakly labelled rows on top of the baseline rows, then shuffle all
# rows with a fixed seed. The original index values are kept.
df_export = (
    pd.concat([df_weak, df_train])
    .sample(frac=1, random_state=123)
)

Export

In [None]:
# The file name records the number of baseline rows and weakly labelled rows.
DATASET_NAME = f'HH-RLHF-weakly-labeled-set_{df_train.shape[0]}x{df_weak.shape[0]}'
export_path = f'../datasets/HH-RLHF/{DATASET_NAME}.jsonl'
with open(export_path, 'wt', encoding='utf-8') as f:
    # One JSON record per line.
    f.write(df_export.to_json(orient='records', lines=True))