In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of two sequences.

    Args:
        a: First sequence (the notebook passes strings).
        b: Second sequence.

    Returns:
        float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
# Hugging Face Hub dataset identifier; passed to load_dataset() for both splits
# and, at the end of the notebook, to push_to_hub().
DATASET = "mosaicml/dolly_hhrlhf"

In [None]:
# Load the 'train' and 'test' splits of DATASET, in that order
dolly_train, dolly_test = (
    load_dataset(DATASET, split=split_name) for split_name in ('train', 'test')
)

In [None]:
# Materialise each split as a pandas DataFrame for the cleaning steps
dolly_train_df, dolly_test_df = map(pd.DataFrame, (dolly_train, dolly_test))

In [None]:
# Remove the prompt template from the prompt column
INSTRUCTION_MARKER = '\n\n### Instruction:\n'
RESPONSE_MARKER = '\n\n### Response:\n'


def extract_instruction(prompt):
    """Return the instruction text embedded in a templated prompt.

    The text between the instruction marker and the response marker is
    returned. A prompt that does not contain the instruction marker is
    returned unchanged, so running this cell again on already-cleaned
    prompts is a no-op instead of raising IndexError.

    Args:
        prompt: The raw (templated) or already-cleaned prompt string.

    Returns:
        The bare instruction string.
    """
    parts = prompt.split(INSTRUCTION_MARKER)
    if len(parts) < 2:
        # Marker absent: nothing to strip.
        return prompt
    return parts[1].split(RESPONSE_MARKER)[0]


dolly_train_df['prompt'] = dolly_train_df['prompt'].map(extract_instruction)
dolly_test_df['prompt'] = dolly_test_df['prompt'].map(extract_instruction)

In [None]:
# Sanity check: show the prompt stored under index label 0 in each split
print(dolly_train_df.loc[0, 'prompt'])
print(dolly_test_df.loc[0, 'prompt'])

In [None]:
# Strip leading and trailing whitespace from the text columns of both splits
for frame in (dolly_train_df, dolly_test_df):
    for column in ('prompt', 'response'):
        frame[column] = frame[column].map(lambda text: text.strip())

In [None]:
# Drop exact duplicate rows within each split (the first occurrence is kept)
dolly_train_df = dolly_train_df.drop_duplicates()
dolly_test_df = dolly_test_df.drop_duplicates()

In [None]:
# Stack train on top of test and display the rows that repeat an earlier row
dolly_df = pd.concat((dolly_train_df, dolly_test_df))
dolly_df.loc[dolly_df.duplicated()]

In [None]:
# Resolve the overlap in favour of the test set: keep only the train rows
# that have no identical row in test (outer merge + indicator as an anti-join)
dolly_train_df = (
    dolly_train_df
    .merge(dolly_test_df, how='outer', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
)

In [None]:
# Rebuild the stacked frame; the duplicate listing below is expected to be empty
dolly_df = pd.concat((dolly_train_df, dolly_test_df))
dolly_df.loc[dolly_df.duplicated()]

In [None]:
# Keep only the rows whose response is not the empty string
dolly_train_df = dolly_train_df.loc[dolly_train_df['response'].ne('')]
dolly_test_df = dolly_test_df.loc[dolly_test_df['response'].ne('')]

In [None]:
# List the rows whose prompt is shorter than three characters, per split
print(dolly_train_df.loc[dolly_train_df['prompt'].map(len).lt(3)])
print()
print(dolly_test_df.loc[dolly_test_df['prompt'].map(len).lt(3)])

In [None]:
# Drop the train rows whose prompt is exactly the single character 'c'
dolly_train_df = dolly_train_df.loc[dolly_train_df['prompt'].ne('c')]

In [None]:
# Cut-off on the prompt/response similarity ratio returned by similar():
# rows scoring at or above it are displayed for review and then dropped.
SIMILARITY_THRESHOLD = 0.85

In [None]:
# Preview the train rows whose prompt/response similarity reaches the
# threshold, i.e. the prompt is nearly the same text as the response (for
# example because it already includes it). Adjust SIMILARITY_THRESHOLD to see
# which samples would be deleted.
dolly_train_df.loc[
    lambda frame: frame.apply(
        lambda row: similar(row['prompt'], row['response']), axis=1
    ).ge(SIMILARITY_THRESHOLD)
]

In [None]:
# Preview the test rows whose prompt/response similarity reaches the threshold
dolly_test_df.loc[
    lambda frame: frame.apply(
        lambda row: similar(row['prompt'], row['response']), axis=1
    ).ge(SIMILARITY_THRESHOLD)
]

In [None]:
# Keep only the rows whose prompt/response similarity is below the threshold
dolly_train_df = dolly_train_df.loc[
    lambda frame: frame.apply(
        lambda row: similar(row['prompt'], row['response']), axis=1
    ).lt(SIMILARITY_THRESHOLD)
]
dolly_test_df = dolly_test_df.loc[
    lambda frame: frame.apply(
        lambda row: similar(row['prompt'], row['response']), axis=1
    ).lt(SIMILARITY_THRESHOLD)
]

In [None]:
# Creating the huggingface dataset from the pandas dataframe
from datasets import Dataset, DatasetDict

# preserve_index=False drops the pandas index at conversion time, so no index
# column reaches the dataset. Removing '__index_level_0__' after the fact
# raises whenever that column is absent, which is the case when a frame still
# has a plain RangeIndex (from_pandas only emits the column for other indexes).
train_ds = Dataset.from_pandas(dolly_train_df, preserve_index=False)
test_ds = Dataset.from_pandas(dolly_test_df, preserve_index=False)

# Bundle the cleaned splits under their original split names
ds = DatasetDict({'train': train_ds, 'test': test_ds})

print(ds)

In [None]:
# You can upload the dataset to the HuggingFace Hub
# NOTE(review): DATASET is the same identifier the data was loaded from, so this
# pushes to that repository — confirm the target (and write access) before running.
ds.push_to_hub(DATASET)