In [1]:
def levenshtein_distance(s1, s2):
    """Compute the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.

    Args:
        s1: First sequence (anything supporting ``len`` and iteration).
        s2: Second sequence of the same kind.

    Returns:
        The edit distance as a non-negative integer.
    """
    # Keep the shorter sequence on the inner axis so each row is as small
    # as possible; the distance itself is symmetric in its arguments.
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)

    # Against an empty sequence every element of the other one must be inserted.
    if not len(shorter):
        return len(longer)

    # Row 0 of the DP table: cost of building each prefix of `shorter` from nothing.
    prev_costs = list(range(len(shorter) + 1))
    for row_no, item_long in enumerate(longer, start=1):
        # First cell of a row: cost of deleting the first `row_no` elements.
        row_costs = [row_no]
        for col, item_short in enumerate(shorter):
            via_substitution = prev_costs[col] + (item_long != item_short)
            via_insertion = prev_costs[col + 1] + 1
            via_deletion = row_costs[col] + 1
            row_costs.append(min(via_insertion, via_deletion, via_substitution))
        prev_costs = row_costs
    return prev_costs[-1]

In [2]:
def similarity_coefficient(text1, text2):
    """Return a length-normalized similarity score for two sequences.

    The score is ``1 - distance / max_length`` where ``distance`` is the
    Levenshtein distance between the inputs and ``max_length`` is the length
    of the longer one. Identical inputs score 1.0; the score decreases as
    more edits are needed.

    Args:
        text1: First sequence.
        text2: Second sequence.

    Returns:
        A float similarity score. Two empty inputs are treated as identical
        and score 1.0.
    """
    max_length = max(len(text1), len(text2))
    if max_length == 0:
        # Both inputs are empty: they are identical, and dividing by the
        # zero length below would raise ZeroDivisionError.
        return 1.0
    distance = levenshtein_distance(text1, text2)
    return 1 - distance / max_length

In [3]:
def get_similar_field(sample):
    """Return the name of the neutral-comment field closest to the toxic comment.

    Each of ``neutral_comment1`` .. ``neutral_comment3`` is compared with
    ``sample['toxic_comment']`` by Levenshtein distance, and the field name
    with the smallest distance is returned. On a tie the earliest field wins.

    Args:
        sample: Mapping holding the three neutral-comment fields and
            ``toxic_comment``.

    Returns:
        The key (not the text) of the closest neutral comment.

    Raises:
        IndexError: If none of the three neutral fields holds a usable value.
    """
    neutral_fields = ('neutral_comment1', 'neutral_comment2', 'neutral_comment3')

    # Float values are skipped; presumably these are the NaN placeholders
    # pandas uses for missing cells -- confirm against the input data.
    scored = [
        (levenshtein_distance(sample[field], sample['toxic_comment']), field)
        for field in neutral_fields
        if not isinstance(sample[field], float)
    ]

    if not scored:
        # Same exception type the bare `distances[0]` lookup used to raise,
        # but with a message that points at the actual problem.
        raise IndexError(
            "no usable neutral_comment field in sample: "
            + ", ".join(neutral_fields)
            + " are all missing"
        )

    # min() returns the first minimal item, so ties resolve to the earliest
    # field, exactly as a stable sort followed by [0] would.
    return min(scored, key=lambda pair: pair[0])[1]

In [4]:
import pandas as pd

# Fetch the dev / test / train TSV files from the s-nlp/russe_detox_2022
# GitHub repository. The files are tab-separated, so the delimiter is
# passed explicitly.
dev_df = pd.read_csv(
    "https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/dev.tsv",
    sep="\t",
)
test_df = pd.read_csv(
    "https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/test.tsv",
    sep="\t",
)
train_df = pd.read_csv(
    "https://raw.githubusercontent.com/s-nlp/russe_detox_2022/main/data/input/train.tsv",
    sep="\t",
)

In [5]:
# Prompt shared by every record of every split.
TOXIFY_INSTRUCTION = "Перефразируй нетоксичный текст так, чтобы он стал токсичным, сохраняя при этом исходный смысл, орфографию и пунктуацию."


def _build_samples(rows, with_neutral_input=True):
    """Convert ``{index: row_dict}`` into instruction/input/output records.

    Args:
        rows: Mapping of row index to a row dict, as produced by
            ``DataFrame.to_dict(orient='index')``.
        with_neutral_input: When True, ``input`` is the neutral comment that
            ``get_similar_field`` picks for the row; when False, ``input`` is
            an empty string.

    Returns:
        A list with one record dict per row, in the mapping's iteration order.
    """
    samples = []
    for row in rows.values():
        neutral_text = row[get_similar_field(row)] if with_neutral_input else ''
        samples.append({
            "instruction": TOXIFY_INSTRUCTION,
            "input": neutral_text,
            "output": row['toxic_comment'],
        })
    return samples


# DEV split: pair each toxic comment with its closest neutral paraphrase.
dev_dict = dev_df.to_dict(orient='index')
dev_samples = _build_samples(dev_dict)

# TEST split: the input is left empty (presumably the test file carries no
# neutral references -- confirm against the data).
test_dict = test_df.to_dict(orient='index')
test_samples = _build_samples(test_dict, with_neutral_input=False)

# TRAIN split: same pairing as DEV.
train_dict = train_df.to_dict(orient='index')
train_samples = _build_samples(train_dict)

In [8]:
from datasets import Dataset, DatasetDict

# Combine the per-split record lists into a single dataset with named splits.
split_records = {
    'dev': dev_samples,
    'test': test_samples,
    'train': train_samples,
}
dataset_dict = DatasetDict({
    split_name: Dataset.from_list(records)
    for split_name, records in split_records.items()
})

# Upload the dataset to the Hugging Face Hub.
dataset_dict.push_to_hub('evilfreelancer/toxicator-ru')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/evilfreelancer/toxicator-ru/commit/4392d60d6e40119d6de9c2d80ec530b798647bb5', commit_message='Upload dataset', commit_description='', oid='4392d60d6e40119d6de9c2d80ec530b798647bb5', pr_url=None, pr_revision=None, pr_num=None)