In [1]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="Manual Annotated -Done - fake_news_detection_dataset_200.csv")

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_idn', 'evidence_idn', 'claim_arb', 'evidence_arb'],
        num_rows: 200
    })
})

In [10]:
from datasets import DatasetDict

# Function to generate combinations
def generate_combinations(batch):
    # Initialize lists to collect the new data
    new_claim_ids = []
    new_claims = []
    new_evidences = []
    new_evidence_labels = []
    new_labels = []
    new_categories = []
    new_claim_languages = []
    new_evidence_languages = []

    # Iterate over the batch
    for i in range(len(batch['claim'])):
        claims = {
            'en': batch['claim'][i],
            'idn': batch['claim_idn'][i],
            'arb': batch['claim_arb'][i]
        }
        evidences = {
            'en': batch['evidence'][i],
            'idn': batch['evidence_idn'][i],
            'arb': batch['evidence_arb'][i]
        }

        # Generate combinations
        for claim_lang, claim_text in claims.items():
            for evidence_lang, evidence_text in evidences.items():
                new_claim_ids.append(batch['claim_id'][i])
                new_claims.append(claim_text)
                new_evidences.append(evidence_text)
                new_evidence_labels.append(batch['evidence_label'][i])
                new_labels.append(batch['label'][i])
                new_categories.append(batch['category'][i])
                new_claim_languages.append(claim_lang)
                new_evidence_languages.append(evidence_lang)

    # Return a dictionary of lists
    return {
        'claim_id': new_claim_ids,
        'claim': new_claims,
        'evidence': new_evidences,
        'evidence_label': new_evidence_labels,
        'label': new_labels,
        'category': new_categories,
        'claim_language': new_claim_languages,
        'evidence_language': new_evidence_languages
    }

# Process each split individually
new_datasets = {}

for split in dataset.keys():
    print(f"Processing split: {split}")
    split_dataset = dataset[split]

    # Apply the mapping function to generate combinations
    new_split_dataset = split_dataset.map(
        generate_combinations,
        remove_columns=['claim', 'evidence', 'claim_idn', 'evidence_idn', 'claim_arb', 'evidence_arb'],
        batched=True,
    )

    # Add the new split dataset to the new_datasets dict
    new_datasets[split] = new_split_dataset

# Create a new DatasetDict with the processed splits
new_dataset = DatasetDict(new_datasets)

# Create subsets for each language combination
subsets = {}
languages = ['en', 'idn', 'arb']

for split in new_dataset.keys():
    split_dataset = new_dataset[split]
    for claim_lang in languages:
        for evidence_lang in languages:
            subset_name = f'{split}_claim_{claim_lang}_evidence_{evidence_lang}'
            filtered_dataset = split_dataset.filter(
                lambda example: example['claim_language'] == claim_lang and example['evidence_language'] == evidence_lang
            )
            subsets[subset_name] = filtered_dataset  # filtered_dataset is a Dataset

# Create a DatasetDict from the subsets
combined_dataset = DatasetDict(subsets)

# Verify the combined dataset
print(combined_dataset)

Processing split: train


Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1800 [00:00<?, ? examples/s]

DatasetDict({
    train_claim_en_evidence_en: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_en_evidence_idn: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_en_evidence_arb: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_idn_evidence_en: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_idn_evidence_idn: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],


In [11]:
combined_dataset.push_to_hub("fake_news_detection_dataset_cross_lingual", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Erland/fake_news_detection_dataset_cross_lingual/commit/13a66b9628c0ce7f4c1dee40cac9f0286115c820', commit_message='Upload dataset', commit_description='', oid='13a66b9628c0ce7f4c1dee40cac9f0286115c820', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Erland/fake_news_detection_dataset_cross_lingual', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Erland/fake_news_detection_dataset_cross_lingual'), pr_revision=None, pr_num=None)

In [15]:
from datasets import load_dataset

dataset = load_dataset("Erland/fake_news_detection_dataset_cross_lingual")
dataset

DatasetDict({
    train_claim_en_evidence_en: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_en_evidence_idn: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_en_evidence_arb: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_idn_evidence_en: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],
        num_rows: 200
    })
    train_claim_idn_evidence_idn: Dataset({
        features: ['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category', 'claim_language', 'evidence_language'],


# Transform into text

In [16]:
# Define the formatting function
def format_examples(batch):
    texts = [f"{claim}. Evidence: {evidence}" for claim, evidence in zip(batch['claim'], batch['evidence'])]
    labels = batch['evidence_label']
    return {'text': texts, 'labels': labels}

# Apply the function to each subset
for subset_name in dataset.keys():
    print(f"Processing subset: {subset_name}")
    subset_dataset = dataset[subset_name]
    formatted_dataset = subset_dataset.map(
        format_examples,
        batched=True,
        remove_columns=subset_dataset.column_names
    )
    dataset[subset_name] = formatted_dataset

# Verify the transformation
print(dataset)

# Print a few examples from one of the subsets
for example in dataset[list(dataset.keys())[0]].select(range(3)):
    print(example)



Processing subset: train_claim_en_evidence_en


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_en_evidence_idn


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_en_evidence_arb


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_idn_evidence_en


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_idn_evidence_idn


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_idn_evidence_arb


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_arb_evidence_en


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_arb_evidence_idn


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Processing subset: train_claim_arb_evidence_arb


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train_claim_en_evidence_en: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_en_evidence_idn: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_en_evidence_arb: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_idn_evidence_en: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_idn_evidence_idn: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_idn_evidence_arb: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_arb_evidence_en: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_arb_evidence_idn: Dataset({
        features: ['text', 'labels'],
        num_rows: 200
    })
    train_claim_arb_evidence_arb: Dataset({
        features: ['text', 'labels'],
        num_rows: 200


In [17]:
combined_dataset.push_to_hub("fake_news_detection_dataset_cross_lingual_formatted", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Erland/fake_news_detection_dataset_cross_lingual_formatted/commit/27e9ee1d1e181cf54dce1563366a234f63f3ddc1', commit_message='Upload dataset', commit_description='', oid='27e9ee1d1e181cf54dce1563366a234f63f3ddc1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Erland/fake_news_detection_dataset_cross_lingual_formatted', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Erland/fake_news_detection_dataset_cross_lingual_formatted'), pr_revision=None, pr_num=None)

In [18]:
dataset = load_dataset("Erland/fake_news_detection_dataset_cross_lingual_formatted", split="train_claim_en_evidence_en")

README.md:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

(…)im_en_evidence_en-00000-of-00001.parquet:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

(…)m_en_evidence_idn-00000-of-00001.parquet:   0%|          | 0.00/41.9k [00:00<?, ?B/s]

(…)m_en_evidence_arb-00000-of-00001.parquet:   0%|          | 0.00/52.0k [00:00<?, ?B/s]

(…)m_idn_evidence_en-00000-of-00001.parquet:   0%|          | 0.00/42.8k [00:00<?, ?B/s]

(…)_idn_evidence_idn-00000-of-00001.parquet:   0%|          | 0.00/38.2k [00:00<?, ?B/s]

(…)_idn_evidence_arb-00000-of-00001.parquet:   0%|          | 0.00/51.8k [00:00<?, ?B/s]

(…)m_arb_evidence_en-00000-of-00001.parquet:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

(…)_arb_evidence_idn-00000-of-00001.parquet:   0%|          | 0.00/49.8k [00:00<?, ?B/s]

(…)_arb_evidence_arb-00000-of-00001.parquet:   0%|          | 0.00/53.4k [00:00<?, ?B/s]

Generating train_claim_en_evidence_en split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_en_evidence_idn split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_en_evidence_arb split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_idn_evidence_en split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_idn_evidence_idn split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_idn_evidence_arb split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_arb_evidence_en split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_arb_evidence_idn split:   0%|          | 0/200 [00:00<?, ? examples/s]

Generating train_claim_arb_evidence_arb split:   0%|          | 0/200 [00:00<?, ? examples/s]

In [19]:
dataset

Dataset({
    features: ['label', 'text'],
    num_rows: 200
})