@inproceedings{swayamdipta2020dataset,
    title={Dataset Cartography: Mapping and Diagnosing Datasets with Training Dynamics},
    author={Swabha Swayamdipta and Roy Schwartz and Nicholas Lourie and Yizhong Wang and Hannaneh Hajishirzi and Noah A. Smith and Yejin Choi},
    booktitle={Proceedings of EMNLP},
    url={https://arxiv.org/abs/2009.10795},
    year={2020}
}

In [2]:
# Dependencies: Transformers, pandas, and other required libraries are assumed to be pre-installed.

import numpy as np
import pandas as pd
import sklearn
import os
import json
import uuid
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split



In [7]:
# File paths
data_path = 'data/Measuring Hate Speech.csv'
# Every run writes into its own directory, suffixed with the first 8 characters of a random UUID.
output_dir = f"cartography_output_{str(uuid.uuid4())[:8]}"
os.makedirs(output_dir, exist_ok=True)

# Modify these to match the columns in your dataset
text_column = 'text'
label_column = 'hatespeech'

# Load dataset
# Only the two columns used downstream are parsed; the remaining CSV columns are never
# materialized. read_csv raises ValueError if either column is missing from the file.
df = pd.read_csv(data_path, usecols=[text_column, label_column])
# usecols keeps the file's column order, so re-select to pin the (text, label) order.
# .copy() makes `df` an independent frame so that adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df = df[[text_column, label_column]].copy()

In [8]:
# Preprocessing
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Encode every text into a list of token ids. truncation/padding are requested without an
# explicit max_length, so the tokenizer's own maximum length is what applies
# (NOTE(review): confirm the resulting sequence length is what the training script expects).
# assign() builds a new frame rather than writing a column into `df` in place; `df` is a
# column subset of the loaded CSV, and in-place column assignment on such a frame raises
# pandas' SettingWithCopyWarning.
df = df.assign(
    input_ids=df[text_column].apply(
        lambda text: tokenizer.encode(text, truncation=True, padding='max_length')
    )
)

# Split dataset into training and validation sets
# 80/20 split; the fixed random_state makes the partition repeatable across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Save split datasets to JSONL format for cartography training
def save_to_jsonl(df, file_name, columns=None):
    """Write selected columns of ``df`` to ``file_name`` as JSON Lines.

    Each row becomes one JSON object on its own line, keyed by column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to export.
    file_name : str or path-like
        Destination file; created or overwritten.
    columns : list of str, optional
        Columns to export, in output key order. When omitted, the export is
        ``['input_ids', label_column]`` (``label_column`` being the notebook-level
        setting), i.e. exactly what the two-argument call has always written.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.
    TypeError
        If a cell holds a value that is neither JSON-native nor a NumPy scalar/array.
    """
    if columns is None:
        # Resolved at call time so a later change to label_column is honoured.
        columns = ['input_ids', label_column]

    def _to_native(value):
        # json only understands built-in types. NumPy scalars and arrays (e.g. np.int64
        # token ids or labels) expose tolist(), which yields the equivalent Python object.
        if hasattr(value, 'tolist'):
            return value.tolist()
        raise TypeError(
            f'Object of type {type(value).__name__} is not JSON serializable'
        )

    records = df[list(columns)].to_dict(orient='records')
    with open(file_name, 'w', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, default=_to_native) + '\n')

# Export each split to its own JSONL file inside the run's output directory
# (training split first, then validation).
split_outputs = {
    f'{output_dir}/train.jsonl': train_df,
    f'{output_dir}/val.jsonl': val_df,
}
for jsonl_path, split_df in split_outputs.items():
    save_to_jsonl(split_df, jsonl_path)



In [9]:
# Config file for cartography model
# The settings are grouped by concern and merged in this order, because key order
# determines the layout of the JSON that gets written.
run_settings = {
    "data_dir": output_dir,
    "model_type": "roberta",
    "model_name_or_path": "roberta-base",
    "task_name": "hate_speech",
    "seed": 42,
}
training_settings = {
    "num_train_epochs": 3,
    "learning_rate": 2e-5,
    "features_cache_dir": f"{output_dir}/cache",
    "per_gpu_train_batch_size": 16,
}
config = {**run_settings, **training_settings}

# Save config file
# Written next to the exported splits, as compact single-line JSON.
with open(f'{output_dir}/config.json', 'w') as f:
    f.write(json.dumps(config))

In [14]:
#!git clone https://github.com/CapstoneProject33/cartography/

# Train model using cartography
!python -m cartography.cartography.classification.run_glue -c {output_dir}/config.json --do_train --do_eval -o {output_dir}

# Plotting training dynamics using cartography
!python -m cartography.cartography.selection.train_dy_filtering --plot --task_name "hate_speech" --model_dir {output_dir} --model "roberta-base"

Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/knify/Documents/401 Capstone Measuring Hate Speech/cartography/cartography/classification/run_glue.py", line 46, in <module>
    from cartography.classification.glue_utils import adapted_glue_compute_metrics as compute_metrics
ModuleNotFoundError: No module named 'cartography.classification'
Traceback (most recent call last):
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Library/Developer/CommandLineTools/Library/Fra