In [1]:
from datasets import load_dataset

# Identifier handed to `load_dataset`, built as '<hf_site_id>/<dataset name>'.
hf_site_id = '2024-mcm-everitt-ryan'
dataset_id = f'{hf_site_id}/job-bias-synthetic-human-benchmark'


# Load the dataset and convert its 'train' split to a pandas DataFrame;
# `df` is the frame the following cell works on.
dataset = load_dataset(dataset_id)
df = dataset['train'].to_pandas()
# Bare last expression so the notebook shows the frame as the cell output.
df

In [2]:
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# `df` is expected to exist already (it is built in the previous cell). The
# commented-out line below is an alternative that reads a local parquet file.
#df = pd.read_parquet('../../../../dataset/final/final-train.parquet')

# Label columns are all DataFrame columns whose name starts with 'label_'.
label_columns = [col for col in df.columns if col.startswith('label_')]

# Convert labels to a comma-separated string
def labels_to_string(row):
    """Render the truthy label columns of one row as a single target string.

    Args:
        row: Object indexable by column name (e.g. a DataFrame row) that has
            an entry for every name in the module-level ``label_columns``.

    Returns:
        The names of the truthy label columns joined with ', ' in
        ``label_columns`` order, or the literal 'none' when no label is set.
    """
    active = []
    for name in label_columns:
        if row[name]:
            active.append(name)
    if not active:
        return 'none'
    return ', '.join(active)

# Add a 'labels' column holding the comma-separated target string for each row.
# NOTE: this mutates `df` in place, so the frame now differs from the one
# displayed by the loading cell.
df['labels'] = df.apply(labels_to_string, axis=1)

# Create a Hugging Face Dataset from only the prompt text and the target
# string; this rebinds `dataset`, replacing the object returned by
# `load_dataset`.
dataset = Dataset.from_pandas(df[['text', 'labels']])

# Tokenizer for the 'google/flan-t5-small' checkpoint (the same checkpoint name
# is used when the model is loaded further down).
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-small')

def preprocess_function(examples):
    """Tokenize a batch of rows into model inputs plus loss-ready label ids.

    Uses the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with a 'text' list (raw documents) and a
            'labels' list (target strings), as passed by
            ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the "classify: ..." prompts with an added
        'labels' entry: one list of target token ids per row, each exactly
        128 long, where every padding position holds -100 instead of the
        pad token id.
    """
    inputs = [f"classify: {text}" for text in examples['text']]
    targets = list(examples['labels'])
    model_inputs = tokenizer(inputs, max_length=512, padding=True, truncation=True)
    # Pad targets to a fixed width rather than to the longest row of this map
    # batch, so every row ends up with the same label length regardless of
    # which batch it was tokenized in.
    target_encoding = tokenizer(targets, max_length=128, padding='max_length', truncation=True)
    pad_id = tokenizer.pad_token_id
    # -100 is the label value the seq2seq loss ignores. Leaving real pad ids
    # here would make padding count toward the loss.
    model_inputs['labels'] = [
        [-100 if token_id == pad_id else token_id for token_id in row]
        for row in target_encoding['input_ids']
    ]
    return model_inputs

# Tokenize the whole dataset; batched=True means preprocess_function receives
# lists of rows rather than one row at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Load model: the seq2seq checkpoint that matches the tokenizer above.
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-small')

# Training arguments: checkpoints go to ./results, evaluation is configured
# per epoch, and training runs for 3 epochs with batch size 8 per device.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed
# version before upgrading.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
)

# Trainer
# Hold out a fixed 10% of the tokenized rows for evaluation so the per-epoch
# eval loss is measured on rows the model was not trained on. The seed keeps
# the split identical across notebook reruns.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Classification function
def classify_text(text, model, tokenizer, label_columns, max_new_tokens=128):
    """Predict which labels apply to ``text`` by generating a label list.

    Args:
        text: Raw document to classify.
        model: Seq2seq model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; used for encoding the prompt
            and decoding the generated ids.
        label_columns: Iterable of every known label name.
        max_new_tokens: Upper bound on generated tokens. The default of 128
            matches the target length used in preprocessing, so an output
            naming several labels is not cut off by the library's much
            shorter default generation limit.

    Returns:
        Dict mapping each name in ``label_columns`` to True when that name
        appears in the comma-separated generated output, else False. Any
        generated item that is not a known label (including 'none') is
        ignored.
    """
    input_text = f"classify: {text}"
    # Same 512-token cap as the training prompts.
    inputs = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)
    # After training the model may sit on an accelerator, while the tokenizer
    # returns CPU tensors. generate() needs both on the same device.
    inputs = inputs.to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predicted = {name.strip() for name in decoded.split(',')}
    return {label: label in predicted for label in label_columns}

# Example text: a placeholder input used to smoke-test the classifier.
text = "This is an example text that needs classification."

# Get the predicted labels as a {label name: bool} dict covering every entry
# in label_columns.
predicted_labels = classify_text(text, model, tokenizer, label_columns)

# Print the predicted labels
print(predicted_labels)
