In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv("social_issues_dataset_multitag_large.csv")

df['title'] = df['title'].fillna("")
df['description'] = df['description'].fillna("")
df['text'] = df['title'] + ". " + df['description']

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

mlb = MultiLabelBinarizer()
train_tags = mlb.fit_transform(train_df['tags'].apply(eval))
val_tags = mlb.transform(val_df['tags'].apply(eval))

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch.nn as nn
import torch

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True, max_length=256)

class MultiLabelDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]).float()
        return item

train_dataset = MultiLabelDataset(train_encodings, train_tags)
val_dataset = MultiLabelDataset(val_encodings, val_tags)

# class MultiLabelBert(nn.Module):
#     def __init__(self, num_labels):
#         super().__init__()
#         self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels, problem_type="multi_label_classification")
#         self.sigmoid = nn.Sigmoid()
#     def forward(self, **x):
#         out = self.bert(**x)
#         out.logits = self.sigmoid(out.logits)
#         return out

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
).cuda()

args = TrainingArguments(
    output_dir="./tag_model",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.258524
2,No log,0.193784
3,No log,0.181733


TrainOutput(global_step=255, training_loss=0.26610143324908087, metrics={'train_runtime': 64.5884, 'train_samples_per_second': 62.705, 'train_steps_per_second': 3.948, 'total_flos': 64525108493700.0, 'train_loss': 0.26610143324908087, 'epoch': 3.0})

In [5]:
inputs = tokenizer("Overflowing garbage bins in park", return_tensors="pt").to("cuda")
outputs = model(**inputs)
preds = (outputs.logits > 0.5).int().squeeze().cpu().tolist()
tags = [tag for i, tag in enumerate(mlb.classes_) if preds[i]]
print("Predicted Tags:", tags)

Predicted Tags: ['garbage']


In [6]:
# Save tagging model and tokenizer
model.save_pretrained("./tagging_model")
tokenizer.save_pretrained("./tagging_model")

# Also save your MultiLabelBinarizer (used to inverse the prediction labels)
import joblib
joblib.dump(mlb, "mlb.pkl")  # mlb = MultiLabelBinarizer()


['mlb.pkl']