In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

df = pd.read_csv("social_issues_dataset_multitag_large.csv")

df['title'] = df['title'].fillna("")
df['description'] = df['description'].fillna("")
df['text'] = df['title'] + ". " + df['description']

train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

priority_labels = {"low": 0, "medium": 1, "high": 2}
train_df['priority_label'] = train_df['priority'].map(priority_labels)
val_df['priority_label'] = val_df['priority'].map(priority_labels)

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_df['text'].tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_df['text'].tolist(), truncation=True, padding=True)

class PriorityDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self): return len(self.labels)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = PriorityDataset(train_encodings, train_df['priority_label'].tolist())
val_dataset = PriorityDataset(val_encodings, val_df['priority_label'].tolist())

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3).cuda()

training_args = TrainingArguments(
    output_dir="./priority_model",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.00527
2,No log,0.001925
3,No log,0.001519


TrainOutput(global_step=255, training_loss=0.12280076718797871, metrics={'train_runtime': 36.4378, 'train_samples_per_second': 111.148, 'train_steps_per_second': 6.998, 'total_flos': 32483551751100.0, 'train_loss': 0.12280076718797871, 'epoch': 3.0})

In [3]:
inputs = tokenizer("Pothole on road near school", return_tensors="pt").to("cuda")
outputs = model(**inputs)
pred = torch.argmax(outputs.logits, dim=1).item()
priority = list(priority_labels.keys())[list(priority_labels.values()).index(pred)]
print("Predicted Priority:", priority)

Predicted Priority: medium


In [4]:
model.save_pretrained("./priority_model")
tokenizer.save_pretrained("./priority_model")

('./priority_model\\tokenizer_config.json',
 './priority_model\\special_tokens_map.json',
 './priority_model\\vocab.txt',
 './priority_model\\added_tokens.json',
 './priority_model\\tokenizer.json')