In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertModel, BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments


In [6]:
# ---------------------
# Load and prepare data
# ---------------------
# The file is read as JSON Lines (one record per line).
df = pd.read_json('data/TRDataChallenge2023.txt', lines=True)


def _postures_or_empty(value):
    """Return ``value`` unchanged if it is a list, otherwise a new empty list."""
    if isinstance(value, list):
        return value
    return []


# Make sure every row of the label column holds a list; anything that is not
# a list is replaced by [].
df['postures'] = df['postures'].apply(_postures_or_empty)

# Flatten paragraphs to a single string
def flatten_sections(sections):
    """Join every paragraph of every section into one space-separated string.

    Args:
        sections: Iterable of dict-like section records, each optionally
            holding a ``'paragraphs'`` list of strings. ``None`` or a float
            (e.g. the NaN pandas uses for a missing field) is treated as
            "no sections".

    Returns:
        str: All paragraphs, in order, joined by single spaces; the empty
        string when there is nothing to join.
    """
    # A missing `sections` field is not iterable; treat it as an empty document
    # instead of raising TypeError in the loop below.
    if sections is None or isinstance(sections, float):
        return ''
    paragraphs = []
    for sec in sections:
        # `or []` also covers an explicit "paragraphs": null, which
        # `.get('paragraphs', [])` would hand to extend() as None.
        paragraphs.extend(sec.get('paragraphs') or [])
    return ' '.join(paragraphs)

# Build the model input column: one flattened string per document.
df['text'] = df['sections'].map(flatten_sections)
# Bare expression so the notebook displays the new column.
df['text']

0        Plaintiff Dwight Watson (“Husband”) appeals fr...
1        After pleading guilty, William Jerome Howard, ...
2        Frederick Greene, the plaintiff below, derivat...
3        Appeal from an amended judgment of the Supreme...
4        Order, Supreme Court, New York County (Arthur ...
                               ...                        
17995    ¶1 On February 5, 2017, a jury in the Fifth Ju...
17996    On April 17, 2019 the Court held a hearing on ...
17997    A jury convicted Antonio Avila Medrano of Cons...
17998    Defendant Charles York Walker, Jr., appeals fr...
17999    A parent has a fundamental right, protected by...
Name: text, Length: 18000, dtype: object

In [7]:

# ---------------------
# Encode labels
# ---------------------
# Fit the binarizer on every document's posture list. `y` is the resulting
# indicator matrix; its column order is recorded in `mlb.classes_`.
mlb = MultiLabelBinarizer()
all_labels = df['postures'].tolist()
y = mlb.fit_transform(all_labels)

# ---------------------
# Split data
# ---------------------
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation; the fixed seed keeps the
# partition the same across reruns.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), y, test_size=0.2, random_state=42
)



In [8]:
# ---------------------
# Tokenization
# ---------------------
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Both splits use the same settings: truncate at 512 tokens and pad each
# split to its longest sequence.
_encode_opts = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **_encode_opts)
val_encodings = tokenizer(val_texts, **_encode_opts)

# ---------------------
# Dataset class
# ---------------------
class PostureDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-example label vectors.

    The inputs are stored as given; tensors are built per item on access.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the label rows.

        Args:
            encodings: Object exposing ``.items()`` that yields
                ``(field name, indexable of per-example values)`` pairs.
            labels: Sized, indexable collection of per-example label vectors.
        """
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a float ``'labels'`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are cast to float explicitly.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# Wrap each split's encodings and label rows in a PostureDataset.
train_dataset, val_dataset = (
    PostureDataset(train_encodings, train_labels),
    PostureDataset(val_encodings, val_labels),
)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# ---------------------
# Model
# ---------------------
# One output unit per class known to the fitted binarizer.
num_labels = len(mlb.classes_)

# problem_type="multi_label_classification" selects the multi-label loss
# (independent per-label targets rather than a single softmax class).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

# ---------------------
# Save label encoder for inference
# ---------------------
import joblib
from sklearn.metrics import f1_score

# Persist the fitted binarizer BEFORE training. It does not depend on the
# training run, and saving it first means an interrupted or failed run still
# leaves the column -> label-name mapping needed to decode any checkpoint
# written under ./results.
joblib.dump(mlb, 'mlb_postures.pkl')

# ---------------------
# Trainer setup
# ---------------------
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=50,
)


def compute_metrics(eval_pred):
    """Compute multi-label F1 scores for a Trainer evaluation pass.

    Args:
        eval_pred: The (predictions, label_ids) pair the Trainer passes in.
            Predictions are assumed to be raw logits with one column per
            label, as produced by the sequence-classification head.

    Returns:
        dict: ``'f1_micro'`` and ``'f1_macro'`` at a 0.5 probability threshold.
    """
    logits, labels = eval_pred
    # Some models return extra outputs alongside the logits; keep the first.
    if isinstance(logits, tuple):
        logits = logits[0]
    # sigmoid(x) > 0.5 is equivalent to x > 0, so threshold the logits directly.
    preds = (logits > 0).astype(int)
    labels = labels.astype(int)
    return {
        # zero_division=0 scores labels with no positives as 0 without warnings.
        'f1_micro': f1_score(labels, preds, average='micro', zero_division=0),
        'f1_macro': f1_score(labels, preds, average='macro', zero_division=0),
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Without this the evaluation reports only the loss.
    compute_metrics=compute_metrics,
)

# ---------------------
# Train
# ---------------------
trainer.train()

# ---------------------
# Evaluate
# ---------------------
eval_results = trainer.evaluate()
print(eval_results)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 