In [47]:
import torch 
import pandas as pd 
import xml.etree.ElementTree as ET 
from sklearn.preprocessing import MultiLabelBinarizer 
from sklearn.metrics import f1_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

In [48]:
def parse_semeval_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    
    sentences = []
    labels = []
    
    for sentence_elem in root.findall('sentence'):
        text_elem = sentence_elem.find('text')
        
        if text_elem is not None and text_elem.text and text_elem.text.strip():
            sentences.append(text_elem.text)
        
            aspect_categories = sentence_elem.find('aspectCategories')
            if aspect_categories is not None:
                sentence_labels = [cat.get('category') for cat in aspect_categories.findall('aspectCategory')]
                labels.append(sentence_labels)
            else:
                labels.append([])
        else:
            continue
        
    return sentences, labels

In [49]:
train_path = './datasets/Restaurants_Train.xml'
test_path = './datasets/Restaurants_Test_Data_phaseB.xml'

train_texts, train_labels_str = parse_semeval_xml(train_path)
test_texts, test_labels_str = parse_semeval_xml(test_path)

print(f"Loaded {len(train_texts)} training sentences and {len(test_texts)} test sentences.")
print("Example training sentence:", train_texts[1])
print("Example training labels:", train_labels_str[1])

Loaded 3044 training sentences and 800 test sentences.
Example training sentence: To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.
Example training labels: ['food', 'anecdotes/miscellaneous']


In [50]:
CATEGORIES = ['food', 'service', 'price', 'ambience', 'anecdotes/miscellaneous']

mlb = MultiLabelBinarizer(classes=CATEGORIES)

train_labels = mlb.fit_transform(train_labels_str)
test_labels = mlb.transform(test_labels_str)

print('labels have been binarized')
print(f'example binarized training labels: {train_labels[1]}')
print(f'label mapping: {list(mlb.classes_)}')

labels have been binarized
example binarized training labels: [1 0 0 0 1]
label mapping: ['food', 'service', 'price', 'ambience', 'anecdotes/miscellaneous']


In [51]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

print(train_encodings)

{'input_ids': [[101, 2021, 1996, 3095, 2001, 2061, 9202, 2000, 2149, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 2000, 2022, 3294, 4189, 1010, 1996, 2069, 2417, 21564, 2075, 5387, 2001, 1996, 2833, 1010, 2029, 2001, 2682, 2779, 1010, 2021, 2481, 1005, 1056, 2191, 2039, 2005, 2035, 1996, 2060, 13366, 28227, 1997, 8915, 7716, 6525, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1996, 2833, 2003, 27423, 11813, 1010, 2007, 1037, 2200, 5214, 3829, 2029, 2097, 18067, 11473, 2039, 3649, 2017, 2514, 2066, 5983, 1010, 3251, 2009, 1005, 1055, 2006, 1996, 12183, 2030, 2025, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [52]:
class RestaurantDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

In [53]:
train_dataset = RestaurantDataset(train_encodings, train_labels)
test_dataset = RestaurantDataset(test_encodings, test_labels)

print(train_dataset)

<__main__.RestaurantDataset object at 0x0000017D23887550>


In [54]:
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(CATEGORIES),
    problem_type='multi_label_classification'
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = (pred.predictions > 0).astype(int)
    
    f1 = f1_score(labels, preds, average='micro')
    
    return {'f1': f1}

In [56]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

In [57]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

In [58]:
print('starting model training')
trainer.train()
print('model finished training')

starting model training


Epoch,Training Loss,Validation Loss,F1
1,0.4433,0.310234,0.756447
2,0.2199,0.206561,0.847458
3,0.1435,0.1759,0.883906
4,0.0643,0.157343,0.898102


model finished training


In [59]:
print(f'evaluating final model on the test set')
final_evaluation = trainer.evaluate()
print(f'final evaluation results: {final_evaluation}')

evaluating final model on the test set


final evaluation results: {'eval_loss': 0.1573430895805359, 'eval_f1': 0.8981018981018981, 'eval_runtime': 1.2865, 'eval_samples_per_second': 621.865, 'eval_steps_per_second': 38.867, 'epoch': 4.0}


In [61]:
model_save_path = './model'
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f'model saved to {model_save_path}')

model saved to ./model
