In [21]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch as nn
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [2]:
df = pd.read_csv("Tweets.csv")
df = df[['text', 'airline_sentiment']].dropna()
df = df[df['airline_sentiment'].isin(['positive', 'neutral', 'negative'])]

In [3]:
label2id = {'negative': 0, 'neutral': 1, 'positive': 2}
id2label = {v: k for k, v in label2id.items()}
df['label'] = df['airline_sentiment'].map(label2id)

dataset = Dataset.from_pandas(df[['text', 'label']])
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 14640/14640 [00:05<00:00, 2549.12 examples/s]


In [4]:
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [5]:

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1
)

In [7]:
def compute_metrics(p):
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    preds = p.predictions.argmax(-1)
    labels = p.label_ids
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [8]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4505,0.436483,0.843921,0.842135,0.841057,0.843921




TrainOutput(global_step=1464, training_loss=0.48575050062169145, metrics={'train_runtime': 336454.079, 'train_samples_per_second': 0.035, 'train_steps_per_second': 0.004, 'total_flos': 3081584348430336.0, 'train_loss': 0.48575050062169145, 'epoch': 1.0})

In [9]:
model.save_pretrained("./modelo_finetunado")
tokenizer.save_pretrained("./modelo_finetunado")

('./modelo_finetunado\\tokenizer_config.json',
 './modelo_finetunado\\special_tokens_map.json',
 './modelo_finetunado\\vocab.txt',
 './modelo_finetunado\\added_tokens.json')

In [13]:
# Avaliação direta no conjunto de teste
metrics = trainer.evaluate(eval_dataset=test_dataset)
metrics




{'eval_loss': 0.43648290634155273,
 'eval_accuracy': 0.8439207650273224,
 'eval_f1': 0.8421350408484252,
 'eval_precision': 0.8410566604770285,
 'eval_recall': 0.8439207650273224,
 'eval_runtime': 1796.9017,
 'eval_samples_per_second': 1.629,
 'eval_steps_per_second': 0.204,
 'epoch': 1.0}

In [15]:
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=1)
y_true = predictions.label_ids

# Relatório
print(classification_report(y_true, y_pred, target_names=["negative", "neutral", "positive"]))



              precision    recall  f1-score   support

    negative       0.90      0.92      0.91      1830
     neutral       0.71      0.67      0.69       621
    positive       0.80      0.76      0.78       477

    accuracy                           0.84      2928
   macro avg       0.80      0.79      0.79      2928
weighted avg       0.84      0.84      0.84      2928



In [None]:
train_labels = train_dataset['label']

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)
class_weights = nn.tensor(class_weights, dtype=nn.float)

class_weights

tensor([0.5313, 1.5755, 2.0700])

In [None]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    outputs = model(**inputs)
    logits = outputs.get("logits")
    loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
    loss = loss_fct(logits, labels)
    return (loss, outputs) if return_outputs else loss

model.compute_loss = compute_loss

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
