In [16]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import json
import time
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [13]:
# Define a list of filenames to load
filenames = ["../data/labeled_data/generic_test_0.json"]

# Load all JSON data and concatenate into one DataFrame
dfs = []
for filename in filenames:
    with open(filename) as f:
        data = json.load(f)
    df = pd.DataFrame(data["train"])
    dfs.append(df)
df_all = pd.concat(dfs)

In [14]:
df_all = df_all.reset_index()

In [15]:
class TweetDataset(Dataset):
    def __init__(self, df, y):
        self.input_ids = df['input_ids'].tolist()
        self.attention_mask = df['attention_mask']
        self.labels = y

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attention_mask[idx], self.labels[idx]

In [28]:
# Convert annotations column to a list of labels
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_all['annotations'])

# Load BERTweet model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large", normalization=True)
model = AutoModel.from_pretrained("vinai/bertweet-large")

# Preprocess the text column by encoding the tweets and adding special tokens
def preprocess_text(text):
    return tokenizer.encode_plus(text, max_length=128, add_special_tokens=True, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt')

encoded_tweets = df_all['text'].apply(preprocess_text)
input_ids = encoded_tweets.apply(lambda x: x['input_ids']).to_list()
attention_mask = encoded_tweets.apply(lambda x: x['attention_mask']).to_list()
df_encoded_tweets = pd.DataFrame({'input_ids': input_ids, 'attention_mask': attention_mask})

# Split the data into training and testing sets
encoded_tweets_train, encoded_tweets_test, y_train, y_test = train_test_split(df_encoded_tweets, y, test_size=0.2, random_state=42)

# Define PyTorch dataset and dataloader for the training and testing sets
train_dataset = TweetDataset(encoded_tweets_train.reset_index(), torch.tensor(y_train))
test_dataset = TweetDataset(encoded_tweets_test.reset_index(), torch.tensor(y_test))

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Define the classifier model
class TweetClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TweetClassifier, self).__init__()
        self.bertweet = AutoModel.from_pretrained("vinai/bertweet-large")
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.bertweet.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        output = self.bertweet(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = output.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

# Train the classifier model using binary cross-entropy loss and the AdamW optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TweetClassifier(num_labels=len(mlb.classes_)).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.00001)

# Initialize dataframes for logging the losses and metrics
metrics = pd.DataFrame(columns=['epoch', 'train_loss', 'test_loss', 'precision_macro', 'precision_micro', 'recall_macro', 'recall_micro', 'f1_macro', 'f1_micro', 'accuracy', 'time'])


epochs = 10
scaler = torch.cuda.amp.GradScaler()
for epoch in range(epochs):
    start_time = time.time()
    model.train()
    train_loss = 0
    for batch in train_loader:
        input_ids = batch[0].to(device).squeeze(1)
        attention_mask = batch[1].to(device).squeeze(1)
        labels = batch[2].to(device)
        optimizer.zero_grad()
        
        # Casts operations to mixed precision
        with torch.cuda.amp.autocast():
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.float())
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        train_loss += loss.item()

    train_loss /= len(train_loader)

    # Compute metrics
    y_true = []
    y_pred = []
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch[0].to(device).squeeze(1)
            attention_mask = batch[1].to(device).squeeze(1)
            labels = batch[2].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels.float())
            test_loss += loss.item()
            
            batch_pred = torch.sigmoid(outputs).cpu().detach().numpy()
            labels = labels.cpu().detach().numpy()
            y_true.append(labels)
            y_pred.append(batch_pred)

    test_loss /= len(test_loader)

    y_true = np.vstack(y_true)
    y_pred = np.vstack(y_pred)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(y_true, y_pred > 0.5, average='macro')
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(y_true, y_pred > 0.5, average='micro')
    accuracy = accuracy_score(y_true, y_pred > 0.5)

    elapsed_time = time.time() - start_time
    minutes, seconds = divmod(elapsed_time, 60)
    time_str = f"{int(minutes):02d}:{int(seconds):02d}"

    metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)

    print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss}, Test Loss: {test_loss}, Precision Macro: {precision_macro}, Precision Micro: {precision_micro}, Recall Macro: {recall_macro}, Recall Micro: {recall_micro}, F1 Macro: {f1_macro}, F1 Micro: {f1_micro}, Accuracy: {accuracy}, Time: {time_str}')

Some weights of the model checkpoint at vinai/bertweet-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

Epoch 1/10, Train Loss: 0.2164692996710073, Test Loss: 0.15293932761996984, Precision Macro: 0.4879250176142039, Precision Micro: 0.8409893992932862, Recall Macro: 0.26616858451345393, Recall Micro: 0.544, F1 Macro: 0.3213357119180647, F1 Micro: 0.6606523247744622, Accuracy: 0.475, Time: 02:12


  _warn_prf(average, modifier, msg_start, len(result))
  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 2/10, Train Loss: 0.12726863578427583, Test Loss: 0.12193588002119213, Precision Macro: 0.8103344813242532, Precision Micro: 0.8491379310344828, Recall Macro: 0.46708588661627276, Recall Micro: 0.6754285714285714, F1 Macro: 0.5509046151097298, F1 Micro: 0.7523870146403564, Accuracy: 0.584375, Time: 02:10


  _warn_prf(average, modifier, msg_start, len(result))
  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 3/10, Train Loss: 0.08981120531097986, Test Loss: 0.10956695926142856, Precision Macro: 0.7683050502646296, Precision Micro: 0.8604972375690608, Recall Macro: 0.5786442966645003, Recall Micro: 0.712, F1 Macro: 0.6376842185362077, F1 Micro: 0.7792370231394622, Accuracy: 0.615625, Time: 02:09


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 4/10, Train Loss: 0.06516935581021244, Test Loss: 0.1098371872853022, Precision Macro: 0.7795099011119713, Precision Micro: 0.8282828282828283, Recall Macro: 0.6208066901483922, Recall Micro: 0.7497142857142857, F1 Macro: 0.6744151080818109, F1 Micro: 0.7870425914817037, Accuracy: 0.63125, Time: 02:12


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 5/10, Train Loss: 0.048519801546353845, Test Loss: 0.10636279932223261, Precision Macro: 0.7662071006245125, Precision Micro: 0.8245614035087719, Recall Macro: 0.6357917212226627, Recall Micro: 0.752, F1 Macro: 0.6754346680449225, F1 Micro: 0.7866108786610879, Accuracy: 0.63125, Time: 02:09


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 6/10, Train Loss: 0.03402524955381523, Test Loss: 0.10633800378127489, Precision Macro: 0.7592187049500522, Precision Micro: 0.8111380145278451, Recall Macro: 0.6597960176486956, Recall Micro: 0.7657142857142857, F1 Macro: 0.6945940776849591, F1 Micro: 0.7877718988830099, Accuracy: 0.63125, Time: 02:07


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 7/10, Train Loss: 0.026677896947876433, Test Loss: 0.11455364401626866, Precision Macro: 0.7690008411945205, Precision Micro: 0.8011904761904762, Recall Macro: 0.6408512267569427, Recall Micro: 0.7691428571428571, F1 Macro: 0.6740942998536028, F1 Micro: 0.7848396501457725, Accuracy: 0.6203125, Time: 02:07


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 8/10, Train Loss: 0.020222024329268607, Test Loss: 0.12508483310084556, Precision Macro: 0.7577328093710255, Precision Micro: 0.8114143920595533, Recall Macro: 0.6332493756211833, Recall Micro: 0.7474285714285714, F1 Macro: 0.667265246489623, F1 Micro: 0.778108268887567, Accuracy: 0.625, Time: 02:09


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


Epoch 9/10, Train Loss: 0.019147554544179, Test Loss: 0.12257283843646291, Precision Macro: 0.738763109795947, Precision Micro: 0.7997698504027618, Recall Macro: 0.67241680357684, Recall Micro: 0.7942857142857143, F1 Macro: 0.6924258315791785, F1 Micro: 0.7970183486238532, Accuracy: 0.6390625, Time: 02:09
Epoch 10/10, Train Loss: 0.019440196765572182, Test Loss: 0.1222905451519182, Precision Macro: 0.7500670449227063, Precision Micro: 0.8037383177570093, Recall Macro: 0.6765097267115059, Recall Micro: 0.7862857142857143, F1 Macro: 0.6993002125452678, F1 Micro: 0.7949162333911033, Accuracy: 0.6390625, Time: 02:08


  metrics = metrics.append({'epoch': epoch+1, 'train_loss': train_loss, 'test_loss': test_loss, 'precision_macro': precision_macro, 'precision_micro': precision_micro, 'recall_macro': recall_macro, 'recall_micro': recall_micro, 'f1_macro': f1_macro, 'f1_micro': f1_micro, 'accuracy': accuracy, 'time': time_str}, ignore_index=True)


In [29]:
metrics

Unnamed: 0,epoch,train_loss,test_loss,precision_macro,precision_micro,recall_macro,recall_micro,f1_macro,f1_micro,accuracy,time
0,1,0.216469,0.152939,0.487925,0.840989,0.266169,0.544,0.321336,0.660652,0.475,02:12
1,2,0.127269,0.121936,0.810334,0.849138,0.467086,0.675429,0.550905,0.752387,0.584375,02:10
2,3,0.089811,0.109567,0.768305,0.860497,0.578644,0.712,0.637684,0.779237,0.615625,02:09
3,4,0.065169,0.109837,0.77951,0.828283,0.620807,0.749714,0.674415,0.787043,0.63125,02:12
4,5,0.04852,0.106363,0.766207,0.824561,0.635792,0.752,0.675435,0.786611,0.63125,02:09
5,6,0.034025,0.106338,0.759219,0.811138,0.659796,0.765714,0.694594,0.787772,0.63125,02:07
6,7,0.026678,0.114554,0.769001,0.80119,0.640851,0.769143,0.674094,0.78484,0.620313,02:07
7,8,0.020222,0.125085,0.757733,0.811414,0.633249,0.747429,0.667265,0.778108,0.625,02:09
8,9,0.019148,0.122573,0.738763,0.79977,0.672417,0.794286,0.692426,0.797018,0.639062,02:09
9,10,0.01944,0.122291,0.750067,0.803738,0.67651,0.786286,0.6993,0.794916,0.639062,02:08


In [32]:
folder_path = "../models"
model_name = "bertweet_large_mlb.pt"

import os
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
torch.save(model.state_dict(), os.path.join(folder_path, model_name))