In [8]:
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
)
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict

from tabulate import tabulate
from emoji import demojize
from nltk.tokenize import TweetTokenizer

from sklearn.metrics import classification_report
import json
from tqdm import tqdm

# Your provided classes and functions: MultiLabelDataCollator, TweetDataset, etc.
class MultiLabelDataCollator(DataCollatorWithPadding):
    def __init__(self, tokenizer):
        super().__init__(tokenizer)

    def __call__(self, features: List[Dict[str, torch.Tensor]]):
        batch = super().__call__(features)
        batch["labels"] = torch.stack([feature["label"] for feature in features])
        return batch
        
    @staticmethod
    def loss(logits, labels):
        # Use BCEWithLogitsLoss for multi-label classification
        loss_fct = torch.nn.BCEWithLogitsLoss()
        return loss_fct(logits, labels.float())

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= 0.5)] = 1

    report = classification_report(labels, y_pred, labels=range(len(classes)), output_dict=True)

    metrics = {
        "accuracy": np.mean(predictions == labels),
        "micro_precision": report["micro avg"]["precision"],
        "micro_recall": report["micro avg"]["recall"],
        "micro_f1": report["micro avg"]["f1-score"],
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
    }

    return metrics

class TweetDataset(Dataset):
    def __init__(self, x, y, mlb, tokenizer):
        self.x = x
        self.y = y
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.max_length = 128
        self.encoded_tweets = self.preprocess_text(self.x)
        
    @staticmethod
    def normalizeToken(token):
        lowercased_token = token.lower()
        if token.startswith("@"):
            return "@USER"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        elif len(token) == 1:
            return demojize(token)
        else:
            if token == "’":
                return "'"
            elif token == "…":
                return "..."
            else:
                return token
    
    def normalizeTweet(self, tweet):
        tokens = TweetTokenizer().tokenize(tweet.replace("’", "'").replace("…", "..."))
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
                .replace("n't ", " n't ")
                .replace("n 't ", " n't ")
                .replace("ca n't", "can't")
                .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
                .replace("'re ", " 're ")
                .replace("'s ", " 's ")
                .replace("'ll ", " 'll ")
                .replace("'d ", " 'd ")
                .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
                .replace(" p . m ", " p.m ")
                .replace(" a . m .", " a.m.")
                .replace(" a . m ", " a.m ")
        )
        return " ".join(normTweet.split())

    def preprocess_text(self, X):
        X = [self.normalizeTweet(tweet) for tweet in X]
        
        return self.tokenizer(X, return_attention_mask=True, return_tensors='pt', padding=True, truncation = True, max_length=self.max_length)
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        label = self.y[idx]
        return {'input_ids': self.encoded_tweets['input_ids'][idx],
                'attention_mask': self.encoded_tweets['attention_mask'][idx],
                'label': torch.tensor(label, dtype=torch.float32)}
                #'label_ids': self.labels[idx]}


# Load the fine-tuned BERT model and tokenizer
filename = "../data/labeled_data/generic_test_0.json"
with open(filename) as f:
    data = json.load(f)

train_df = pd.DataFrame(data["train"])
val_df = pd.DataFrame(data["valid"])
test_df = pd.DataFrame(data["test"])
train_size = 100
validation_size = 100
test_size = 100
# train_size argument is used to control the size of the training set 
if train_size != "full":
    train_df = train_df.sample(n=train_size)
if validation_size != "full":
    val_df = val_df.sample(n=validation_size)
if test_size != "full":
    test_df = test_df.sample(n=test_size)

model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-large", num_labels=15, problem_type="multi_label_classification")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

train_annotations = train_df["annotations"].tolist()

# Get all unique classes
classes = set()
for annotation in train_annotations:
    classes.update(annotation)
classes = sorted(list(classes))

# Convert the annotations to binary labels
mlb = MultiLabelBinarizer(classes=classes)
train_labels = mlb.fit_transform(train_df["annotations"])
val_labels = mlb.transform(val_df["annotations"])
test_labels = mlb.transform(test_df["annotations"])

# Initialize the datasets and DataLoader
train_dataset = TweetDataset(train_df['text'].to_list(), train_labels, mlb, tokenizer)
val_dataset = TweetDataset(val_df['text'].to_list(), val_labels, mlb, tokenizer)
test_dataset = TweetDataset(test_df['text'].to_list(), test_labels, mlb, tokenizer)
data_collator = MultiLabelDataCollator(tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=data_collator)

# Function to get BERT embeddings using DataLoader
def get_bert_embeddings(dataloader):
    embeddings = []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing batches"):
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            outputs = model(input_ids, attention_mask=attention_mask)
            sigmoid = torch.nn.Sigmoid()
            probs = sigmoid(torch.Tensor(outputs.logits))
            embeddings.extend(probs.numpy())
    return np.array(embeddings)

# Get the embeddings for the labeled data
train_embeddings = get_bert_embeddings(train_dataloader)
val_embeddings = get_bert_embeddings(val_dataloader)

# Calculate cosine similarity between embeddings
cosine_sim_matrix = cosine_similarity(val_embeddings, train_embeddings)

# Find the most similar labeled tweets and assign labels
predicted_labels = []
for i in range(len(val_df)):
    most_similar_index = np.argmax(cosine_sim_matrix[i])
    predicted_labels.append(train_labels[most_similar_index])

# Calculate accuracy on the validation set
val_accuracy = np.sum(np.array(predicted_labels) == val_labels) / len(val_labels)
print(f"Validation accuracy: {val_accuracy:.2f}")

No sentence-transformers model found with name /home/bruno/.var/app/com.visualstudio.code/cache/torch/sentence_transformers/vinai_bertweet-large. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/bruno/.var/app/com.visualstudio.code/cache/torch/sentence_transformers/vinai_bertweet-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Proc

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'sentence_transformers.readers.InputExample.InputExample'>

In [18]:
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
)
from sentence_transformers import SentenceTransformer, InputExample, losses

import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from typing import List, Dict

from tabulate import tabulate
from emoji import demojize
from nltk.tokenize import TweetTokenizer

from sklearn.metrics import classification_report
import json
from tqdm import tqdm

# Your provided classes and functions: MultiLabelDataCollator, TweetDataset, etc.
class MultiLabelDataCollator(DataCollatorWithPadding):
    def __init__(self, tokenizer):
        super().__init__(tokenizer)

    def __call__(self, features: List[Dict[str, torch.Tensor]]):
        batch = super().__call__(features)
        batch["labels"] = torch.stack([feature["label"] for feature in features])
        return batch
        
    @staticmethod
    def loss(logits, labels):
        # Use BCEWithLogitsLoss for multi-label classification
        loss_fct = torch.nn.BCEWithLogitsLoss()
        return loss_fct(logits, labels.float())

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= 0.5)] = 1

    report = classification_report(labels, y_pred, labels=range(len(classes)), output_dict=True)

    metrics = {
        "accuracy": np.mean(predictions == labels),
        "micro_precision": report["micro avg"]["precision"],
        "micro_recall": report["micro avg"]["recall"],
        "micro_f1": report["micro avg"]["f1-score"],
        "macro_precision": report["macro avg"]["precision"],
        "macro_recall": report["macro avg"]["recall"],
        "macro_f1": report["macro avg"]["f1-score"],
    }

    return metrics

class TweetDataset(Dataset):
    def __init__(self, x, y, mlb, tokenizer):
        self.x = x
        self.y = y
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.max_length = 128
        self.encoded_tweets = self.preprocess_text(self.x)
        
    @staticmethod
    def normalizeToken(token):
        lowercased_token = token.lower()
        if token.startswith("@"):
            return "@USER"
        elif lowercased_token.startswith("http") or lowercased_token.startswith("www"):
            return "HTTPURL"
        elif len(token) == 1:
            return demojize(token)
        else:
            if token == "’":
                return "'"
            elif token == "…":
                return "..."
            else:
                return token
    
    def normalizeTweet(self, tweet):
        tokens = TweetTokenizer().tokenize(tweet.replace("’", "'").replace("…", "..."))
        normTweet = " ".join([self.normalizeToken(token) for token in tokens])

        normTweet = (
            normTweet.replace("cannot ", "can not ")
                .replace("n't ", " n't ")
                .replace("n 't ", " n't ")
                .replace("ca n't", "can't")
                .replace("ai n't", "ain't")
        )
        normTweet = (
            normTweet.replace("'m ", " 'm ")
                .replace("'re ", " 're ")
                .replace("'s ", " 's ")
                .replace("'ll ", " 'll ")
                .replace("'d ", " 'd ")
                .replace("'ve ", " 've ")
        )
        normTweet = (
            normTweet.replace(" p . m .", "  p.m.")
                .replace(" p . m ", " p.m ")
                .replace(" a . m .", " a.m.")
                .replace(" a . m ", " a.m ")
        )
        return " ".join(normTweet.split())

    def preprocess_text(self, X):
        X = [self.normalizeTweet(tweet) for tweet in X]
        
        return self.tokenizer(X, return_attention_mask=True, return_tensors='pt', padding=True, truncation = True, max_length=self.max_length)
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        label = self.y[idx]
        return {'input_ids': self.encoded_tweets['input_ids'][idx],
                'attention_mask': self.encoded_tweets['attention_mask'][idx],
                'label': torch.tensor(label, dtype=torch.float32)}
                #'label_ids': self.labels[idx]}


# Load the fine-tuned BERT model and tokenizer
filename = "../data/labeled_data/generic_test_0.json"
with open(filename) as f:
    data = json.load(f)

train_df = pd.DataFrame(data["train"])
val_df = pd.DataFrame(data["valid"])
test_df = pd.DataFrame(data["test"])
train_size = "full"
validation_size = "full"
test_size = "full"
# train_size argument is used to control the size of the training set 
if train_size != "full":
    train_df = train_df.sample(n=train_size)
if validation_size != "full":
    val_df = val_df.sample(n=validation_size)
if test_size != "full":
    test_df = test_df.sample(n=test_size)

model = SentenceTransformer("vinai/bertweet-large")
#tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

train_annotations = train_df["annotations"].tolist()

# Get all unique classes
classes = set()
for annotation in train_annotations:
    classes.update(annotation)
classes = sorted(list(classes))

# Convert the annotations to binary labels
mlb = MultiLabelBinarizer(classes=classes)
train_labels = mlb.fit_transform(train_df["annotations"])
val_labels = mlb.transform(val_df["annotations"])
test_labels = mlb.transform(test_df["annotations"])

train_examples = []

# Iterate through each unique label
for label_idx in range(len(classes)):
    # Create a list of indices for the tweets containing the current label
    label_indices = [idx for idx, labels in enumerate(train_labels) if labels[label_idx] == 1]

    # Iterate through the tweet indices with the current label
    for i in range(len(label_indices)):
        anchor_idx = label_indices[i]
        anchor_text = train_df['text'].iloc[anchor_idx]
        anchor_label = train_labels[anchor_idx]

        # Search for a positive example with the same label
        for j in range(len(label_indices)):
            if i == j:
                continue

            positive_idx = label_indices[j]
            positive_text = train_df['text'].iloc[positive_idx]
            positive_label = train_labels[positive_idx]

            # Check if the anchor and positive texts share the current label
            if anchor_label[label_idx] == positive_label[label_idx]:
                train_examples.append(InputExample(texts=[anchor_text, positive_text], label=1.0))
                break

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)

#train_dataloader = DataLoader(train_dataset, batch_size=4, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=data_collator)

loss_function = losses.CosineSimilarityLoss(model)
# loss_function = losses.TripletLoss(model)

model.fit(train_objectives=[(train_dataloader, loss_function)], epochs=5, warmup_steps=100)

val_embeddings = model.encode(val_df['text'].to_list(), convert_to_numpy=True, show_progress_bar=True)
train_embeddings = model.encode(train_df['text'].to_list(), convert_to_numpy=True, show_progress_bar=True)

cosine_sim_matrix = cosine_similarity(val_embeddings, train_embeddings)

threshold = 0.5
predicted_labels = []

for i in range(len(val_df)):
    label_indices = np.where(cosine_sim_matrix[i] >= threshold)[0]
    label_set = set()
    for idx in label_indices:
        label_set.update(train_df['annotations'].iloc[idx])
    predicted_labels.append(list(label_set))
predicted_bin_labels = mlb.transform(predicted_labels)
report = classification_report(val_labels, predicted_bin_labels, target_names=classes, labels=range(len(classes)), output_dict=True, zero_division=0)

No sentence-transformers model found with name /home/bruno/.var/app/com.visualstudio.code/cache/torch/sentence_transformers/vinai_bertweet-large. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/bruno/.var/app/com.visualstudio.code/cache/torch/sentence_transformers/vinai_bertweet-large were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/540 [00:00<?, ?it/s]

Iteration:   0%|          | 0/540 [00:00<?, ?it/s]

Iteration:   0%|          | 0/540 [00:00<?, ?it/s]

Iteration:   0%|          | 0/540 [00:00<?, ?it/s]

Iteration:   0%|          | 0/540 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
def average_report_to_dataframe(average_report):
    data = {
        "precision": [],
        "recall": [],
        "f1-score": [],
        "support": []
    }
    index = []

    for class_name, metrics in average_report.items():
        if class_name == 'accuracy':
            continue

        index.append(class_name)
        data["precision"].append(metrics["precision"])
        data["recall"].append(metrics["recall"])
        data["f1-score"].append(metrics["f1-score"])
        data["support"].append(metrics["support"])

    return pd.DataFrame(data, index=index)

average_report_to_dataframe(report)

Unnamed: 0,precision,recall,f1-score,support
Conspiracy Theory,0.05875,1.0,0.11098,47
Education,0.015,1.0,0.029557,12
Election Campaign,0.0325,1.0,0.062954,26
Environment,0.0175,1.0,0.034398,14
Government/Public,0.315,1.0,0.479087,252
Health,0.05625,1.0,0.106509,45
Immigration/Integration,0.05,1.0,0.095238,40
Justice/Crime,0.13375,1.0,0.235943,107
Labor/Employment,0.02375,1.0,0.046398,19
Macroeconomics/Economic Regulation,0.0725,1.0,0.135198,58


In [19]:
def average_report_to_dataframe(average_report):
    data = {
        "precision": [],
        "recall": [],
        "f1-score": [],
        "support": []
    }
    index = []

    for class_name, metrics in average_report.items():
        if class_name == 'accuracy':
            continue

        index.append(class_name)
        data["precision"].append(metrics["precision"])
        data["recall"].append(metrics["recall"])
        data["f1-score"].append(metrics["f1-score"])
        data["support"].append(metrics["support"])

    return pd.DataFrame(data, index=index)

average_report_to_dataframe(report)

Unnamed: 0,precision,recall,f1-score,support
Conspiracy Theory,0.05875,1.0,0.11098,47
Education,0.015,1.0,0.029557,12
Election Campaign,0.0325,1.0,0.062954,26
Environment,0.0175,1.0,0.034398,14
Government/Public,0.315,1.0,0.479087,252
Health,0.05625,1.0,0.106509,45
Immigration/Integration,0.05,1.0,0.095238,40
Justice/Crime,0.13375,1.0,0.235943,107
Labor/Employment,0.02375,1.0,0.046398,19
Macroeconomics/Economic Regulation,0.0725,1.0,0.135198,58


In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from transformers import AutoModel

class SiameseTextDataset(Dataset):
    def __init__(self, examples, tokenizer, max_length=128):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        example = self.examples[idx]
        text1, text2 = example.texts
        label = example.label

        encoding1 = self.tokenizer(text1, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        encoding2 = self.tokenizer(text2, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)

        return {
            'input_ids1': encoding1['input_ids'].squeeze(0),
            'attention_mask1': encoding1['attention_mask'].squeeze(0),
            'input_ids2': encoding2['input_ids'].squeeze(0),
            'attention_mask2': encoding2['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)
        }

# Instantiate the tokenizer
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

# Create the SiameseTextDataset
siamese_train_dataset = SiameseTextDataset(train_examples, tokenizer)

# Create the DataLoader
train_dataloader = DataLoader(siamese_train_dataset, batch_size=16, shuffle=True)


class SiameseBERT(nn.Module):
    def __init__(self, model_name):
        super(SiameseBERT, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        output1 = self.bert(input_ids=input_ids1, attention_mask=attention_mask1)
        output2 = self.bert(input_ids=input_ids2, attention_mask=attention_mask2)
        return output1, output2

class ContrastiveLoss(nn.Module):
    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, labels):
        distances = 1 - torch.nn.functional.cosine_similarity(output1, output2)
        loss = (labels * distances.pow(2) + (1 - labels) * torch.clamp(self.margin - distances, min=0).pow(2)).mean()
        return loss
    

train_examples = []

# Instantiate the SiameseBERT model and the ContrastiveLoss
siamese_model = SiameseBERT("vinai/bertweet-large")
contrastive_loss = ContrastiveLoss()

# Set up the training loop
optimizer = torch.optim.Adam(siamese_model.parameters(), lr=5e-5)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
siamese_model = siamese_model.to(device)
num_epochs = 3

for epoch in range(num_epochs):
    siamese_model.train()
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids1, attention_mask1 = batch['input_ids1'].to(device), batch['attention_mask1'].to(device)
        input_ids2, attention_mask2 = batch['input_ids2'].to(device), batch['attention_mask2'].to(device)
        labels = batch['label'].to(device)

        output1, output2 = siamese_model(input_ids1, attention_mask1, input_ids2, attention_mask2)
        loss = contrastive_loss(output1.pooler_output, output2.pooler_output, labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch: {epoch + 1}, Loss: {total_loss / len(train_dataloader)}")


NameError: name 'train_examples' is not defined

In [None]:
for i in range(len(val_df)):
    label_indices = np.where(cosine_sim_matrix[i] >= threshold)[0]
    label_set = set()
    for idx in label_indices:
        label_set.update(train_df['annotations'].iloc[idx])
    predicted_labels.append(list(label_set))