In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy multi-label dataset: every row pairs a list of integer class ids ("lbl")
# with a short Vietnamese sentence ("text").
df = pd.DataFrame(
    data=dict(
        lbl=[[1], [1, 0], [0], [4], [3], [3, 0]],
        text=[
            'Đây là bệnh về da',
            'Da thâm nổi mụm và tiền ung thư',
            'Nhóm bệnh ung thư về sắc tố',
            'Cơ xương có thể không phát triển',
            'Gan nhiễm mỡ và có dấu hiệu xơ gan',
            'Xơ gan giai đoạn tiền phát nguy cơ gây ung thư gan',
        ],
    )
)
df

In [None]:
def binary_cvt(labels, max_val=4):
    '''
    Convert a collection of class indices into a multi-hot list.

    Args:
        labels: Iterable of integer class indices, each expected in [0, max_val].
        max_val (int): Largest valid class index; the result has max_val + 1 slots.

    Returns:
        list[int]: List of length max_val + 1 holding 1 at every index listed in
        `labels` and 0 everywhere else.

    Raises:
        IndexError: If a label lies outside [0, max_val].
    '''
    multi_hot = [0] * (max_val + 1)
    for label in labels:
        # Plain list indexing would accept negative labels and silently set a
        # slot counted from the end; reject them with the same exception type
        # that an oversized label already triggers.
        if not 0 <= label <= max_val:
            raise IndexError(
                f"label {label} is outside the valid range [0, {max_val}]"
            )
        multi_hot[label] = 1
    return multi_hot


# Add a multi-hot encoding of each row's label list as a new column
# (binary_cvt is called with its default max_val).
df['binary_lbl'] = df['lbl'].apply(binary_cvt)
df.head()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossEntropyLossMultiLabel(nn.Module):
    '''
    Binary cross-entropy loss for multi-label classification.

    Every label is scored as an independent binary decision: the loss is the
    sigmoid + binary cross-entropy of each logit against its target, averaged
    over all (sample, label) entries. The computation goes through
    F.binary_cross_entropy_with_logits, which fuses the sigmoid into the loss
    so that large-magnitude logits do not saturate.

    The module holds no parameters and no configuration.
    '''

    def __init__(self):
        super(CrossEntropyLossMultiLabel, self).__init__()

    def forward(self, logits, labels):
        '''
        Forward pass for the multi-label binary cross-entropy loss.

        Args:
            logits (torch.Tensor): Raw (pre-sigmoid) scores predicted by the model.
            labels (torch.Tensor): Targets with the same number of elements as
                `logits`; cast to the dtype of `logits` before use.

        Returns:
            torch.Tensor: Scalar mean binary cross-entropy over all elements.
        '''
        # reshape (not view) so non-contiguous inputs, e.g. transposed logits, work too.
        logits_flat = logits.reshape(-1)
        labels_flat = labels.reshape(-1).to(dtype=logits_flat.dtype)

        # Do NOT apply torch.sigmoid first: the fused op works on raw logits and
        # stays finite and differentiable where sigmoid() would round to 0 or 1.
        return F.binary_cross_entropy_with_logits(logits_flat, labels_flat)
    
class FocalLossMultiLabel(nn.Module):
    '''
    Focal loss for multi-label classification.

    Each label is scored with element-wise binary cross-entropy on the raw
    logits; every term is then scaled by (1 - p_t) ** gamma, where
    p_t = exp(-bce), so confidently-correct entries contribute less.

    Attributes:
        gamma (float): Exponent of the modulating factor.
    '''

    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma

    def forward(self, outputs, labels):
        '''
        Compute the mean focal loss.

        Args:
            outputs (torch.Tensor): Raw (pre-sigmoid) model outputs.
            labels (torch.Tensor): Targets, passed unchanged to the BCE call.

        Returns:
            torch.Tensor: Scalar mean of the modulated per-element losses.
        '''
        per_element_bce = F.binary_cross_entropy_with_logits(
            outputs, labels, reduction='none'
        )
        prob_of_target = torch.exp(-per_element_bce)
        modulator = (1 - prob_of_target) ** self.gamma
        return (modulator * per_element_bce).mean()
    
class FocalLossWithBatchNormL2MultiLabel(nn.Module):
    '''
    Focal loss plus an L2 penalty on BatchNorm scale weights (multi-label).

    The focal term is element-wise BCE-with-logits scaled by
    (1 - exp(-bce)) ** gamma and averaged. The penalty is the sum of squared
    `weight` entries of every BatchNorm1d/2d/3d layer found in the network
    passed as `model`. A loss module has no BatchNorm layers of its own, so
    without `model` the penalty is zero unless BatchNorm modules were attached
    to this loss instance.

    Attributes:
        gamma (float): Exponent of the focal modulating factor.
        beta (float): Coefficient applied to the BatchNorm L2 penalty.
    '''

    # Layer types whose affine `weight` is penalized.
    _BATCH_NORM_TYPES = (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)

    def __init__(self, gamma=2.0, beta=1e-4, model=None):
        '''
        Args:
            gamma (float): Exponent of the focal modulating factor.
            beta (float): Coefficient applied to the BatchNorm L2 penalty.
            model (nn.Module, optional): Network whose BatchNorm layers are
                penalized. When None, this module's own submodules are scanned.
        '''
        super(FocalLossWithBatchNormL2MultiLabel, self).__init__()
        self.gamma = gamma
        self.beta = beta
        # Bypass nn.Module.__setattr__ so the network is only referenced, not
        # registered as a submodule: the loss's parameters()/state_dict() stay
        # free of the model's weights.
        object.__setattr__(self, '_penalized_model', model)

    def forward(self, outputs, labels):
        '''
        Forward pass for focal loss with the BatchNorm L2 penalty.

        Args:
            outputs (torch.Tensor): Raw (pre-sigmoid) outputs from the model.
            labels (torch.Tensor): Targets, passed unchanged to the BCE call.

        Returns:
            torch.Tensor: Mean focal loss plus beta times the BatchNorm L2 penalty.
        '''
        ce_loss = F.binary_cross_entropy_with_logits(outputs, labels, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = (1 - pt) ** self.gamma * ce_loss
        return loss.mean() + self.beta * self.batch_norm_l2_penalty()

    def batch_norm_l2_penalty(self):
        '''
        Compute the BatchNorm L2 penalty.

        Returns:
            torch.Tensor: Scalar sum of squared BatchNorm weights; a zero scalar
            when no BatchNorm layer with an affine weight is found.
        '''
        source = self if self._penalized_model is None else self._penalized_model
        # Build the terms out-of-place: an in-place "+=" on a leaf tensor created
        # with requires_grad=True raises as soon as one layer is accumulated.
        terms = [
            (module.weight ** 2).sum()
            for module in source.modules()
            # affine=False layers have weight None and contribute nothing.
            if isinstance(module, self._BATCH_NORM_TYPES) and module.weight is not None
        ]
        if not terms:
            return torch.tensor(0.0)
        return torch.stack(terms).sum()
    
class LabelSmoothingLossMultiLabel(nn.Module):
    '''
    Label-smoothed binary cross-entropy for multi-label classification.

    Hard 0/1 targets are pulled toward 0.5 before the per-label binary
    cross-entropy is computed: y_smooth = (1 - smoothing) * y + smoothing / 2.
    The per-element losses are summed and divided by the batch size.

    Attributes:
        smoothing (float): Smoothing factor for label smoothing.
    '''

    def __init__(self, smoothing=0.1):
        super(LabelSmoothingLossMultiLabel, self).__init__()
        self.smoothing = smoothing

    def forward(self, outputs, labels):
        '''
        Forward pass for the label smoothing loss.

        Args:
            outputs (torch.Tensor): Raw (pre-sigmoid) logits predicted by the model.
            labels (torch.Tensor): True labels, broadcastable against `outputs`.

        Returns:
            torch.Tensor: Summed smoothed BCE divided by outputs.size(0).
        '''
        smooth_labels = (1.0 - self.smoothing) * labels + self.smoothing / 2.0

        # log(sigmoid(x)) and log(1 - sigmoid(x)) = log(sigmoid(-x)), computed
        # with logsigmoid so saturated logits give a large finite value instead
        # of log(0) = -inf.
        log_prob_pos = F.logsigmoid(outputs)
        log_prob_neg = F.logsigmoid(-outputs)

        loss = -torch.sum(smooth_labels * log_prob_pos + (1.0 - smooth_labels) * log_prob_neg)
        return loss / outputs.size(0)  # Normalize by batch size

In [None]:
# Read the CSV at the relative path below into `df` (this replaces any
# earlier `df`) and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='../Medical-Abstracts-TC-Corpus/preprocessed-medical_tc_train.csv'
)
df.head()

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from torch.nn import BCEWithLogitsLoss  # For multi-label classification


In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

# Prepare the data: the abstracts as a Python list and the five label columns
# as one 2-D tensor (one row per abstract, one column per label).
texts = df['medical_abstract'].tolist()
labels = torch.tensor(
    df.loc[:, ['neoplasms', 'digestive', 'nervous', 'cardiovascular', 'general']].values
)

# Tokenize every text in a single call, padding and truncating into tensors.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_texts = tokenizer(
    texts,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Build the dataset
class CustomDataset(Dataset):
    '''
    Dataset pairing pre-tokenized inputs with their labels.

    Attributes:
        tokenized_texts: Mapping indexed by 'input_ids' and 'attention_mask',
            each value indexable by sample position.
        labels: Sequence of per-sample labels; its length is the dataset size.
    '''

    def __init__(self, tokenized_texts, labels):
        self.tokenized_texts = tokenized_texts
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Pick the idx-th entry of each encoding field, then attach the label.
        encoding = self.tokenized_texts
        sample = {
            field: encoding[field][idx]
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = self.labels[idx]
        return sample

dataset = CustomDataset(tokenized_texts, labels)

# Build the DataLoader
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# BERT with a 5-output classification head (one logit per label column)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Loss function (alternatives kept for quick switching)
# criterion = nn.BCEWithLogitsLoss()
criterion = CrossEntropyLossMultiLabel()
# criterion = nn.CrossEntropyLoss()

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 5
# from_pretrained() returns the model in evaluation mode; switch to training
# mode so dropout is active while fine-tuning.
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    for batch in dataloader:
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Separate name so the full `labels` tensor above is not overwritten.
        batch_labels = batch['labels']

        optimizer.zero_grad()

        # Forward pass: logits from the model
        outputs = model(inputs, attention_mask=attention_mask)
        logits = outputs.logits

        # Loss on float targets (the label tensor is built from integer columns)
        loss = criterion(logits, batch_labels.float())

        # Backpropagation and parameter update
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # One summary line per epoch instead of dumping logits and loss every step.
    print(f"epoch {epoch + 1}/{num_epochs} - mean loss: {epoch_loss / max(len(dataloader), 1):.4f}")

# Model evaluation (analogous to the training loop)
# ...


In [None]:
import numpy as np

def _binary_confusion_metrics(y_true, y_pred):
    """Return (precision, recall, f1, accuracy) for two aligned 0/1 arrays."""
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    tn = np.sum((y_true == 0) & (y_pred == 0))

    # Every ratio falls back to 0 when its denominator is empty.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total > 0 else 0.0
    return precision, recall, f1, accuracy


def calculate_metrics(true_labels, predicted_labels, threshold=0.5, report_methods=('micro', 'macro')):
    """Compute micro- and/or macro-averaged precision, recall, F1 and accuracy.

    Args:
        true_labels: Array-like of 0/1 ground-truth labels. Macro averaging
            indexes columns, so it needs a 2-D (samples x classes) layout.
        predicted_labels: Array-like of scores with the same layout; entries
            strictly greater than `threshold` count as positive predictions.
        threshold (float): Decision threshold applied to `predicted_labels`.
        report_methods: Container tested with `in` for 'micro' and 'macro'.
            A plain string also works and is matched by substring.

    Returns:
        dict: Keys 'precision', 'recall', 'f1_score', 'accuracy' when 'micro'
        is requested, and the same names suffixed with '_macro' when 'macro'
        is requested.
    """
    # Accept lists and tensors as well as ndarrays.
    true_labels = np.asarray(true_labels)
    # Convert scores to binary predictions based on the threshold.
    binary_predictions = (np.asarray(predicted_labels) > threshold).astype(int)

    metrics = {}

    if 'micro' in report_methods:
        # Micro-averaging: pool every (sample, class) entry into one confusion matrix.
        precision, recall, f1_score, accuracy = _binary_confusion_metrics(
            true_labels, binary_predictions
        )
        metrics['precision'] = precision
        metrics['recall'] = recall
        metrics['f1_score'] = f1_score
        metrics['accuracy'] = accuracy

    if 'macro' in report_methods:
        # Macro-averaging: score each class column on its own, then take the plain mean.
        num_classes = true_labels.shape[1]
        per_class = [
            _binary_confusion_metrics(true_labels[:, class_idx], binary_predictions[:, class_idx])
            for class_idx in range(num_classes)
        ]
        if num_classes:
            macro = [sum(column) / num_classes for column in zip(*per_class)]
        else:
            # No class columns: report zeros instead of dividing by zero.
            macro = [0.0, 0.0, 0.0, 0.0]

        metrics['precision_macro'] = macro[0]
        metrics['recall_macro'] = macro[1]
        metrics['f1_score_macro'] = macro[2]
        metrics['accuracy_macro'] = macro[3]

    return metrics

In [11]:
import torch

# Sample true labels (8 samples x 5 classes)
true_labels = torch.tensor([[0., 1., 0., 0., 0.],
                            [0., 0., 0., 1., 0.],
                            [0., 0., 0., 1., 1.],
                            [0., 0., 1., 1., 0.],
                            [1., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 1.],
                            [0., 0., 0., 0., 1.],
                            [0., 0., 0., 1., 0.]])

# Sample predicted labels (already binary)
predicted_labels = torch.tensor([[0., 1., 0., 0., 0.],
                                 [1., 0., 0., 1., 0.],
                                 [0., 0., 0., 1., 1.],
                                 [0., 0., 1., 1., 0.],
                                 [1., 0., 0., 0., 0.],
                                 [0., 1., 0., 0., 1.],
                                 [1., 1., 1., 1., 1.],
                                 [0., 0., 0., 1., 0.]])

# Convert to NumPy arrays. Despite the "_flat" names nothing is flattened:
# the 2-D layout is kept because macro averaging indexes class columns.
true_labels_flat = true_labels.numpy()
predicted_labels_flat = predicted_labels.numpy()

# Only the macro-averaged metrics are requested, so name and label them as such.
macro_metrics = calculate_metrics(true_labels_flat, predicted_labels_flat, report_methods=['macro'])

print("Macro metrics:", macro_metrics)
# Reference values for this sample:
#   micro: {'precision': 0.625, 'recall': 1.0, 'f1_score': 0.7692307692307693, 'accuracy': 0.85}
#   macro: {'precision_macro': 0.5933333333333334, 'recall_macro': 1.0,
#           'f1_score_macro': 0.711111111111111, 'accuracy_macro': 0.85}

Micro F1 Score: {'precision_macro': 0.5933333333333334, 'recall_macro': 1.0, 'f1_score_macro': 0.711111111111111, 'accuracy_macro': 0.85}


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np
import torch

# Toy example: two mini-batches of multi-hot ground truth and the matching
# predicted probabilities.
true_labels_batch1 = torch.tensor([
    [0., 1., 0., 0., 0.],
    [0., 0., 0., 1., 0.],
])
true_labels_batch2 = torch.tensor([
    [0., 0., 0., 1., 1.],
    [0., 0., 1., 1., 0.],
])

predicted_probs_batch1 = torch.tensor([
    [0.1, 0.9, 0.3, 0.6, 0.7],
    [0.8, 0.2, 0.4, 0.7, 0.1],
])
predicted_probs_batch2 = torch.tensor([
    [0.2, 0.4, 0.5, 0.6, 0.9],
    [0.3, 0.7, 0.8, 0.2, 0.6],
])

# Stack the batches along dim 0, then collapse every (sample, label) entry
# into one 1-D vector, so a single ROC curve is computed over all entries.
true_labels_all = torch.cat((true_labels_batch1, true_labels_batch2)).reshape(-1).numpy()
predicted_probs_all = torch.cat((predicted_probs_batch1, predicted_probs_batch2)).reshape(-1).numpy()

# ROC points and the area under the curve
fpr, tpr, thresholds = roc_curve(true_labels_all, predicted_probs_all)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the diagonal reference line
plt.figure(figsize=(8, 8))
plt.plot(
    fpr,
    tpr,
    color='darkorange',
    lw=2,
    label='ROC curve (area = {:.2f})'.format(roc_auc),
)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


In [None]:
# 40-element int32 vector of 0/1 values, written five per line.
a1 = np.array(
    [
        1, 0, 0, 0, 0,
        1, 0, 0, 0, 0,
        0, 0, 0, 0, 1,
        0, 0, 0, 0, 0,
        0, 0, 0, 0, 0,
        1, 0, 0, 0, 0,
        1, 0, 0, 0, 0,
        1, 0, 0, 0, 0,
    ],
    dtype=np.int32,
)

In [None]:
# 40-element int32 vector of 0/1 values, written five per line.
a2 = np.array(
    [
        0, 1, 0, 0, 0,
        1, 0, 0, 0, 1,
        0, 0, 0, 0, 1,
        0, 0, 1, 1, 0,
        0, 0, 0, 1, 1,
        0, 0, 0, 1, 0,
        0, 0, 0, 1, 0,
        1, 1, 0, 0, 0,
    ],
    dtype=np.int32,
)

In [None]:
# Hard-coded ratio, 26 out of 40.
# NOTE(review): presumably a hand-counted agreement between a1 and a2 - confirm
# which definitions of a1/a2 it refers to.
acc = 26 / 40
acc

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Element-wise agreement between the two vectors, with a1 passed as the
# reference (y_true) and a2 as the prediction (y_pred).
accuracy = accuracy_score(y_true=a1, y_pred=a2)
accuracy

In [None]:
# Manual cross-check: count the positions where a1 and a2 hold the same value.
o = sum(1 for left, right in zip(a1, a2) if left == right)
o

In [None]:
import numpy as np

# Two 40-element int32 vectors of 0/1 values, written five per line.
a1 = np.array(
    [
        1, 0, 0, 0, 0,
        1, 0, 0, 0, 0,
        0, 0, 0, 0, 0,
        0, 0, 1, 1, 0,
        0, 1, 1, 0, 0,
        1, 0, 1, 0, 0,
        1, 0, 0, 0, 0,
        1, 0, 0, 0, 0,
    ],
    dtype=np.int32,
)
a2 = np.array(
    [
        0, 1, 0, 0, 0,
        1, 0, 0, 0, 1,
        0, 0, 0, 0, 1,
        0, 0, 1, 1, 0,
        0, 0, 0, 1, 1,
        0, 0, 0, 1, 0,
        0, 0, 0, 1, 0,
        1, 1, 0, 0, 0,
    ],
    dtype=np.int32,
)



In [None]:
# Join the two vectors end to end (a1 first, then a2) along their only axis.
a3 = np.concatenate((a1, a2), axis=0)
a3

In [None]:
# Number of elements along the first axis of the concatenated array.
a3.shape[0]

In [None]:
import torch
import torch.nn.functional as F

# Example scores: 2 rows x 5 columns of floats.
matrix = torch.tensor([
    [-0.8426, 0.4689, -0.8778, 0.3294, -0.3936],
    [-0.0423, 0.6215, -0.2281, 0.2345, -0.3712],
])

# Matching multi-hot integer targets, same 2 x 5 layout.
lbl_true = torch.tensor([
    [0, 0, 1, 0, 0],
    [0, 1, 0, 1, 0],
])



In [None]:
import torch

# Start from a 1-D tensor with zero elements; it serves as the accumulator.
empty_tensor = torch.empty(0)

# Values appended on each step
new_tensor = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])

# First append: concatenating onto the empty accumulator yields just the new values
empty_tensor = torch.cat((empty_tensor, new_tensor), dim=0)
print("\nConcatenated Tensor:", empty_tensor, sep="\n")

# Second append: dim defaults to 0, so the values are added at the end again
empty_tensor = torch.cat((empty_tensor, new_tensor))
print("\nConcatenated Tensor:", empty_tensor, sep="\n")


In [None]:
import torch
import torch.nn.functional as F

# Raw scores (logits): 2 rows x 5 columns
matrix = torch.tensor([[-0.8426, 0.4689, 0.8778, 0.3294, -0.3936],
                       [-0.00423, 0.8, -0.02281, 0.9, -0.03712]])

# True labels
lbl_true = torch.tensor([[0, 0, 1, 0, 0],
                         [0, 1, 0, 1, 0]])

# Per-label probabilities, kept for inspection only. They must NOT be fed to
# the loss below.
matrix_sigmoid = torch.sigmoid(matrix)

# binary_cross_entropy_with_logits applies the sigmoid internally, so it takes
# the raw logits; passing matrix_sigmoid would apply the sigmoid twice.
loss = F.binary_cross_entropy_with_logits(matrix, lbl_true.float())

print("BCEWithLogitsLoss:")
print(loss.item())


In [None]:
import pandas as pd

# Original dataset: two integer columns, four rows
original_data = dict(
    Column1=[9, 9, 9, 9],
    Column2=[9, 9, 9, 8],
)
df = pd.DataFrame(original_data)

# Extra values to attach: three rows of four flags each
LB = [[0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 1]]
LB_df = pd.DataFrame(LB, columns=list('ABCD'))

# Column-wise concatenation aligns rows on the index. LB_df has one row fewer
# than df, so the last row of A-D is filled with NaN.
result_df = pd.concat((df, LB_df), axis='columns')

# Show the combined frame
print(result_df)
