In [None]:
import torch
from transformers import BertTokenizer, BertModel
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset

# Single seed for the notebook's stochastic steps.
RANDOM_SEED = 69

# Seed PyTorch up front so that randomly initialised layers created in later
# cells (e.g. the classifier head's nn.Linear) start from the same weights on
# every Restart & Run All. The trailing ';' hides the returned Generator repr.
torch.manual_seed(RANDOM_SEED);

In [None]:
# Load custom dataset from CSV (the rows are read back from the 'train' key below)
dataset = load_dataset('csv', data_files='Datasets/Cleaned/finetuning_dataset.csv')

# Hold out 10% of the rows for evaluation. The split is seeded with the
# notebook-wide RANDOM_SEED so the same rows land in each split on every run;
# without a seed the partition changes between kernel restarts.
train_test_split = dataset['train'].train_test_split(test_size=0.1, seed=RANDOM_SEED)

# Access the new splits
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# NOTE(review): the next cell rebinds dataset / train_dataset / eval_dataset to a
# 10% subsample, so when both cells are run these full-size splits are replaced.

# Inspect the splits
print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
# Re-read the CSV and keep only the rows stored under its 'train' key
dataset = load_dataset('csv', data_files='Datasets/Cleaned/finetuning_dataset.csv')['train']

# Shuffle with a fixed seed, then keep the leading tenth of the shuffled rows
sampled_dataset = dataset.shuffle(seed=42)
sampled_dataset = sampled_dataset.select(range(int(len(dataset) * 0.1)))

# Seeded 80/20 train/evaluation split of that subsample
train_test_split = sampled_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Report how many rows ended up on each side
print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
# Prefer the CUDA device when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [5]:
import torch.nn as nn
from torch.nn import CrossEntropyLoss

class WHLA_BERT(nn.Module):
    """BERT classifier that pools a weighted, nested sum of the last four hidden states.

    The encoder's last four hidden-state tensors are combined as
    ``w2 * (w1 * (h[-4] + h[-3]) + h[-2]) + h[-1]`` with two learnable scalars,
    the vector at sequence position 0 is taken from the result, passed through
    dropout and projected to ``num_labels`` logits.
    """

    def __init__(self, pretrained_model="bert-base-uncased", num_labels=2):
        """Build the encoder, the two aggregation weights and the classifier head.

        Args:
            pretrained_model: Checkpoint name or path handed to
                ``BertModel.from_pretrained``.
            num_labels: Number of output classes (width of the logits).
        """
        super().__init__()

        # output_hidden_states=True so forward() can read every layer's output,
        # not just the last one.
        self.bert = BertModel.from_pretrained(pretrained_model, output_hidden_states=True)
        self.hidden_size = self.bert.config.hidden_size

        # Learnable scalar weights for the H-SUM aggregation, both starting at 1.0
        self.w1 = nn.Parameter(torch.tensor(1.0))
        self.w2 = nn.Parameter(torch.tensor(1.0))

        # Classification head applied to the pooled position-0 vector
        self.fc = nn.Linear(self.hidden_size, num_labels)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and classify the aggregated position-0 representation.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.
            labels: Optional class-index tensor. The default is ``None``; the
                previous default of ``2`` was an int, which sent every
                label-free call into the loss branch and crashed on
                ``labels.view``.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given, otherwise
            the logits tensor alone.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        hidden_states = outputs.hidden_states

        # H-SUM aggregation over the last four entries of hidden_states
        # (layers 9-12 for the 12-layer bert-base checkpoint).
        fourth_last, third_last, second_last, last = hidden_states[-4:]

        aggregation_1 = fourth_last + third_last
        aggregation_2 = self.w1 * aggregation_1 + second_last
        final_aggregation = self.w2 * aggregation_2 + last

        # Sequence position 0 (the [CLS] slot for BERT-style inputs)
        cls_representation = final_aggregation[:, 0, :]

        logits = self.fc(self.dropout(cls_representation))

        if labels is None:
            return logits

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return {"loss": loss, "logits": logits}

In [None]:
import torch.nn as nn
from torch.nn import CrossEntropyLoss

class WHLA_BERT_NEW(nn.Module):
    """Variant of the weighted hidden-layer-aggregation BERT classifier.

    Combines the encoder's last four hidden-state tensors with two learnable
    scalars, ``w2 * (w1 * (h[-4] + h[-3]) + h[-2]) + h[-1]``, then classifies
    the vector found at sequence position 0 of that sum.
    """

    def __init__(self, pretrained_model="bert-base-uncased", num_labels=2):
        """Create the encoder, aggregation weights, dropout and linear head.

        Args:
            pretrained_model: Checkpoint name or path given to
                ``BertModel.from_pretrained``.
            num_labels: Number of classes the linear head predicts.
        """
        super().__init__()

        # Ask the encoder to expose all hidden states; forward() needs the last four.
        self.bert = BertModel.from_pretrained(pretrained_model, output_hidden_states=True)
        self.hidden_size = self.bert.config.hidden_size

        # Learnable scalar weights for H-SUM, initialised to 1.0
        self.w1 = nn.Parameter(torch.tensor(1.0))
        self.w2 = nn.Parameter(torch.tensor(1.0))

        # Fully connected output layer and its input dropout
        self.fc = nn.Linear(self.hidden_size, num_labels)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute logits, plus the cross-entropy loss when labels are supplied.

        Args:
            input_ids: Token ids passed to the BERT encoder.
            attention_mask: Attention mask passed to the BERT encoder.
            labels: Optional tensor of class indices. Defaults to ``None`` so a
                call without labels returns logits instead of failing on
                ``labels.view`` (the old default was the int ``2``).

        Returns:
            A dict with ``"loss"`` and ``"logits"`` if ``labels`` is provided;
            otherwise just the logits tensor.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        hidden_states = outputs.hidden_states

        # H-SUM aggregation: fold the last four hidden states together, oldest first.
        # For the 12-layer bert-base checkpoint these are layers 9, 10, 11 and 12.
        h_m4 = hidden_states[-4]
        h_m3 = hidden_states[-3]
        h_m2 = hidden_states[-2]
        h_m1 = hidden_states[-1]

        inner_sum = h_m4 + h_m3
        middle_sum = self.w1 * inner_sum + h_m2
        final_aggregation = self.w2 * middle_sum + h_m1

        # Keep only sequence position 0 for classification
        cls_representation = final_aggregation[:, 0, :]

        logits = self.fc(self.dropout(cls_representation))

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
            return {"loss": loss, "logits": logits}
        return logits

In [6]:
# Build the tokenizer and the custom classifier from the same
# "bert-base-uncased" checkpoint; the classifier predicts two classes.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
)
model = WHLA_BERT(
    pretrained_model="bert-base-uncased",
    num_labels=2,
)

In [7]:
# Tokenize Dataset
def tokenize_function(examples):
    """Tokenize the ``text`` field, padding/truncating each entry to 128 tokens."""
    encoding_options = dict(padding="max_length", truncation=True, max_length=128)
    return tokenizer(examples["text"], **encoding_options)


def _polarity_to_labels(example):
    """Return a ``labels`` entry holding the example's ``polarity`` as an int64 tensor."""
    return {'labels': torch.tensor(example['polarity'], dtype=torch.long)}


# Pass 1: batched tokenization of both splits (train first, then eval)
tokenized_train, tokenized_eval = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
]

# Pass 2: add the `labels` column, derived per example from `polarity`
tokenized_train, tokenized_eval = [
    split.map(_polarity_to_labels)
    for split in (tokenized_train, tokenized_eval)
]

In [None]:
# Display the tokenized training split (rendered because it is the cell's last expression)
tokenized_train

In [None]:
# Display the tokenized evaluation split (rendered because it is the cell's last expression)
tokenized_eval

In [None]:
# Sanity check: look at the first tokenized training example
row = tokenized_train[0]

# Show the raw token ids
print("Input IDs:", row['input_ids'])

# Round-trip the ids back to text, dropping special tokens, and show the result
decoded_text = tokenizer.decode(
    row['input_ids'],
    skip_special_tokens=True,
)
print("Decoded Text:", decoded_text)

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision/recall/F1 use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth returned element (support) is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf_scores[2]
    metrics['precision'] = prf_scores[0]
    metrics['recall'] = prf_scores[1]
    return metrics

In [13]:
# Training schedule: per-device batch size and number of passes over the data
batch_size = 16
epochs = 2

# Total optimizer steps for the run, used to size the warmup.
# Ceiling division: the last, partially filled batch of an epoch still counts
# as a step, whereas floor division dropped it and undercounted the total.
# NOTE(review): assumes a single device, no gradient accumulation and the
# default of keeping the last incomplete batch; confirm against the Trainer setup.
steps_per_epoch = (len(tokenized_train) + batch_size - 1) // batch_size
total_steps = steps_per_epoch * epochs

In [None]:
import os

# Trainer configuration: evaluate and checkpoint once per epoch, keep at most
# two checkpoints, and reload the best one when training finishes.
training_args = TrainingArguments(
    output_dir="./whla_bert_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    # Use the notebook-level batch_size (16) so the batch size stays in sync
    # with the total_steps / warmup computation.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm up over the first 10% of the planned optimizer steps
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Keep and show the final metrics; only a cell's last expression is rendered,
# so the bare call's result was previously discarded.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# `model` is a plain nn.Module, which has no save_pretrained(); calling it
# raised AttributeError after training. Persist the weights as a state dict.
# To reload: rebuild the model class, then
#   model.load_state_dict(torch.load(os.path.join(save_dir, 'pytorch_model.bin')))
save_dir = './WHLABert_sentiment_model'
os.makedirs(save_dir, exist_ok=True)
torch.save(model.state_dict(), os.path.join(save_dir, 'pytorch_model.bin'))
tokenizer.save_pretrained(save_dir)