In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Notebook-wide settings used by the cells below.
TOKEN_LIMIT = 400  # max_length handed to the bert-base-uncased tokenizer
RANDOM_SEED = 69   # random_state for DataFrame.sample and train_test_split
TORCH_SEED = 69    # value passed to torch.manual_seed

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only); later cells unzip model
# archives from paths under this mount point.
drive.mount('/content/drive')

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# Seed PyTorch's random number generator with the notebook-level TORCH_SEED constant.
torch.manual_seed(TORCH_SEED)

In [None]:
data = pd.read_csv('finetuning_dataset.csv')

# Keep a random 30% of the Twitter rows and every row from any other source.
is_twitter = data['source'] == 'Twitter'
not_twitter = data['source'] != 'Twitter'
twitter_data = data.loc[is_twitter].sample(frac=0.3, random_state=RANDOM_SEED)
other_data = data.loc[not_twitter]
sampled_data = pd.concat([twitter_data, other_data])

# Total number of rows that carry a 'source' value after subsampling.
source_counts = sampled_data['source'].value_counts()
print(source_counts.sum())

In [None]:
# Wrap the subsampled DataFrame in a Hugging Face Dataset; the bare expression on the
# last line displays it as the cell output.
dataset = Dataset.from_pandas(sampled_data)
dataset

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(data):
    """Tokenize the 'text' entries of a batch, truncating to TOKEN_LIMIT tokens."""
    texts = data['text']
    return tokenizer(texts, max_length=TOKEN_LIMIT, truncation=True)


# batched=True hands tokenize_function a batch of rows per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Go through pandas so scikit-learn can perform the stratified split.
df = tokenized_dataset.to_pandas()

# Hold out 20% of the rows for evaluation, stratified on the 'source' column.
train_df, eval_df = train_test_split(
    df,
    stratify=df['source'],
    test_size=0.2,
    random_state=RANDOM_SEED,
)
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)


def _polarity_as_labels(row):
    """Expose the row's 'polarity' value under the 'labels' key."""
    return {'labels': row['polarity']}


train_dataset = Dataset.from_pandas(train_df).map(_polarity_as_labels)
eval_dataset = Dataset.from_pandas(eval_df).map(_polarity_as_labels)

print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the current tokenizer; every Trainer below receives it as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a set of predictions.

    `pred.label_ids` supplies the reference labels and the predicted class is the
    argmax over the last axis of `pred.predictions`. Precision, recall and F1 are
    computed with scikit-learn's 'binary' averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [None]:
# Schedule for the first fine-tuning pass: one epoch at batch size 16.
epochs = 1
batch_size = 16
total_steps = epochs * (len(train_dataset) // batch_size)

# Base BERT model

In [None]:
# Load bert-base-uncased as a sequence classifier with two output labels.
base_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
# Fine-tuning configuration for the base model (first pass).
training_args = TrainingArguments(
    # Optimisation: warm-up covers the first 10% of `total_steps`.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    output_dir="./base_model_results",
    # Logging
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

# Store the fine-tuned weights and the tokenizer in the same directory.
base_model.save_pretrained('./base_model')
tokenizer.save_pretrained('./base_model')

# Base BERT additional training ( 2 more epochs )

In [None]:
# Schedule for the follow-up pass: two further epochs at batch size 16.
epochs = 2
batch_size = 16
total_steps = epochs * (len(train_dataset) // batch_size)

In [None]:
# Continue training the same `base_model` object with a fresh Trainer and schedule.
training_args = TrainingArguments(
    # Optimisation: warm-up covers the first 10% of `total_steps`.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    output_dir="./base_model_additional_trg_results",
    # Logging
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
)

trainer = Trainer(
    model=base_model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

# Store the further-trained weights and the tokenizer in the same directory.
base_model.save_pretrained('./base_model_additional_trg')
tokenizer.save_pretrained('./base_model_additional_trg')

# WHLA BERT

In [None]:
from torch.nn import CrossEntropyLoss
from transformers import BertModel
import torch.nn as nn

class WHLA_BERT(nn.Module):
    """BERT classifier that mixes the last four hidden states with learned gates.

    The last four entries of the encoder's `hidden_states` are each scaled by one
    learnable scalar, summed, layer-normalised, and the vector at sequence position 0
    is passed through dropout and a linear layer to produce the logits.
    (The original code names these states L9-L12 -- presumably encoder layers 9-12 of
    a 12-layer BERT; confirm for other checkpoints.)
    """

    def __init__(self, pretrained_model="bert-base-uncased", num_labels=2):
        super().__init__()

        # Encoder configured to return the hidden states of every layer.
        self.bert = BertModel.from_pretrained(pretrained_model, output_hidden_states=True)
        self.hidden_size = self.bert.config.hidden_size

        # One learnable scalar per mixed hidden state, all initialised to 1.
        self.gates = nn.Parameter(torch.ones(4))
        self.fc = nn.Linear(self.hidden_size, num_labels)
        self.dropout = nn.Dropout(0.5)
        self.layer_norm = nn.LayerNorm(self.hidden_size)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return logits, or a dict with 'loss' and 'logits' when `labels` is given."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        last_four = encoder_out.hidden_states[-4:]

        # Gate-weighted sum, accumulated from the earliest of the four states to the last.
        mixed = self.gates[0] * last_four[0]
        for idx in range(1, 4):
            mixed = mixed + self.gates[idx] * last_four[idx]

        # Normalise, then keep only the vector at sequence position 0.
        first_token = self.layer_norm(mixed)[:, 0, :]
        logits = self.fc(self.dropout(first_token))

        if labels is None:
            return logits

        loss = CrossEntropyLoss()(logits.view(-1, logits.size(-1)), labels.view(-1))
        return {"loss": loss, "logits": logits}

In [None]:
# Instantiate the gated-layer classifier on top of bert-base-uncased with two output labels.
whla_model = WHLA_BERT(pretrained_model="bert-base-uncased", num_labels=2)

In [None]:
# First WHLA fine-tuning pass: one epoch, matching the first pass of the base and
# expanded-vocabulary models. The schedule is set here explicitly; without it this
# cell would silently reuse the `epochs = 2` / `total_steps` values left behind by the
# base model's "additional training" cells when the notebook is run top to bottom.
batch_size = 16
epochs = 1
total_steps = (len(train_dataset) // batch_size) * epochs

whla_model_training_args = TrainingArguments(
    output_dir="./whla_model_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm up over the first 10% of the planned optimisation steps.
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
)

whla_model_trainer = Trainer(
    model=whla_model,
    args=whla_model_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

whla_model_trainer.train()

whla_model_trainer.evaluate()

# WHLA BERT additional training ( 2 more epochs )

In [None]:
# Schedule for the follow-up WHLA pass: two further epochs at batch size 16.
epochs = 2
batch_size = 16
total_steps = epochs * (len(train_dataset) // batch_size)

In [None]:
# Continue training the same `whla_model` object with a fresh Trainer and schedule.
whla_model_training_args = TrainingArguments(
    # Optimisation: warm-up covers the first 10% of `total_steps`.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    output_dir="./whla_model_additional_trg_results",
    # Logging
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
)

whla_model_trainer = Trainer(
    model=whla_model,
    args=whla_model_training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

whla_model_trainer.train()
whla_model_trainer.evaluate()

# Pre-trained BERT

In [None]:
# Extract the social-media pre-trained checkpoint archive from the mounted Drive into
# the current working directory (-q suppresses unzip's per-file output).
!unzip -q /content/drive/MyDrive/Capstone/bert-pretrain-socialmedia-model.zip -d ./

In [None]:
# Load the unzipped checkpoint directory as a sequence classifier with two output labels.
pretrained_model = BertForSequenceClassification.from_pretrained("./bert-pretrain-socialmedia-model", num_labels=2)

In [None]:
# First fine-tuning pass for the social-media pre-trained model: one epoch, matching
# the first pass of the base and expanded-vocabulary models. The schedule is set here
# explicitly; without it this cell would silently reuse the `epochs = 2` /
# `total_steps` values left behind by the WHLA "additional training" cells when the
# notebook is run top to bottom.
batch_size = 16
epochs = 1
total_steps = (len(train_dataset) // batch_size) * epochs

pretrained_model_training_args = TrainingArguments(
    output_dir="./pretrained_model_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm up over the first 10% of the planned optimisation steps.
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
)

pretrained_model_trainer = Trainer(
    model=pretrained_model,
    args=pretrained_model_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

pretrained_model_trainer.train()

pretrained_model_trainer.evaluate()

# Store the fine-tuned weights and the tokenizer in the same directory.
pretrained_model.save_pretrained('./pretrained_model')
tokenizer.save_pretrained('./pretrained_model')

# Pre-trained BERT additional training ( 2 more epochs )

In [None]:
# Schedule for the follow-up pre-trained-model pass: two further epochs at batch size 16.
epochs = 2
batch_size = 16
total_steps = epochs * (len(train_dataset) // batch_size)

In [None]:
# Continue training the same `pretrained_model` object with a fresh Trainer and schedule.
pretrained_model_training_args = TrainingArguments(
    # Optimisation: warm-up covers the first 10% of `total_steps`.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    output_dir="./pretrained_model_additional_trg_results",
    # Logging
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
)

pretrained_model_trainer = Trainer(
    model=pretrained_model,
    args=pretrained_model_training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

pretrained_model_trainer.train()
pretrained_model_trainer.evaluate()

# Store the further-trained weights and the tokenizer in the same directory.
pretrained_model.save_pretrained('./pretrained_model_additional_trg')
tokenizer.save_pretrained('./pretrained_model_additional_trg')

# Expanded vocab BERT

In [None]:
# Extract the expanded-vocabulary archive from the mounted Drive into the current
# working directory (-q suppresses unzip's per-file output).
!unzip -q '/content/drive/MyDrive/Trained Models/expanded_vocab_model' -d ./

In [None]:
# max_length used when tokenizing with the expanded-vocabulary tokenizer
# (the earlier bert-base-uncased runs use TOKEN_LIMIT instead).
EXPANDED_TOKEN_LIMIT = 512

In [None]:
# From here on `tokenizer` refers to the expanded-vocabulary tokenizer.
tokenizer = BertTokenizer.from_pretrained('./expanded_vocab_model')

# To verify if tokenizer is expanded properly, else can ignore
test_slang = "gratz"
test_emoji = "✅"

slang_id = tokenizer.convert_tokens_to_ids(test_slang)
emoji_id = tokenizer.convert_tokens_to_ids(test_emoji)

# (lower-case label, capitalised label, token, token id) for each probe.
vocab_probes = (
    ("slang", "Slang", test_slang, slang_id),
    ("emoji", "Emoji", test_emoji, emoji_id),
)

for lower_label, _, probe_token, probe_id in vocab_probes:
    print(f"Token ID for {lower_label} '{probe_token}': {probe_id}")

# A probe that maps to the unknown-token id is not part of the vocabulary.
for _, title_label, probe_token, probe_id in vocab_probes:
    if probe_id == tokenizer.unk_token_id:
        print(f"{title_label} '{probe_token}' is not in the vocabulary.")
    else:
        print(f"{title_label} '{probe_token}' is in the vocabulary.")

In [None]:
def tokenize_function(data):
    """Tokenize the 'text' entries of a batch, truncating to EXPANDED_TOKEN_LIMIT tokens."""
    texts = data['text']
    return tokenizer(texts, max_length=EXPANDED_TOKEN_LIMIT, truncation=True)


# Re-tokenize the full dataset with the tokenizer currently bound to `tokenizer`.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Go through pandas so scikit-learn can perform the stratified split.
df = tokenized_dataset.to_pandas()

# Same 80/20 split settings as before: stratified on 'source', seeded with RANDOM_SEED.
train_df, eval_df = train_test_split(
    df,
    stratify=df['source'],
    test_size=0.2,
    random_state=RANDOM_SEED,
)
train_df = train_df.reset_index(drop=True)
eval_df = eval_df.reset_index(drop=True)


def _polarity_as_labels(row):
    """Expose the row's 'polarity' value under the 'labels' key."""
    return {'labels': row['polarity']}


train_dataset = Dataset.from_pandas(train_df).map(_polarity_as_labels)
eval_dataset = Dataset.from_pandas(eval_df).map(_polarity_as_labels)

print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
# Load the expanded-vocabulary checkpoint as a sequence classifier with two output labels.
# NOTE(review): the tokenizer above is loaded from './expanded_vocab_model' while the
# weights are read from './expanded_vocab_bert' -- confirm the unzipped archive really
# provides this second directory.
expanded_vocab_model = BertForSequenceClassification.from_pretrained("./expanded_vocab_bert", num_labels=2)

In [None]:
# To verify if embeddings are initialized properly

embedding_layer = expanded_vocab_model.bert.embeddings.word_embeddings
embedding_weights = embedding_layer.weight.data
unk_id = tokenizer.unk_token_id

# Only print an embedding row for probes that resolved to a real vocabulary id.
slang_known = slang_id != unk_id
if slang_known:
    slang_embedding = embedding_weights[slang_id]
    print(f"Embedding for slang '{test_slang}': {slang_embedding}")

emoji_known = emoji_id != unk_id
if emoji_known:
    emoji_embedding = embedding_weights[emoji_id]
    print(f"Embedding for emoji '{test_emoji}': {emoji_embedding}")

In [None]:
from transformers import DataCollatorWithPadding

# Rebuild the collator so it uses the expanded-vocabulary tokenizer now bound to `tokenizer`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a set of predictions.

    `pred.label_ids` supplies the reference labels and the predicted class is the
    argmax over the last axis of `pred.predictions`. Precision, recall and F1 are
    computed with scikit-learn's 'binary' averaging.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': scores[2],
        'precision': scores[0],
        'recall': scores[1],
    }

In [None]:
# Schedule for the first expanded-vocabulary pass: one epoch at batch size 16.
epochs = 1
batch_size = 16
total_steps = epochs * (len(train_dataset) // batch_size)

In [None]:
# Fine-tuning configuration for the expanded-vocabulary model (first pass).
expanded_vocab_training_args = TrainingArguments(
    # Optimisation: warm-up covers the first 10% of `total_steps`.
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=int(0.1 * total_steps),
    num_train_epochs=epochs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=10,
    output_dir="./expanded_vocab_model_results",
    # Logging
    logging_dir="./logs",
    logging_steps=500,
    report_to="none",
)

expanded_vocab_trainer = Trainer(
    model=expanded_vocab_model,
    args=expanded_vocab_training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

expanded_vocab_trainer.train()
expanded_vocab_trainer.evaluate()

# Store the fine-tuned weights and the tokenizer in the same directory.
expanded_vocab_model.save_pretrained('./expanded_vocab_model')
tokenizer.save_pretrained('./expanded_vocab_model')

# Expanded vocab BERT additional training ( 2 more epochs )

In [None]:
# Schedule for the follow-up expanded-vocabulary pass: two further epochs at batch size 16.
epochs = 2
batch_size = 16
total_steps = epochs * (len(train_dataset) // batch_size)

In [None]:
# Continue training the same `expanded_vocab_model` object with a fresh Trainer and schedule.
expanded_vocab_training_args = TrainingArguments(
    output_dir="./expanded_vocab_model_additional_trg_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm up over the first 10% of the planned optimisation steps.
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=10,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
)

expanded_vocab_trainer = Trainer(
    model=expanded_vocab_model,
    args=expanded_vocab_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

# Train with the expanded-vocabulary trainer built just above. The earlier version
# called `pretrained_model_trainer.train()` here, which trained the wrong model and
# left `expanded_vocab_model` untouched.
expanded_vocab_trainer.train()

expanded_vocab_trainer.evaluate()

# Save the expanded-vocabulary model, not `pretrained_model`, next to its tokenizer.
expanded_vocab_model.save_pretrained('./expanded_vocab_model_additional_trg')
tokenizer.save_pretrained('./expanded_vocab_model_additional_trg')