In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset

RANDOM_SEED = 69

# For training

In [None]:
# Load the custom fine-tuning dataset from CSV; the rows are exposed under
# the 'train' split of the returned object.
dataset = load_dataset('csv', data_files='Datasets/Cleaned/finetuning_dataset.csv')

# Access the train split
train_dataset = dataset['train']

# Hold out 10% of the rows for evaluation. The split is seeded with the
# notebook-level RANDOM_SEED so that Restart & Run All reproduces the same
# train/eval partition instead of drawing a new one on every run.
train_test_split = train_dataset.train_test_split(test_size=0.1, seed=RANDOM_SEED)

# Access the new splits
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Inspect the splits
print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

# For testing pipeline: quick smoke test on a 10% sample (this cell overwrites `train_dataset` and `eval_dataset`)

In [None]:
# Read the CSV and keep only its 'train' split.
csv_splits = load_dataset('csv', data_files='Datasets/Cleaned/finetuning_dataset.csv')
dataset = csv_splits['train']

# Shuffle with a fixed seed, then keep the first 10% of the shuffled rows.
sample_size = int(len(dataset) * 0.1)
shuffled_dataset = dataset.shuffle(seed=42)
sampled_dataset = shuffled_dataset.select(range(sample_size))

# Split the sample 80/20 into train and evaluation parts (fixed seed).
# NOTE: this rebinds `train_dataset` and `eval_dataset`, so any later cell
# that reads those names will see this reduced sample.
train_test_split = sampled_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

# Report the resulting sizes
print(f"Training size: {len(train_dataset)}, Evaluation size: {len(eval_dataset)}")

In [None]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [None]:
# The classifier and its tokenizer are both loaded from the same local
# checkpoint directory.
# NOTE(review): no `num_labels` is passed here -- confirm the checkpoint's
# configured label count matches the values in the `polarity` column.
pretrained_dir = '../bert-pretrain-socialmedia'
model = BertForSequenceClassification.from_pretrained(pretrained_dir)
tokenizer = BertTokenizer.from_pretrained(pretrained_dir)

In [None]:
# Tokenization helper
def tokenize_function(examples):
    """Tokenize the "text" entry of `examples` with the notebook-level `tokenizer`.

    The tokenizer is asked to truncate and to pad every sequence to a fixed
    length of 128 (`padding="max_length"`, `max_length=128`).

    Args:
        examples: Mapping with a "text" entry; a whole batch of texts when
            the function is used with `Dataset.map(..., batched=True)`.

    Returns:
        Whatever the `tokenizer(...)` call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, max_length=128, padding="max_length")
    return encoded

# Tokenize both splits, one batch of rows per call to tokenize_function.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Expose the `polarity` column under the name `labels` as well.
# batched=True hands the lambda a whole batch (a list of polarity values)
# at a time instead of invoking it once per row; the resulting column is
# the same, it is just built with far fewer Python calls.
tokenized_train = tokenized_train.map(lambda examples: {'labels': examples['polarity']}, batched=True)
tokenized_eval = tokenized_eval.map(lambda examples: {'labels': examples['polarity']}, batched=True)

In [None]:
# Bare expression: the notebook renders the dataset's repr as this cell's output.
tokenized_train

In [None]:
# Inspect one row of the tokenized train dataset.
# Row 40000 is used when it exists; the index is clamped to the last row so
# the cell does not raise IndexError when a smaller dataset (e.g. the 10%
# pipeline-testing sample) is loaded.
row_index = min(40000, len(tokenized_train) - 1)
row = tokenized_train[row_index]

# Print `input_ids` and the label stored for that row
print("Input IDs:", row['input_ids'])
print("Label:", row['labels'])

# Decode back to text (optional sanity check); special tokens are skipped
decoded_text = tokenizer.decode(row['input_ids'], skip_special_tokens=True)
print("Decoded Text:", decoded_text)

In [6]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Evaluation Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. ``logits`` must support
            ``.argmax(-1)`` (or be a tuple whose first element does);
            ``labels`` holds the reference class ids.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    logits, labels = eval_pred
    # Predictions can arrive as a tuple of model outputs; in that case the
    # logits are taken to be the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(-1)
    # zero_division=0 keeps the same numeric result as the default (0 for an
    # undefined precision/recall) but without emitting a warning whenever a
    # class is never predicted, which is common early in training.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [7]:
batch_size = 16
epochs = 2
# Ceiling division: when the dataset size is not a multiple of batch_size,
# the final, smaller batch of each epoch is still a training step (the
# Trainer keeps the last incomplete batch unless dataloader_drop_last is
# set), so floor division would undercount by one step per epoch.
# NOTE(review): assumes a single device and no gradient accumulation.
steps_per_epoch = -(-len(tokenized_train) // batch_size)
total_steps = steps_per_epoch * epochs

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, keep at
# most two checkpoints, and reload the best one when training finishes.
training_args = TrainingArguments(
    output_dir="./whla_bert_results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    # Use the shared `batch_size` rather than a second literal: `total_steps`
    # (and therefore the warmup length below) is derived from it, so the two
    # must not drift apart.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=16,
    num_train_epochs=epochs,
    weight_decay=0.01,
    # Warm up over the first 10% of the estimated training steps.
    warmup_steps=int(0.1 * total_steps),
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    report_to="none",
    logging_steps=500,
    load_best_model_at_end=True,
    # Tie training randomness to the notebook-level seed.
    seed=RANDOM_SEED,
)

# Wire the model, both tokenized splits, the tokenizer and the metric
# callback into a single Trainer driven by `training_args`.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Keep and print the final evaluation metrics: as a non-final expression in
# the cell, the value returned by evaluate() would otherwise be discarded
# without ever being shown.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model and tokenizer side by side in one directory.
save_dir = './finetuned_sentiment_model'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)