In [1]:
# Install the Hugging Face libraries into the environment of the running kernel.
# `%pip` targets the kernel itself (a `!pip` shell call may hit a different Python);
# `-q` keeps the download log out of the notebook output.
%pip install -q transformers datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [2]:
import os
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [4]:
# Mount Google Drive and load the train/validation/test CSV splits stored on it.
# Adjust `drive_path` (or the relative paths below) if your Drive layout differs.

from google.colab import drive
import os

drive.mount('/content/drive')

# Root folder inside the mounted Drive
drive_path = '/content/drive/MyDrive/'

# Full path of each split's CSV file
train_csv, val_csv, test_csv = (
    os.path.join(drive_path, relative_csv)
    for relative_csv in (
        'dataset_splits/train/train.csv',
        'dataset_splits/val/val.csv',
        'dataset_splits/test/test.csv',
    )
)

# One loader call builds all three splits, keyed by split name
data_files = dict(train=train_csv, validation=val_csv, test=test_csv)
dataset = load_dataset("csv", data_files=data_files)

# Optional: report the number of rows in each split
for heading, split_name in (
    ("Train samples:", "train"),
    ("Validation samples:", "validation"),
    ("Test samples:", "test"),
):
    print(heading, len(dataset[split_name]))

Mounted at /content/drive


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Train samples: 3344
Validation samples: 1115
Test samples: 1115


In [6]:
def rename_columns(example):
    """Convert one raw record to the ``text`` / ``label`` schema.

    Args:
        example: Mapping with an ``"sms"`` entry and a ``"label"`` entry.

    Returns:
        A new dict whose ``"text"`` is the value of ``"sms"`` and whose
        ``"label"`` is the original label passed through ``int()``.
    """
    converted = {}
    converted["text"] = example["sms"]
    converted["label"] = int(example["label"])
    return converted

# Convert every split to the two-column ("text", "label") schema.
# The guard keeps this cell safe to re-run: once the raw "sms" column has been
# removed the conversion is already done, and mapping again would fail with
# KeyError: 'sms' inside rename_columns.
raw_columns = dataset["train"].column_names
if "sms" in raw_columns:
    dataset = dataset.map(rename_columns, remove_columns=raw_columns)

# Verify the columns
print("Columns in train split:", dataset["train"].column_names)

Map:   0%|          | 0/3344 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

Columns in train split: ['label', 'text']


In [8]:
# Hub identifier of the pretrained checkpoint to fine-tune
model_checkpoint = "google/electra-small-discriminator"

# Tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
)

# Same checkpoint wrapped for sequence classification with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over the ``"text"`` field of ``examples``.

    Truncation is switched on; no padding argument is passed here.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of records when used
            with ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Apply the tokenizer to every split; batched=True hands the function
# many records per call instead of one at a time.
tokenized_datasets = dataset.map(
    function=tokenize_function,
    batched=True,
)

# Collator that pads the inputs dynamically when batches are assembled
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
)


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1115 [00:00<?, ? examples/s]

In [9]:
def compute_metrics(eval_pred):
    """Compute accuracy plus precision/recall/F1 for the positive class (label 1).

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` holds per-class scores
            along the last axis; ``labels`` holds the true class ids.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Some models return a tuple of outputs rather than a bare array.
    # NOTE(review): assumes the logits are the first element — confirm per model.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 reports 0.0 (without an UndefinedMetricWarning) when the
    # positive class is never predicted or never present in `labels`.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predictions,
        average="binary",
        pos_label=1,
        zero_division=0,
    )
    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}


In [11]:


# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./results_electra",
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy` argument
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs_electra",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # the "f1" entry returned by compute_metrics
    save_strategy="epoch",  # must match the evaluation strategy when load_best_model_at_end=True
    report_to="none",  # Disable WandB logging
)

# Wire model, data, collator and metrics into a Trainer.
# The train split is used for optimisation, the validation split for the
# evaluation passes configured in `training_args`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,  # replaces the deprecated `tokenizer=` argument (FutureWarning)
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



  trainer = Trainer(


In [12]:
# Run fine-tuning with the settings in `training_args`; the call's return value
# is the last expression of the cell, so the notebook displays it.
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1058,0.080428,0.988341,0.947712,0.966667,0.957096
2,0.0525,0.053964,0.989238,0.972603,0.946667,0.959459
3,0.0219,0.049375,0.990135,0.972789,0.953333,0.962963


TrainOutput(global_step=627, training_loss=0.1250345015354704, metrics={'train_runtime': 59.7779, 'train_samples_per_second': 167.821, 'train_steps_per_second': 10.489, 'total_flos': 34955130127296.0, 'train_loss': 0.1250345015354704, 'epoch': 3.0})

In [13]:
# Evaluate the trainer's current model on the test split and show the metrics dict.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("Test Set Evaluation Results:", test_results, sep="\n")


Test Set Evaluation Results:
{'eval_loss': 0.05280797556042671, 'eval_accuracy': 0.989237668161435, 'eval_precision': 0.9790209790209791, 'eval_recall': 0.9395973154362416, 'eval_f1': 0.958904109589041, 'eval_runtime': 1.6693, 'eval_samples_per_second': 667.93, 'eval_steps_per_second': 41.933, 'epoch': 3.0}


In [17]:
# Destination folder on the mounted Google Drive
absolute_save_path = "/content/drive/MyDrive/fine_tuned_electra"

# Store the model and the tokenizer in the same folder
trainer.save_model(output_dir=absolute_save_path)
tokenizer.save_pretrained(save_directory=absolute_save_path)
print(
    "Fine-tuned ELECTRA model and tokenizer saved successfully at",
    absolute_save_path,
)


Fine-tuned ELECTRA model and tokenizer saved successfully at /content/drive/MyDrive/fine_tuned_electra
