# Install (minimal)

In [1]:
!pip -q install transformers datasets accelerate evaluate scikit-learn

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, f1_score

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# Create tiny text classification dataset (no downloads)

We’ll do binary classification: Positive vs Negative.

In [2]:
texts = [
    "I love this system, it works perfectly.",
    "This is amazing and very helpful.",
    "Great experience, I am satisfied.",
    "Fantastic results and easy to use.",
    "I hate this tool, it is terrible.",
    "This is disappointing and buggy.",
    "Worst experience ever, useless.",
    "It keeps failing and wasting my time.",
    "Not bad, acceptable.",
    "Okay, could be improved.",
    "I am happy with the output.",
    "I am not satisfied with this."
]

# 1 = positive, 0 = negative
labels = [1,1,1,1,0,0,0,0,1,1,1,0]

df = pd.DataFrame({"text": texts, "label": labels})
df

Unnamed: 0,text,label
0,"I love this system, it works perfectly.",1
1,This is amazing and very helpful.,1
2,"Great experience, I am satisfied.",1
3,Fantastic results and easy to use.,1
4,"I hate this tool, it is terrible.",0
5,This is disappointing and buggy.,0
6,"Worst experience ever, useless.",0
7,It keeps failing and wasting my time.,0
8,"Not bad, acceptable.",1
9,"Okay, could be improved.",1


# Train/test split

In [3]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df["label"])
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print("Train:", len(train_df), "Test:", len(test_df))
train_df.head()

Train: 9 Test: 3


Unnamed: 0,text,label
0,This is disappointing and buggy.,0
1,"I love this system, it works perfectly.",1
2,Fantastic results and easy to use.,1
3,I am not satisfied with this.,0
4,This is amazing and very helpful.,1


# Tokenizer + dataset object

In [4]:
from datasets import Dataset
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_ds = Dataset.from_pandas(train_df)
test_ds  = Dataset.from_pandas(test_df)

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=64)

train_tok = train_ds.map(tokenize, batched=True)
test_tok  = test_ds.map(tokenize, batched=True)

cols = ["input_ids", "attention_mask", "label"]
train_tok.set_format(type="torch", columns=cols)
test_tok.set_format(type="torch", columns=cols)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

# Metrics function

In [5]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

# EXPERIMENT 1: Feature Extraction (Freeze base)

## Load model, freeze base layers

In [14]:
from transformers import AutoModelForSequenceClassification

# ----- Feature Extraction model (freeze all base) -----
model_fe = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

for p in model_fe.base_model.parameters():
    p.requires_grad = False

# ----- Fine-tuning model (unfreeze last layer only) -----
model_ft = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# freeze all first
for p in model_ft.base_model.parameters():
    p.requires_grad = False

# unfreeze LAST transformer layer + classifier head
for p in model_ft.distilbert.transformer.layer[-1].parameters():
    p.requires_grad = True

for p in model_ft.pre_classifier.parameters():
    p.requires_grad = True
for p in model_ft.classifier.parameters():
    p.requires_grad = True

# quick sanity prints
def count_trainable(m):
    return sum(p.numel() for p in m.parameters() if p.requires_grad), sum(p.numel() for p in m.parameters())

fe_trainable, fe_total = count_trainable(model_fe)
ft_trainable, ft_total = count_trainable(model_ft)

print(f"Feature extraction trainable: {fe_trainable:,} / {fe_total:,}")
print(f"Fine-tune trainable:        {ft_trainable:,} / {ft_total:,}")

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Feature extraction trainable: 592,130 / 66,955,010
Fine-tune trainable:        7,680,002 / 66,955,010


## Train Feature Extraction (1 epoch only)

In [15]:
from transformers import TrainingArguments, Trainer

args_fe = TrainingArguments(
    output_dir="tl_feature_extraction",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.0,
    logging_steps=1,
    report_to="none"
)

trainer_fe = Trainer(
    model=model_fe,
    args=args_fe,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics
)

trainer_fe.train()
fe_metrics = trainer_fe.evaluate()
print("Feature Extraction metrics:", fe_metrics)

  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.764831,0.638287,0.666667,0.8


  super().__init__(loader)


Feature Extraction metrics: {'eval_loss': 0.6382869482040405, 'eval_accuracy': 0.6666666666666666, 'eval_f1': 0.8, 'eval_runtime': 0.5031, 'eval_samples_per_second': 5.963, 'eval_steps_per_second': 1.988, 'epoch': 1.0}


## Fine-tuning Training (Unfreeze last layer)

In [16]:
args_ft = TrainingArguments(
    output_dir="tl_finetune_last_layer",
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    report_to="none"
)

trainer_ft = Trainer(
    model=model_ft,
    args=args_ft,
    train_dataset=train_tok,
    eval_dataset=test_tok,
    compute_metrics=compute_metrics
)

trainer_ft.train()
ft_metrics = trainer_ft.evaluate()
print("Fine-tuning metrics:", ft_metrics)

  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.792653,0.710488,0.333333,0.0


  super().__init__(loader)


Fine-tuning metrics: {'eval_loss': 0.710488498210907, 'eval_accuracy': 0.3333333333333333, 'eval_f1': 0.0, 'eval_runtime': 0.5116, 'eval_samples_per_second': 5.864, 'eval_steps_per_second': 1.955, 'epoch': 1.0}


## Summary + Save artifacts

In [17]:
import os
import pandas as pd

summary = pd.DataFrame([
    {"approach": "feature_extraction_freeze_all", "accuracy": fe_metrics["eval_accuracy"], "f1": fe_metrics["eval_f1"], "loss": fe_metrics["eval_loss"]},
    {"approach": "fine_tune_last_layer",          "accuracy": ft_metrics["eval_accuracy"], "f1": ft_metrics["eval_f1"], "loss": ft_metrics["eval_loss"]},
])

display(summary)

os.makedirs("artifacts_transfer_learning", exist_ok=True)
summary.to_csv("artifacts_transfer_learning/tl_results.csv", index=False)

!zip -r artifacts_transfer_learning.zip artifacts_transfer_learning
print("Done: artifacts_transfer_learning.zip created")

Unnamed: 0,approach,accuracy,f1,loss
0,feature_extraction_freeze_all,0.666667,0.8,0.638287
1,fine_tune_last_layer,0.333333,0.0,0.710488


  adding: artifacts_transfer_learning/ (stored 0%)
  adding: artifacts_transfer_learning/tl_results.csv (deflated 31%)
Done: artifacts_transfer_learning.zip created


- Observation: Feature extraction performed better (Acc=0.67, F1=0.80) than fine-tuning last layer (Acc=0.33, F1=0.00) on this tiny dataset.

- Reason: With very small data, fine-tuning even one transformer layer can overfit or become unstable, while freezing the base keeps representations general.

- Conclusion: For low-data / low-resource settings, feature extraction is often safer; fine-tuning typically needs more data, more epochs, and careful hyperparameters.