<a href="https://colab.research.google.com/github/ChintPatel/CMPE258-HW8/blob/main/HW9_Transformers_and_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [25]:
!pip install --quiet tensorflow tensorflow-text tensorflow-hub transformers datasets keras-nlp

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.5/19.5 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m
[?25h

1. Inference with a Pretrained Classifier

In [26]:
from transformers import pipeline

# — Text Generation (GPT-2) —
gen = pipeline("text-generation", model="gpt2")
print("=== Text Generation ===")
print(gen("In a world where AI rules,", max_length=50, do_sample=True)[0]["generated_text"])

# — Sentiment Analysis (DistilBERT) —
clf = pipeline("sentiment-analysis")
print("\n=== Sentiment Analysis ===")
for txt in ["I love this movie!", "This was the worst book I’ve read."]:
    print(f"{txt} ->", clf(txt)[0])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


=== Text Generation ===


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In a world where AI rules, you'd be forgiven for thinking this scenario isn't realistic. There are still big, powerful AI agents, and some of the most well-known are:

The US Department of Justice (DOJ) The


Device set to use cpu



=== Sentiment Analysis ===
I love this movie! -> {'label': 'POSITIVE', 'score': 0.9998775720596313}
This was the worst book I’ve read. -> {'label': 'NEGATIVE', 'score': 0.9997784495353699}


2. Fine-tuning a Pretrained Backbone (DistilBERT → IMDb)

In [27]:
# Cell 2: Load IMDb via TensorFlow Datasets instead of 🤗datasets
import tensorflow_datasets as tfds
from transformers import DistilBertTokenizerFast

# 1. Download & split
(raw_train, raw_test), ds_info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True
)

# 2. Prepare tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
max_length = 128

print(f"✅ Loaded IMDb: {ds_info.splits['train'].num_examples} train, "
      f"{ds_info.splits['test'].num_examples} test")


✅ Loaded IMDb: 25000 train, 25000 test


Build & Train Your Own Transformer from Scratch

In [28]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from tensorflow.keras import layers

# 1. Load & split IMDb from TFDS
all_train = tfds.load('imdb_reviews', split='train', as_supervised=True)
all_train = all_train.shuffle(10_000, seed=42)
train_ds  = all_train.take(5000)
val_ds    = all_train.skip(5000).take(2000)

# 2. Text Vectorization layer
max_features  = 20_000
sequence_len  = 200
vectorize = layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_len
)
# Adapt on the text-only stream
vectorize.adapt(train_ds.map(lambda text, label: text))

# 3. Define a single Transformer block
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.attn       = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn        = keras.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])
        self.ln1        = layers.LayerNormalization()
        self.ln2        = layers.LayerNormalization()
        self.drop1      = layers.Dropout(rate)
        self.drop2      = layers.Dropout(rate)

    # Modified call method to accept *args and **kwargs
    def call(self, inputs, training=None, *args, **kwargs):
        attn_out = self.attn(inputs, inputs)
        # Pass training to Dropout layers explicitly
        attn_out = self.drop1(attn_out, training=training)
        out1     = self.ln1(inputs + attn_out)
        ffn_out  = self.ffn(out1)
        # Pass training to Dropout layers explicitly
        ffn_out  = self.drop2(ffn_out, training=training)
        return self.ln2(out1 + ffn_out)

# 4. Build the model
embed_dim = 64
num_heads = 2
ff_dim    = 64

inputs = layers.Input(shape=(sequence_len,), dtype='int64')
x = layers.Embedding(max_features, embed_dim)(inputs)
# The functional API should pass 'training' implicitly now
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x) # This Dropout layer will receive 'training' implicitly
x = layers.Dense(20, activation='relu')(x)
outputs = layers.Dense(1, activation='sigmoid')(x)

scratch_model = keras.Model(inputs, outputs)
scratch_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# 5. Prepare tf.data pipelines
def prepare(ds):
    return (
        ds
        .map(lambda text, label: (vectorize(text), label),
             num_parallel_calls=tf.data.AUTOTUNE)
        .batch(32)
        .prefetch(tf.data.AUTOTUNE)
    )

tf_train = prepare(train_ds)
tf_val   = prepare(val_ds)

# 6. Train!
scratch_model.fit(
    tf_train,
    validation_data=tf_val,
    epochs=3
)

Epoch 1/3
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 242ms/step - accuracy: 0.5730 - loss: 0.6813 - val_accuracy: 0.7705 - val_loss: 0.4517
Epoch 2/3
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 255ms/step - accuracy: 0.8368 - loss: 0.3728 - val_accuracy: 0.8755 - val_loss: 0.3188
Epoch 3/3
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 223ms/step - accuracy: 0.8991 - loss: 0.2549 - val_accuracy: 0.9080 - val_loss: 0.2289


<keras.src.callbacks.history.History at 0x7df88aa886d0>