## Import

In [1]:
!pip install kaggle

from google.colab import files
files.upload()

# Make a directory named kaggle and copy kaggle.json into it
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the IMDB dataset (replace with the actual dataset URL from Kaggle)
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

# Unzip the downloaded dataset
!unzip imdb-dataset-of-50k-movie-reviews.zip



Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 58% 15.0M/25.7M [00:00<00:00, 76.4MB/s]
100% 25.7M/25.7M [00:00<00:00, 103MB/s] 
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [2]:
import tensorflow as tf
# Check GPU availability
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

2.17.1
Num GPUs Available:  1


## Data Preprocessing

In [3]:
import tensorflow as tf
import pandas as pd
import numpy as np
from transformers import BertTokenizer

# Check GPU availability
print(tf.__version__)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Import Dataset
df=pd.read_csv("IMDB Dataset.csv")

seq_len=512
num_samples=len(df)

# 'np'=numpy,'tf'=tensorflow,'pt'=pytorch
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased')
tokens=tokenizer(df['review'].tolist(),
                 max_length=seq_len,
                 padding=True,
                 truncation=True,
                 return_tensors='np',
                 add_special_tokens=True)
print("Tokens keys: ", tokens.keys())
print("Tokens shape: ", tokens['input_ids'].shape)

2.17.1
Num GPUs Available:  1


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokens keys:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
Tokens shape:  (50000, 512)


In [4]:
print("Saving Tokens...")
# Save Tokens into Numpy Binary format
np.save('movie_xids.npy', tokens['input_ids'])
np.save('movie_xmasks.npy', tokens['attention_mask'])
np.save('movie_type_xids.npy', tokens['token_type_ids'])

# Save Memory
del tokens

print("Saving Labels after mapping...")
# One Hot Encoding labels
df['sentiment'].unique()
labels=np.array(df['sentiment'].map({'positive': 1, 'negative': 0}))
np.save('labels.npy', labels)

# # Test Removing HTML Tags, Handling Emojis, Removing Excess Whitespace
# df['review'] = df['review'].str.replace(r'<.*?>', '', regex=True)
# df['review'] = df['review'].apply(lambda x: x.encode('ascii', 'ignore').decode('ascii'))
# df['review'] = df['review'].str.strip().str.replace(r'\s+', ' ', regex=True)

Saving Tokens...
Saving Labels after mapping...


In [5]:
import numpy as np
import tensorflow as tf

with open('movie_xids.npy', 'rb') as f:
  xids=np.load(f, allow_pickle=True)
with open('movie_xmasks.npy', 'rb') as f:
  xmasks=np.load(f, allow_pickle=True)
with open('labels.npy', 'rb') as f:
  labels=np.load(f, allow_pickle=True)

# Create TF dataset
dataset=tf.data.Dataset.from_tensor_slices((xids, xmasks, labels))
print("\nElement Spec after TF dataset: ", dataset.take(1))

# Rearrange dataset format
def map_func(input_ids, attention_masks, labels):
  return {'input_ids': input_ids, 'attention_mask': attention_masks}, labels

dataset=dataset.map(map_func)
print("\nElement Spec after rearrangement: ", dataset.take(1))

# Dataset Shuffling, Batch, Split and Save
BATCH_SIZE=8
dataset=dataset.shuffle(10000).batch(BATCH_SIZE, drop_remainder=True)

SPLIT=0.8
SIZE=int((xids.shape[0]/BATCH_SIZE)*SPLIT)
train_dataset=dataset.take(SIZE)
val_dataset=dataset.skip(SIZE)

# Save file, `use tf.data.Dataset.save(...) instead`
tf.data.experimental.save(train_dataset, 'train_dataset')
tf.data.experimental.save(val_dataset, 'val_dataset')
print("\nTrain and Val Dataset saved")

print("\nTrain Element Spec: ", train_dataset.element_spec)
print("\nVal Element Spec: ", val_dataset.element_spec)

Instructions for updating:
Use `tf.data.Dataset.save(...)` instead.



Element Spec after TF dataset:  <_TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int64, name=None), TensorSpec(shape=(512,), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

Element Spec after rearrangement:  <_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int64, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

Train and Val Dataset saved

Train Element Spec:  ({'input_ids': TensorSpec(shape=(8, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(8,), dtype=tf.int64, name=None))

Val Element Spec:  ({'input_ids': TensorSpec(shape=(8, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(8, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(8,), dtype=tf.int64, name=None))


## Model Building and Training

In [14]:
import tensorflow as tf
from transformers import TFAutoModel

class BertSentimentModel(tf.keras.Model):
    def __init__(self, bert_model='bert-base-uncased', max_len=512):
        super().__init__()
        self.bert = TFAutoModel.from_pretrained(bert_model)
        self.dense1 = tf.keras.layers.Dense(1024, activation='relu')
        self.dense2 = tf.keras.layers.Dense(1, activation='sigmoid', name='outputs')

    def call(self, inputs):
        input_ids, attention_mask = inputs['input_ids'], inputs['attention_mask']

        # Get BERT embeddings
        bert_outputs = self.bert(input_ids=input_ids,
                               attention_mask=attention_mask,
                               return_dict=True)

        # Use the pooled output for classification
        pooled_output = bert_outputs.pooler_output

        # Dense layers
        x = self.dense1(pooled_output)
        return self.dense2(x)

# Create and compile model
def create_model():
    # Initialize model
    model = BertSentimentModel()

    # Freeze BERT layers
    model.bert.trainable = False

    # Compile model
    # Using standard Adam optimizer instead of legacy version
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
    loss = tf.keras.losses.BinaryCrossentropy()
    metrics = [tf.keras.metrics.BinaryAccuracy('accuracy')]

    model.compile(optimizer=optimizer,
                 loss=loss,
                 metrics=metrics)

    return model

# Create model instance
model = create_model()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [15]:
# Load datasets
val_ds = tf.data.experimental.load('val_dataset')
train_ds = tf.data.experimental.load('train_dataset', element_spec=val_ds.element_spec)

# Create and compile model
model = create_model()

# Train model
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=2  # Adjust as needed
)

Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoi

Epoch 1/2
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1848s[0m 366ms/step - accuracy: 0.6788 - loss: 0.6151 - val_accuracy: 0.7698 - val_loss: 0.5136
Epoch 2/2
[1m5000/5000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1881s[0m 368ms/step - accuracy: 0.7765 - loss: 0.4977 - val_accuracy: 0.7982 - val_loss: 0.4599

Saving Model...


ValueError: Invalid filepath extension for saving. Please add either a `.keras` extension for the native Keras format (recommended) or a `.h5` extension. Use `model.export(filepath)` if you want to export a SavedModel for use with TFLite/TFServing/etc. Received: filepath=BERT-uncased-model.

## Save model into zip file, then save into local disk

In [17]:
import os
import zipfile

# Save the model
print("\nSaving Model...")
model.save('BERT-uncased-model.keras')

# Compress the model into a .zip file
model_filename = 'BERT-uncased-model.keras'
zip_filename = 'BERT-uncased-model.zip'

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(model_filename)

print(f"Model saved and compressed as {zip_filename}")

Model saved and compressed as BERT-uncased-model.zip


In [18]:
from google.colab import files

files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Make Predictions on test file, and on user's input

In [21]:
def prep_data(text):
    tokens=tokenizer.encode_plus(text,
                                 max_length=512,
                                 truncation=True,
                                 padding='max_length',
                                 add_special_token=True,
                                 return_tensors='np')
    # Return `int32` tensors by default, use `tf.cast` to return `float64`
    return {'input_ids':tf.cast(tokens['input_ids'], tf.int64),
            'attention_mask':tf.cast(tokens['attention_mask'], tf.int64)}

sentence = "I loved the movie"
probs = model.predict(prep_data(sentence))[0]
print('Probability: \n', probs)

# pd.set_option('display.max_colwidth', None)

Keyword arguments {'add_special_token': True} not recognized.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13s/step
Probability: 
 [0.48595783]


In [29]:
import tensorflow as tf
import numpy as np
from transformers import BertTokenizer

def predict_sentiment(model, text, tokenizer, seq_len=512):
    """
    Make sentiment predictions on new text input.
    Returns prediction label (positive/negative) instead of probabilities.

    Args:
        model: Trained BertSentimentModel instance
        text: String or list of strings to analyze
        tokenizer: BERT tokenizer
        seq_len: Maximum sequence length

    Returns:
        List of predictions ("positive" or "negative")
    """
    # Handle single string input
    if isinstance(text, str):
        text = [text]

    # Tokenize input text
    tokens = tokenizer(text,
                      max_length=seq_len,
                      padding=True,
                      truncation=True,
                      return_tensors='np',
                      add_special_tokens=True)

    # Create batch of size matching the input
    batch_size = len(text)
    dataset = tf.data.Dataset.from_tensor_slices((
        tokens['input_ids'],
        tokens['attention_mask']
    )).batch(batch_size)

    # Format input and get predictions
    for batch in dataset:
        input_ids, attention_mask = batch
        model_input = {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
        predictions = model(model_input, training=False)

    # Convert probabilities to labels
    predictions = predictions.numpy()
    labels = ["positive" if pred >= 0.5 else "negative" for pred in predictions]

    return labels

def evaluate_model(model, dataset):
    """
    Evaluate model performance on a validation/test dataset.

    Args:
        model: Trained BertSentimentModel instance
        dataset: TensorFlow dataset in the format created earlier

    Returns:
        Dictionary containing metrics (accuracy, precision, recall, f1)
    """
    # Use model.evaluate() which is much faster than manual prediction
    results = model.evaluate(dataset, verbose=1)

    # Get metrics based on your model's compiled metrics
    metrics = {
        'loss': results[0],
        'accuracy': results[1]
    }

    # If you need additional metrics, you can compute them separately
    y_pred = []
    y_true = []

    # Use model() instead of predict() and process in batches
    for batch in dataset:
        x, y = batch
        pred = model(x, training=False)
        y_pred.extend(pred.numpy() >= 0.5)
        y_true.extend(y.numpy())

    # Convert to numpy arrays for metric calculation
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)

    # Calculate additional metrics
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))

    precision = tp / (tp + fp + 1e-7)
    recall = tp / (tp + fn + 1e-7)
    f1 = 2 * (precision * recall) / (precision + recall + 1e-7)

    metrics.update({
        'precision': precision,
        'recall': recall,
        'f1': f1
    })

    return metrics

# For making predictions:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
example_text = ["This movie was fantastic!", "I really hated this film."]
predictions = predict_sentiment(model, example_text, tokenizer)

# # Example usage:
# """
# # For making predictions:
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# example_text = ["This movie was fantastic!", "I really hated this film."]
# predictions = predict_sentiment(model, example_text, tokenizer)

# # For evaluation:
# metrics = evaluate_model(model, val_ds)
# print("Validation Metrics:", metrics)
# """

# Print predictions
print("\nPredictions:")
for text, pred in zip(example_text, predictions):
    print(f"Text: '{text}'")
    print(f"Sentiment: {pred}\n")

# # For evaluation:
# print("Evaluating model on validation dataset...")
# metrics = evaluate_model(model, val_ds)
# print("\nValidation Metrics:")
# for metric_name, value in metrics.items():
#     print(f"{metric_name}: {value:.4f}")


Predictions:
Text: 'This movie was fantastic!'
Sentiment: positive

Text: 'I really hated this film.'
Sentiment: negative

