# Sentiment Analysis Using TinyBERT and AWS MLOps

## Load Data



In [1]:
!pip install evaluate
!pip install transformers datasets evaluate accelerate
print("finish", end = '\r')

finish

In [2]:
import pandas as pd
from datasets import Dataset

# Locations of the three CSV splits, in the order train / test / validation.
csv_paths = (
    "/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv",
    "/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv",
    "/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv",
)

# Read each split into its own DataFrame.
train_data, test_data, valid_data = (pd.read_csv(path) for path in csv_paths)

# Cell output: the (rows, columns) shape of every split.
tuple(frame.shape for frame in (train_data, test_data, valid_data))

((40000, 2), (5000, 2), (5000, 2))

In [3]:
# Preview the first five rows of the training split.
train_data.head(n=5)

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
# Wrap each pandas split in a Hugging Face `Dataset`.
train_dataset, test_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data, valid_data)
)


In [5]:
from datasets import DatasetDict

# Bundle the three splits into one DatasetDict keyed by split name.
dataset = DatasetDict(
    {
        'train': train_dataset,
        'test': test_dataset,
        'valid': valid_dataset,
    }
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [6]:
 # Count the training rows per value of the `label` column.
 train_data['label'].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

In [7]:
# Readable name for each integer class id found in the `label` column.
id2label = {0: 'negative', 1: 'positive'}
# Inverse lookup, derived from id2label so the two tables cannot drift apart
# (for example through a misspelled key).
label2id = {name: idx for idx, name in id2label.items()}

In [8]:
def add_sentiment(example):
    """Return the `sentiment` field for one example.

    Looks up the example's integer `label` in the notebook-level `id2label`
    mapping (defined in the preceding cell) and returns it under the key
    `sentiment`, which `Dataset.map` adds to the example.
    """
    return {'sentiment': id2label[example['label']]}


# Add the readable label to every split; `id2label` is reused, not redefined.
dataset = dataset.map(add_sentiment)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [9]:
# Display the first training example, which now includes the `sentiment` field.
dataset['train'][0]

{'text': 'I grew up (b. 1965) watching and loving the Thunderbirds. All my mates at school watched. We played "Thunderbirds" before school, during lunch and after school. We all wanted to be Virgil or Scott. No one wanted to be Alan. Counting down from 5 became an art form. I took my children to see the movie hoping they would get a glimpse of what I loved as a child. How bitterly disappointing. The only high point was the snappy theme tune. Not that it could compare with the original score of the Thunderbirds. Thankfully early Saturday mornings one television channel still plays reruns of the series Gerry Anderson and his wife created. Jonatha Frakes should hand in his directors chair, his version was completely hopeless. A waste of film. Utter rubbish. A CGI remake may be acceptable but replacing marionettes with Homo sapiens subsp. sapiens was a huge error of judgment.',
 'label': 0,
 'sentiment': 'negative'}

## Data Tokenization

In [10]:
from transformers import AutoTokenizer
import torch 

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Name of the pretrained checkpoint used for both tokenizer and model.
model_ckpt = "huawei-noah/TinyBERT_General_4L_312D"

# Load the tokenizer that belongs to that checkpoint (fast implementation requested).
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)

cuda


In [11]:
def tokenize(batch):
    """Tokenize the `text` field of a batch of examples.

    Sequences longer than 300 tokens are truncated, and shorter ones are
    padded to the longest sequence in the batch. Returns the tokenizer's
    encoding for the batch.
    """
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=300)


# batch_size=None passes each split to `tokenize` as one single batch, so all
# examples of a split end up padded to the same length.
dataset = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [12]:
# List the fields of one training example after tokenization.
dataset['train'][0].keys()

dict_keys(['text', 'label', 'sentiment', 'input_ids', 'token_type_ids', 'attention_mask'])

## Model Evaluation Functions



In [13]:
import evaluate
import numpy as np  # must be bound as `np`: compute_metrics calls np.argmax

# Accuracy metric object used by compute_metrics below.
accuracy = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation prediction.

    Args:
        eval_pred: a (predictions, labels) pair. Predictions are assumed to be
            a 2-D array of per-class scores, one row per example — confirm
            against the Trainer's output.

    Returns:
        Whatever `accuracy.compute` returns for the predicted class ids
        versus the reference labels.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=labels)
    

2025-09-23 18:51:16.845947: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758653476.864300     172 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758653476.869613     172 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Model Building 

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint with a two-class classification head, passing both label mappings.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Display the model's repr as the cell output to inspect its layer structure.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 312, padding_idx=0)
      (position_embeddings): Embedding(512, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-1

In [19]:
from transformers import TrainingArguments, Trainer
import numpy as np

# Settings for the fine-tuning run, grouped by purpose.
args = TrainingArguments(
    # Output location for the run.
    output_dir='train_dir',
    overwrite_output_dir=True,
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate once per epoch.
    eval_strategy='epoch',
    # Logging and progress reporting.
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",
    disable_tqdm=False,
)

# Train on the 'train' split and evaluate on the 'valid' split.
trainer = Trainer(
    args=args,
    model=model,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=dataset['train'],
    eval_dataset=dataset['valid'],
)

# Run training; the returned TrainOutput is shown as the cell output.
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy
1,0.2455,0.316064,0.874
2,0.2208,0.300864,0.8808
3,0.2267,0.289096,0.8808




TrainOutput(global_step=1875, training_loss=0.2677478221893311, metrics={'train_runtime': 405.3815, 'train_samples_per_second': 296.017, 'train_steps_per_second': 4.625, 'total_flos': 1008210672000000.0, 'train_loss': 0.2677478221893311, 'epoch': 3.0})

In [20]:
# Evaluate with the trainer's configured eval dataset (the 'valid' split passed above);
# the returned metrics dict is shown as the cell output.
# NOTE(review): the 'test' split is not scored anywhere in the visible cells.
trainer.evaluate()



{'eval_loss': 0.2890959084033966,
 'eval_accuracy': 0.8808,
 'eval_runtime': 6.1522,
 'eval_samples_per_second': 812.719,
 'eval_steps_per_second': 12.841,
 'epoch': 3.0}

In [21]:
# Print a marker once the training and evaluation cells have run.
print("Finished Training ")

Finished Training 


## Model Save and Load for Inference

In [23]:
import shutil

# --- Write the trained model and its tokenizer into one folder ---
save_dir = "tiny_bert_sentiment_analysis"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print(f"✅ Model and tokenizer saved in: {save_dir}")

# --- Pack that folder into <save_dir>.zip ---
zip_filename = f"{save_dir}.zip"
shutil.make_archive(base_name=save_dir, format='zip', root_dir=save_dir)
print(f"✅ Model zipped at: {zip_filename}")

# --- Show a link to the archive in the notebook output ---
from IPython.display import FileLink
display(FileLink(zip_filename))

✅ Model and tokenizer saved in: tiny_bert_sentiment_analysis
✅ Model zipped at: tiny_bert_sentiment_analysis.zip
