In [2]:
import pandas as pd
from datasets import Dataset

# Location of the IMDB reviews CSV (columns: review, sentiment)
IMDB_CSV_URL = 'https://github.com/Anmol-Shrestha/All-DATASETS/raw/refs/heads/main/IMDB-Dataset.csv'

# Download the CSV into a DataFrame and preview its first rows
data = pd.read_csv(filepath_or_buffer=IMDB_CSV_URL)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Wrap the pandas DataFrame in a `datasets.Dataset` so the later `.map` /
# `.train_test_split` calls can operate on it.
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['review', 'sentiment'],
    num_rows: 50000
})

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed pins the shuffle so
# that a fresh kernel reproduces the same train/test partition (and therefore
# comparable metrics) on every run.
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

In [5]:
# Class-balance check: number of reviews per sentiment value in the raw DataFrame
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [6]:
# Sentiment string -> integer class id used as the training target
label2id = dict(negative=0, positive=1)

# Integer class id -> sentiment string, derived from label2id so the two
# lookups always mirror each other
id2label = {class_id: name for name, class_id in label2id.items()}

def add_label(example):
    """Return the integer class id for one row, looked up from its 'sentiment' string."""
    return {'label': label2id[example['sentiment']]}


# Attach a numeric 'label' field to every row (e.g. 'positive' -> 1)
dataset = dataset.map(add_label)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [7]:
# Inspect the first training row to confirm the 'label' field was added
dataset['train'][0]

{'review': 'During WW2 in the Philipines, Japanese soldiers are starving, dying, growing weak, and becoming more and more insane. A small group of soldiers, trying to stay alive, have eventually resorted to cannibalism. This film perfectly portrays the insanity that overtakes people under extreme conditions. There are a few humorous parts in this movie, but the majority of it is just a very slow moving and realistic film. It follows these soldiers from one painful moment to another, and eventually to death. A very interesting film, showing the death, and the horror, of what may have been the worst war the world has ever seen.',
 'sentiment': 'positive',
 'label': 1}

### Data Tokenization

In [8]:
# Tokenize the reviews

from transformers import AutoTokenizer
import torch

# Use the GPU when torch reports one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# TinyBERT checkpoint: 4 encoder layers, 312 dimensions,
# maximum context length of 512 tokens
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
    use_fast=True,
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [9]:
def tokenize(batch):
    """Encode the 'review' texts of a batch with the module-level `tokenizer`.

    Truncation is enabled with a cap of 300 tokens, and padding is enabled so
    that the encoded sequences of the batch end up with equal length.

    Args:
        batch: mapping whose 'review' entry holds the texts to encode.

    Returns:
        Whatever `tokenizer` returns for those texts.
    """
    encode_options = dict(padding=True, truncation=True, max_length=300)
    return tokenizer(batch['review'], **encode_options)

# Tokenize every split in batched mode. NOTE(review): batch_size=None is
# presumably meant to hand each split to `tokenize` as one batch, so that
# padding=True yields one uniform length per split — confirm against the
# `datasets` map documentation.
dataset = dataset.map(tokenize, batched=True, batch_size=None)


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [10]:
# Confirm the tokenizer outputs were added as new fields on the train split
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

In [11]:
# Same check for the test split
dataset['test'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

### Evaluation

In [12]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [13]:
import evaluate
import numpy as np
# Load the "accuracy" metric once; `compute_metrics` below reuses this object
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: a pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max (over the last axis)
        of the logits, measured against ``labels``.
    """
    raw_scores, gold = eval_pred
    # The highest-scoring class along the last axis is the predicted label
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold, predictions=predicted_ids)

Downloading builder script: 0.00B [00:00, ?B/s]

### Model Building

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# TrainingArguments (imported above) holds the configuration for Trainer.
# AutoModelForSequenceClassification loads the checkpoint with a classification
# head sized to the number of labels; both label lookups are passed along too.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Training configuration: 3 epochs at a learning rate of 2e-5, batches of 32
# per device for both training and evaluation, with an evaluation pass at the
# end of every epoch. Outputs are written to 'train_dir' (overwriting existing
# contents) and reporting integrations are disabled.
args = TrainingArguments(output_dir='train_dir',
                         report_to="none",
                         overwrite_output_dir=True,
                         num_train_epochs=3,
                         learning_rate=2e-5,
                         per_device_train_batch_size=32,
                         per_device_eval_batch_size=32,
                         eval_strategy="epoch")

# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated and emits a FutureWarning on construction.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    processing_class=tokenizer)

  trainer = Trainer(


In [16]:
# Run fine-tuning with the configuration held by `args`
trainer.train()

model.safetensors:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.359,0.31202,0.864467
2,0.2987,0.282694,0.878933
3,0.2581,0.281561,0.882533


TrainOutput(global_step=3282, training_loss=0.3211383552249069, metrics={'train_runtime': 518.7128, 'train_samples_per_second': 202.424, 'train_steps_per_second': 6.327, 'total_flos': 882184338000000.0, 'train_loss': 0.3211383552249069, 'epoch': 3.0})

In [17]:
# Evaluate the trained model on the eval_dataset given to the Trainer (the test split)
trainer.evaluate()

{'eval_loss': 0.2815611660480499,
 'eval_accuracy': 0.8825333333333333,
 'eval_runtime': 28.4946,
 'eval_samples_per_second': 526.415,
 'eval_steps_per_second': 16.459,
 'epoch': 3.0}

### Save Model

In [22]:
# Persist the fine-tuned model under this path; the inference pipeline further
# down loads it back by the same name.
trainer.save_model('tinybert-sentiment-analysis')

In [20]:
# Sample texts for a quick check of the saved model; the final entry is an
# empty string. NOTE: this rebinds `data`, which earlier in the notebook held
# the reviews DataFrame.
data = [
    'This movie is not good',
    'I am disappointed in the plot',
    'this is boring',
    '',
]

### Production Code Snippet
The same code snippet can be used in a production API.

In [21]:
from transformers import pipeline
import torch

# Use the GPU when torch reports one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the saved model directory
classifier = pipeline(
    task='text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

# Classify every sample text in a single call
classifier(data)

Device set to use cuda


[{'label': 'negative', 'score': 0.9846214056015015},
 {'label': 'negative', 'score': 0.9874770641326904},
 {'label': 'negative', 'score': 0.9882628321647644},
 {'label': 'negative', 'score': 0.5484742522239685}]