# 1. Import Required Libraries

In [None]:
import torch
import transformers
import datasets
import accelerate

In [None]:
import warnings
# Install a global "ignore" filter so no Python warning is shown for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries imported above;
# consider narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

# 2. Load the IMDB Dataset (warnings are suppressed in the cell above)

In [4]:
import pandas as pd
from datasets import Dataset

# Download the IMDB reviews CSV from GitHub and parse it into a DataFrame.
data = pd.read_csv(
    'https://raw.githubusercontent.com/laxmimerit/All-CSV-ML-Data-Files-Download/master/IMDB-Dataset.csv'
)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# 3. Convert to a Hugging Face Dataset and Train-Test Split

In [5]:
# Wrap the DataFrame in a Hugging Face Dataset and hold out 30% of the rows for evaluation.
# The fixed seed pins the shuffle, so the split (and every metric computed from it) is
# identical after a kernel restart instead of changing on each run.
dataset = Dataset.from_pandas(data)
dataset = dataset.train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

# 4. Check Class Distribution

In [6]:
# Count how many rows of the full DataFrame carry each sentiment value (class-balance check).
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# 6. Encode Labels

In [7]:
# Sentiment string -> integer class id, plus the reverse lookup derived from the same mapping
# so the two dictionaries cannot drift apart.
label2id = {'negative': 0, 'positive': 1}
id2label = {class_id: name for name, class_id in label2id.items()}

# Add an integer 'label' column computed from each example's 'sentiment' string.
dataset = dataset.map(
    lambda example: {'label': label2id[example['sentiment']]}
)

Map: 100%|██████████| 35000/35000 [00:02<00:00, 11773.67 examples/s]
Map: 100%|██████████| 15000/15000 [00:01<00:00, 12074.18 examples/s]


In [8]:
# Display the first training example, which now includes the integer 'label' field.
dataset['train'][0]

{'review': 'I saw this film numerous times in the late 60\'s/early 70\'s whenever it reared it\'s head like a reindeer with rabies every November-December as a Saturday matinée kiddie show.It was always stiff competition for THE CHRSTMAS THAT ALMOST WASN\'T (oops-can I SAY "Christmas"?), perhaps the greatest,most iconic Christmas-season film of all time.But that\'s another review.<br /><br />At the time,I marveled that the on-screen tint of SANTA CLAUS was almost "pink and white", so much had the color of the sprocket-torn prints changed color.<br /><br />The film is kinda creepy! I thought so then--and still do, actually. I was highly entertained then, as I still am! It\'s amusing in a "retarted-elf" sort of way. By the way,the image quality looks much better on the DVD I have now than it did in the theater, circa 1969-74.<br /><br />If you are expecting maybe "the lost RANKIN-BASS Christmas special-forget it! If you want FELLINI DOES Christmas--read on...<br /><br />By nature, the du

# 7. Set Up Tokenizer

In [9]:
from transformers import AutoTokenizer
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained checkpoint name; the tokenizer is loaded from it here.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
tokenizer = AutoTokenizer.from_pretrained(
    model_ckpt,
    use_fast=True,
)

# 8. Tokenize Dataset

In [10]:
# Reviews longer than this many tokens are truncated.
MAX_LENGTH = 300


def tokenize(batch):
    """Tokenize the 'review' texts of a batch of examples.

    Args:
        batch: mapping with a 'review' entry holding the text(s) to encode
            (a list of strings when called via ``Dataset.map(batched=True)``).

    Returns:
        The tokenizer output for those texts, truncated to ``MAX_LENGTH`` tokens.

    No padding is applied here on purpose: the Trainer in this notebook is given
    the tokenizer and pads each mini-batch to its own longest sequence at
    collation time, so padding every review to the global maximum up front only
    wastes memory and compute.
    """
    return tokenizer(batch['review'], truncation=True, max_length=MAX_LENGTH)


# Without up-front padding there is no need to push the whole split through as a
# single batch, so the default batch size of Dataset.map is used.
dataset = dataset.map(tokenize, batched=True)

Map: 100%|██████████| 35000/35000 [00:25<00:00, 1356.30 examples/s]
Map: 100%|██████████| 15000/15000 [00:07<00:00, 2026.17 examples/s]


In [11]:
# List the fields of a training example after tokenization to confirm the new columns exist.
dataset['train'][0].keys()

dict_keys(['review', 'sentiment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'])

# 9. Define Metrics

In [None]:
import evaluate
import numpy as np

# Accuracy metric object loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predictions are reduced
            with an argmax over axis 1 to obtain one class id per row.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted ids and the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

# 10. Load Model

In [13]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint as a sequence classifier with one output per entry in label2id,
# and store both label mappings in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 11. Configure Training

In [14]:
import inspect

# transformers renamed `evaluation_strategy` -> `eval_strategy` (TrainingArguments) and
# `tokenizer` -> `processing_class` (Trainer). The old names are deprecated, so pick
# whichever keyword the installed version actually accepts instead of hard-coding one.
_args_params = inspect.signature(TrainingArguments.__init__).parameters
_trainer_params = inspect.signature(Trainer.__init__).parameters
eval_strategy_kw = 'eval_strategy' if 'eval_strategy' in _args_params else 'evaluation_strategy'
tokenizer_kw = 'processing_class' if 'processing_class' in _trainer_params else 'tokenizer'

# Hyper-parameters for fine-tuning; checkpoints are written under 'train_dir'.
args = TrainingArguments(
    output_dir='train_dir',
    overwrite_output_dir=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    **{eval_strategy_kw: 'epoch'},  # run evaluation at the end of every epoch
)

# Train on the 'train' split, evaluate on the 'test' split, and report accuracy
# through compute_metrics.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
    **{tokenizer_kw: tokenizer},  # hands the tokenizer to the Trainer under the supported name
)

# 12. Train Model

In [15]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

 15%|█▌        | 500/3282 [03:32<20:17,  2.29it/s]

{'loss': 0.4625, 'grad_norm': 17.964859008789062, 'learning_rate': 1.695307739183425e-05, 'epoch': 0.46}


 30%|███       | 1000/3282 [09:34<31:06,  1.22it/s]

{'loss': 0.3519, 'grad_norm': 13.26689338684082, 'learning_rate': 1.3906154783668494e-05, 'epoch': 0.91}


                                                   
 33%|███▎      | 1094/3282 [13:07<27:38,  1.32it/s]

{'eval_loss': 0.3114200234413147, 'eval_accuracy': 0.8658666666666667, 'eval_runtime': 135.0101, 'eval_samples_per_second': 111.103, 'eval_steps_per_second': 3.474, 'epoch': 1.0}


 46%|████▌     | 1500/3282 [18:43<24:33,  1.21it/s]   

{'loss': 0.3062, 'grad_norm': 22.937177658081055, 'learning_rate': 1.0859232175502743e-05, 'epoch': 1.37}


 61%|██████    | 2000/3282 [25:34<17:16,  1.24it/s]

{'loss': 0.2882, 'grad_norm': 9.670245170593262, 'learning_rate': 7.81230956733699e-06, 'epoch': 1.83}


                                                   
 67%|██████▋   | 2188/3282 [30:15<13:21,  1.36it/s]

{'eval_loss': 0.30997201800346375, 'eval_accuracy': 0.8687333333333334, 'eval_runtime': 129.4258, 'eval_samples_per_second': 115.897, 'eval_steps_per_second': 3.624, 'epoch': 2.0}


 76%|███████▌  | 2500/3282 [34:28<10:31,  1.24it/s]   

{'loss': 0.2726, 'grad_norm': 10.428123474121094, 'learning_rate': 4.765386959171238e-06, 'epoch': 2.29}


 91%|█████████▏| 3000/3282 [41:15<03:43,  1.26it/s]

{'loss': 0.2507, 'grad_norm': 8.650588035583496, 'learning_rate': 1.7184643510054846e-06, 'epoch': 2.74}


                                                   
100%|██████████| 3282/3282 [47:21<00:00,  1.16it/s]

{'eval_loss': 0.2992258071899414, 'eval_accuracy': 0.8818666666666667, 'eval_runtime': 137.2786, 'eval_samples_per_second': 109.267, 'eval_steps_per_second': 3.416, 'epoch': 3.0}
{'train_runtime': 2841.0051, 'train_samples_per_second': 36.959, 'train_steps_per_second': 1.155, 'train_loss': 0.31575205847666366, 'epoch': 3.0}





TrainOutput(global_step=3282, training_loss=0.31575205847666366, metrics={'train_runtime': 2841.0051, 'train_samples_per_second': 36.959, 'train_steps_per_second': 1.155, 'total_flos': 882184338000000.0, 'train_loss': 0.31575205847666366, 'epoch': 3.0})

# 13. Evaluate Model

In [16]:
# Evaluate the trained model on the Trainer's eval_dataset (the 'test' split).
trainer.evaluate()

100%|██████████| 469/469 [02:10<00:00,  3.61it/s]


{'eval_loss': 0.2992258071899414,
 'eval_accuracy': 0.8818666666666667,
 'eval_runtime': 130.3105,
 'eval_samples_per_second': 115.11,
 'eval_steps_per_second': 3.599,
 'epoch': 3.0}

# 14. Save Model

In [17]:
# Save the fine-tuned model under 'tinybert-sentiment-analysis'; the inference cell
# further down loads it back from this same path.
trainer.save_model('tinybert-sentiment-analysis')

# 15. Test Predictions

In [2]:
# Three hand-written reviews passed to the classifier in the next cell.
data = [
    'this movie was horrible, the plot was really boring. acting was okay',
    'the movie is really sucked. there is not plot and acting was bad',
    'what a beautiful movie. great plot. acting was good. will see it again',
]

In [3]:
from transformers import pipeline
import torch

# Run inference on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build a text-classification pipeline from the model saved under
# 'tinybert-sentiment-analysis'.
classifier = pipeline(
    'text-classification',
    model='tinybert-sentiment-analysis',
    device=device,
)

# Classify the sample reviews; the list of label/score dicts is the cell output.
classifier(data)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


[{'label': 'negative', 'score': 0.9905874133110046},
 {'label': 'negative', 'score': 0.9908849596977234},
 {'label': 'positive', 'score': 0.9896246790885925}]