In [1]:
!pip install transformers dataset




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
from datasets import load_dataset

# Load the IMDB dataset; the result is bound to `dataset` and used by the cells below.
dataset = load_dataset('imdb')

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 186553.80 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 293762.38 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 371894.81 examples/s]


In [2]:
# Display the loaded splits, their columns and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
# Inspect the column schema of the training split.
dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [5]:
# Look at the first training example.
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [6]:
# Look at the first example of the "unsupervised" split.
dataset["unsupervised"][0]

{'text': 'This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie "Leon" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please "Frankie Starlight", she\'s speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the "Point of no return" and especially the "La femme Nikita" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which "translate" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you\'ll regret later :)',
 'label': -1}

In [8]:
# Drop the "unsupervised" split so only the train and test splits remain.
# The membership check keeps this cell safe to re-run: a bare `del` raises
# KeyError once the split has already been removed.
if 'unsupervised' in dataset:
    del dataset['unsupervised']

In [9]:
# Display the remaining splits.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [10]:
from collections import Counter

# Tally how often each label value occurs in the training split
# to check the class balance.
label_count = Counter()
label_count.update(dataset['train']['label'])
print(label_count)

Counter({0: 12500, 1: 12500})


LOADING TOKENIZER

In [11]:
from transformers import DistilBertTokenizer

# Load the tokenizer for the 'distilbert-base-uncased' checkpoint;
# the same checkpoint name is used for the model further down.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [12]:
# Report the tokenizer's vocabulary size.
print('The vocabulary size is: ', tokenizer.vocab_size)

The vocabulary size is:  30522


In [13]:
# Report the tokenizer's `model_max_length` setting.
print('Maximum context size is: ', tokenizer.model_max_length)

Maximum context size is:  512


In [14]:
# Report the field names listed in the tokenizer's `model_input_names`.
print('Name of the fields, model need in the forward pass: ', tokenizer.model_input_names)

Name of the fields, model need in the forward pass:  ['input_ids', 'attention_mask']


In [15]:
# Tokenize one short sentence and display the resulting encoding.
inputs = tokenizer("I love to eat apple")
inputs

{'input_ids': [101, 1045, 2293, 2000, 4521, 6207, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [16]:
# Map the ids of the sample encoding back to their token strings.
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]', 'i', 'love', 'to', 'eat', 'apple', '[SEP]']

In [17]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to the tokenizer's maximum length,
    so all encoded rows have the same length.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded


# Apply the tokenizer to every split, passing examples in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 25000/25000 [01:32<00:00, 268.90 examples/s]
Map: 100%|██████████| 25000/25000 [01:35<00:00, 261.66 examples/s]


In [18]:
# Display the splits again to see the columns added by tokenization.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
})

In [19]:
# Have the datasets return torch tensors when indexed.
tokenized_datasets.set_format("torch")

# Rename the target column to "labels"; the training loop passes each batch
# straight to the model as keyword arguments and reads `outputs.loss`.
# The guard keeps the cell re-runnable: after the first run the "label"
# column no longer exists and an unguarded rename would fail.
if "label" in tokenized_datasets["train"].column_names:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [20]:
from torch.utils.data import DataLoader

# Training batches are reshuffled on every pass; test batches keep dataset order.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    batch_size=8,
)

In [21]:
# Sanity check: print the tensor shapes of the first training batch only.
for batch in train_dataloader:
    for field in ("input_ids", "attention_mask", "labels"):
        print(batch[field].shape)
    break

torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


TRAINING MODEL 

In [22]:
from transformers import DistilBertForSequenceClassification
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained checkpoint with a two-label classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [23]:
from torch.optim import AdamW

# Training hyperparameters.
num_epochs, learning_rate = 3, 2e-5

# AdamW over every parameter of the model.
optimizer = AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
# Fine-tune the model for `num_epochs` passes over the training DataLoader,
# printing the loss every 500 steps and the average loss per epoch.
for epoch in range(num_epochs):
    # Put the model in training mode at the start of each epoch.
    model.train()

    # Keep the running total as a plain Python float. Accumulating the loss
    # tensor itself would keep autograd history alive for every step and make
    # memory use grow over the epoch.
    total_training_loss = 0.0

    for step, batch in enumerate(train_dataloader):
        # Move the tensors to the training device; the raw "text" column is
        # not a model input and is dropped.
        batch = {k: v.to(device) for k, v in batch.items() if k != "text"}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Detach the scalar value from the graph before logging and accumulating.
        loss_value = loss.item()
        total_training_loss += loss_value
        if step % 500 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {step+1}/{len(train_dataloader)}, Loss: {loss_value:.4f}")

    avg_train_loss = total_training_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Train Loss: {avg_train_loss:.4f}")

Epoch 1/3, Step 1/3125, Loss: 0.0766
Epoch 1/3, Step 501/3125, Loss: 0.5096


In [None]:
from sklearn.metrics import classification_report
import numpy as np

def get_model_accuracy(test_data, model):
    """Evaluate `model` on `test_data` and print a classification report.

    Args:
        test_data: Sized iterable of batches. Each batch is a dict of tensors
            that must contain a "labels" entry; a "text" entry, if present,
            is ignored. Every remaining entry is passed to the model as a
            keyword argument.
        model: Module whose output exposes `.logits`. It is switched to eval
            mode and left in eval mode.

    Returns:
        None. The report is printed.
    """
    model.eval()

    # Run on whatever device the model lives on, so the function also works
    # for a model that is on a different device than the training one.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Collect per-batch arrays and concatenate once at the end. This keeps the
    # integer dtype of the predictions and labels and avoids re-copying the
    # growing arrays on every batch.
    pred_chunks = []
    true_chunks = []

    for step, batch in enumerate(test_data):
        if step % 500 == 0:
            print(f"Step {step}/{len(test_data)} processing...")

        batch = {k: v.to(model_device) for k, v in batch.items() if k != "text"}

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(**batch)

        predictions = torch.argmax(outputs.logits, dim=-1)
        pred_chunks.append(predictions.cpu().numpy())
        true_chunks.append(batch["labels"].cpu().numpy())

    # np.concatenate rejects an empty list, so fall back to empty integer arrays.
    empty = np.zeros(0, dtype=np.int64)
    y_pred = np.concatenate(pred_chunks) if pred_chunks else empty
    y_true = np.concatenate(true_chunks) if true_chunks else empty

    report = classification_report(y_true, y_pred)
    print(report)

In [None]:
# Evaluate the model on the test DataLoader; the report is printed by the function.
get_model_accuracy(test_dataloader, model)

In [None]:
# Save the model under ./output/ (only the model is saved here, not the tokenizer).
model.save_pretrained('./output/')

In [None]:
# Reload the model from the "output" directory and rebind `model` to it.
# NOTE(review): the reloaded model is not moved to `device` here.
model = DistilBertForSequenceClassification.from_pretrained("output", num_labels=2)

In [None]:
# Classify a single sentence with the fine-tuned model.
text = "I absolutely loved this movie!"

# Inference only: make sure dropout is off, and skip gradient tracking below.
model.eval()

# Put the encoded inputs on the same device as the model, so the cell works
# whether the model is on the CPU or the GPU.
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True).to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

predictions = outputs.logits.argmax(dim=-1)

# Label 1 is 'pos' and label 0 is 'neg' in the dataset's ClassLabel names.
# `.item()` turns the single-element tensor into a plain Python int.
sentiment = 'positive' if predictions.item() == 1 else 'negative'

print(f"Sentiment: {sentiment}")