In [3]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1

In [4]:
from datasets import load_dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# Fetch the IMDb dataset
dataset = load_dataset("imdb")

# Draw a fixed-seed, 1000-example sample from the train split and from the test split
small_train_dataset, small_eval_dataset = (
    dataset[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)


# Tokenizer and sequence-classification model both come from the same checkpoint name
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Ensure consistent padding and truncation
def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded examples come out with the same length.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(examples["text"], **encode_options)

# Apply the tokenizer batch-wise to the training subset, then to the evaluation subset
tokenized_train_dataset, tokenized_test_dataset = (
    subset.map(tokenize_function, batched=True)
    for subset in (small_train_dataset, small_eval_dataset)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` replaces the deprecated `evaluation_strategy` keyword
    # (renamed in transformers 4.41). This notebook installs an unpinned
    # transformers, so the current spelling is needed for a fresh run to work.
    eval_strategy="epoch",  # evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
# No compute_metrics is supplied, so evaluation reports the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)



In [7]:
# Run training with the Trainer built in the previous cell. As the cell's last
# expression, the returned TrainOutput is displayed below.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.385909
2,No log,0.327111
3,No log,0.333666


TrainOutput(global_step=189, training_loss=0.3604035957780465, metrics={'train_runtime': 194.1568, 'train_samples_per_second': 15.451, 'train_steps_per_second': 0.973, 'total_flos': 397402195968000.0, 'train_loss': 0.3604035957780465, 'epoch': 3.0})

In [8]:
# Evaluate on the eval_dataset given to the Trainer and display the resulting metrics dict.
trainer.evaluate()

{'eval_loss': 0.3336659073829651,
 'eval_runtime': 17.2781,
 'eval_samples_per_second': 57.877,
 'eval_steps_per_second': 3.646,
 'epoch': 3.0}

In [9]:
# Write the model and the tokenizer files into ./results; a later cell reloads
# both from that directory with from_pretrained.
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/vocab.txt',
 './results/added_tokens.json')

In [None]:
import torch

def predict(text):
    """Classify the sentiment of one review with the notebook's `model`.

    NOTE(review): `predict` is defined again in a later cell of this notebook;
    when cells run in order, that later definition shadows this one.

    Args:
        text: A single review as a string.

    Returns:
        "Negative" (class id 0) or "Positive" (class id 1).
    """
    label_map = {0: "Negative", 1: "Positive"}

    # Pad only to the longest sequence in this call. For a single review that
    # means no padding at all, instead of always running 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # After training, the model may live on a GPU while the tokenizer returns
    # CPU tensors; move the inputs to the model's device to avoid a mismatch.
    inputs = inputs.to(model.device)

    # Make sure the model is in evaluation mode (disables dropout)
    model.eval()

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax over the class dimension. A bare argmax() flattens the
    # whole tensor and would return a meaningless index for batched input;
    # with dim=-1, .item() fails loudly if more than one example was passed.
    predicted_class_id = logits.argmax(dim=-1).item()

    return label_map[predicted_class_id]


In [10]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Restore the fine-tuned classifier and its tokenizer from the "./results" directory
model_path = "./results"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)

In [11]:
def predict(text):
    """Classify the sentiment of one review with the notebook's `model`.

    Args:
        text: A single review as a string.

    Returns:
        "Negative" (class id 0) or "Positive" (class id 1).
    """
    label_map = {0: "Negative", 1: "Positive"}

    # Pad only to the longest sequence in this call. For a single review that
    # means no padding at all, instead of always running 512 tokens.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Keep the inputs on the same device as the model so the function also
    # works when the model has been moved to a GPU.
    inputs = inputs.to(model.device)

    # Ensure the model is in evaluation mode (disables dropout)
    model.eval()

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the argmax over the class dimension. A bare argmax() flattens the
    # whole tensor and would return a meaningless index for batched input;
    # with dim=-1, .item() fails loudly if more than one example was passed.
    predicted_class_id = logits.argmax(dim=-1).item()

    return label_map[predicted_class_id]


In [12]:
# Hand-written sample reviews for a quick sanity check
test_texts = [
    "This movie was absolutely fantastic! I loved every minute of it.",
    "The film was boring and I didn't enjoy it at all.",
    "An amazing performance by the lead actor, really made the movie stand out!",
    "I wouldn't recommend this movie to anyone. It was a waste of time."
]

# Classify the reviews one at a time (map is lazy) and print each with its label
for text, prediction in zip(test_texts, map(predict, test_texts)):
    print(f"Text: {text}\nPredicted Sentiment: {prediction}\n")


Text: This movie was absolutely fantastic! I loved every minute of it.
Predicted Sentiment: Positive

Text: The film was boring and I didn't enjoy it at all.
Predicted Sentiment: Negative

Text: An amazing performance by the lead actor, really made the movie stand out!
Predicted Sentiment: Positive

Text: I wouldn't recommend this movie to anyone. It was a waste of time.
Predicted Sentiment: Negative

