In [4]:
!pip install transformers datasets accelerate



In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Shuffle and select 1000 samples for training
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
validation_dataset = dataset["validation"]

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

# Tokenize train and validation datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_validation = validation_dataset.map(tokenize_function, batched=True)

# Prepare datasets for PyTorch
tokenized_train = tokenized_train.remove_columns(["sentence", "idx"]).rename_column("label", "labels")
tokenized_train.set_format("torch")

tokenized_validation = tokenized_validation.remove_columns(["sentence", "idx"]).rename_column("label", "labels")
tokenized_validation.set_format("torch")

print("Train Dataset Size:", len(tokenized_train))
print("Validation Dataset Size:", len(tokenized_validation))


Train Dataset Size: 1000
Validation Dataset Size: 872


In [6]:
from transformers import AutoModelForSequenceClassification

# Load DistilBERT model for binary classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",             # Output directory for model checkpoints
    evaluation_strategy="epoch",        # Evaluate at the end of each epoch
    logging_dir="./logs",               # Log directory
    logging_steps=10,                   # Log every 10 steps
    per_device_train_batch_size=8,      # Batch size for training
    per_device_eval_batch_size=8,       # Batch size for evaluation
    num_train_epochs=3,                 # Number of epochs
    learning_rate=2e-5,                 # Learning rate
    weight_decay=0.01,                  # Weight decay
    save_strategy="epoch",              # Save the model at the end of each epoch
    report_to="none",                   # Disable W&B logging
    disable_tqdm=False,                 # Enable progress bar
)



In [8]:
from transformers import Trainer

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,  # Ensures proper preprocessing
)

# Train the model

  trainer = Trainer(


In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3938,0.358138
2,0.1556,0.395551
3,0.0904,0.447215


TrainOutput(global_step=375, training_loss=0.30577046497662863, metrics={'train_runtime': 112.9198, 'train_samples_per_second': 26.568, 'train_steps_per_second': 3.321, 'total_flos': 397402195968000.0, 'train_loss': 0.30577046497662863, 'epoch': 3.0})

In [10]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.4472149908542633, 'eval_runtime': 6.5932, 'eval_samples_per_second': 132.257, 'eval_steps_per_second': 16.532, 'epoch': 3.0}


In [11]:
# Save model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [12]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
sentiment_analyzer = pipeline("text-classification", model="./fine_tuned_model", tokenizer="./fine_tuned_model")

# Inference on a sample sentence
result = sentiment_analyzer("This movie was absolutely fantastic!")
print("Sentiment:", result)

Device set to use cuda:0


Sentiment: [{'label': 'LABEL_1', 'score': 0.9909109473228455}]
