In [1]:
!pip install transformers datasets accelerate



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load SST-2 dataset
dataset = load_dataset("glue", "sst2")

# Shuffle and select 1000 samples for training
train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
validation_dataset = dataset["validation"]

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True)

# Tokenize train and validation datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_validation = validation_dataset.map(tokenize_function, batched=True)

# Prepare datasets for PyTorch
tokenized_train = tokenized_train.remove_columns(["sentence", "idx"]).rename_column("label", "labels")
tokenized_train.set_format("torch")

tokenized_validation = tokenized_validation.remove_columns(["sentence", "idx"]).rename_column("label", "labels")
tokenized_validation.set_format("torch")

print("Train Dataset Size:", len(tokenized_train))
print("Validation Dataset Size:", len(tokenized_validation))


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Train Dataset Size: 1000
Validation Dataset Size: 872


In [3]:
from transformers import AutoModelForSequenceClassification

# Load DistilBERT model for binary classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",             # Output directory for model checkpoints
    evaluation_strategy="epoch",        # Evaluate at the end of each epoch
    logging_dir="./logs",               # Log directory
    logging_steps=10,                   # Log every 10 steps
    per_device_train_batch_size=8,      # Batch size for training
    per_device_eval_batch_size=8,       # Batch size for evaluation
    num_train_epochs=3,                 # Number of epochs
    learning_rate=2e-5,                 # Learning rate
    weight_decay=0.01,                  # Weight decay
    save_strategy="epoch",              # Save the model at the end of each epoch
    report_to="none",                   # Disable W&B logging
    disable_tqdm=False,                 # Enable progress bar
)



In [5]:
from transformers import Trainer

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,  # Ensures proper preprocessing
)

# Train the model

  trainer = Trainer(


In [6]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.3749,0.367653
2,0.1692,0.397799
3,0.1332,0.466175


TrainOutput(global_step=375, training_loss=0.3227980575561523, metrics={'train_runtime': 112.3016, 'train_samples_per_second': 26.714, 'train_steps_per_second': 3.339, 'total_flos': 397402195968000.0, 'train_loss': 0.3227980575561523, 'epoch': 3.0})

In [7]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.466174840927124, 'eval_runtime': 6.5972, 'eval_samples_per_second': 132.178, 'eval_steps_per_second': 16.522, 'epoch': 3.0}


In [8]:
# Save model and tokenizer
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [9]:
from transformers import pipeline

# Load the fine-tuned model and tokenizer
sentiment_analyzer = pipeline("text-classification", model="./fine_tuned_model", tokenizer="./fine_tuned_model")

# Inference on a sample sentence
result = sentiment_analyzer("This movie was absolutely fantastic!")
print("Sentiment:", result)

Device set to use cuda:0


Sentiment: [{'label': 'LABEL_1', 'score': 0.9892991781234741}]


In [11]:
import pandas as pd

# Convert the original subset (non-tokenized) to a DataFrame
subset_df = pd.DataFrame(train_dataset)

# Save the DataFrame as a CSV file
subset_df.to_csv("sst2_subset.csv", index=False)

print("Dataset saved as 'sst2_subset.csv'")



Dataset saved as 'sst2_subset.csv'
