In [1]:
# sentiment_pipeline.py

# ====================== #
# 1. Import Dependencies #
# ====================== #
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score
import torch
import numpy as np




In [None]:
# 1. Clean uninstall
!pip uninstall -y fsspec datasets

# 2. Install known compatible versions
!pip install --no-cache-dir "fsspec==2023.6.0" "datasets==2.18.0"

# 3. Restart the runtime AFTER this


In [5]:
!pip install -U transformers




In [7]:
from transformers import BertTokenizer


In [None]:
# sentiment_pipeline.py

# Step 0: Imports


# Step 1: Load the IMDb dataset
dataset = load_dataset("imdb")

# Step 2: Load tokenizer and tokenize data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Step 3: Set format for PyTorch
tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Step 4: Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=1,
)

# Step 6: Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }

# Step 7: Trainer setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(10000)),  # smaller subset for faster training
    eval_dataset=tokenized_datasets["test"].select(range(1000)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Step 8: Train the model
trainer.train()

# Step 9: Save the model
model.save_pretrained("sentiment-model")
tokenizer.save_pretrained("sentiment-model")

# Step 10: Load model for inference
def predict_sentiment(text):
    model = BertForSequenceClassification.from_pretrained("sentiment-model")
    tokenizer = BertTokenizer.from_pretrained("sentiment-model")
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return "Positive" if predicted_class == 1 else "Negative"

# Example
print(predict_sentiment("I really loved this movie!"))


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]