In [1]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# Load the IMDb dataset from Hugging Face datasets
dataset = load_dataset("imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [3]:
type(dataset)

datasets.dataset_dict.DatasetDict

In [4]:
# Load pre-trained DistilBERT tokenizer
model_checkpoint = "distilbert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained("./sentiment_model")

In [5]:
# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)


In [6]:
# Apply tokenization to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)


In [7]:
# Split into train and test sets
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))  # Use a subset for quick training
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

In [8]:
train_dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2000
})

In [9]:
# Load pre-trained model for classification (DistilBERT)
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("./sentiment_model")

In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)



In [11]:
# Data collator to batch pad
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
# Define evaluation metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [13]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [14]:
# Train the model
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1414,0.821141,0.858,0.822878,0.906504,0.862669
2,0.0859,0.832433,0.86,0.825926,0.906504,0.864341
3,0.0002,0.809728,0.88,0.869048,0.890244,0.879518


TrainOutput(global_step=750, training_loss=0.07405987233606477, metrics={'train_runtime': 240.874, 'train_samples_per_second': 24.909, 'train_steps_per_second': 3.114, 'total_flos': 794804391936000.0, 'train_loss': 0.07405987233606477, 'epoch': 3.0})

In [15]:
# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.8097280859947205, 'eval_accuracy': 0.88, 'eval_precision': 0.8690476190476191, 'eval_recall': 0.8902439024390244, 'eval_f1': 0.8795180722891566, 'eval_runtime': 4.6677, 'eval_samples_per_second': 107.119, 'eval_steps_per_second': 13.497, 'epoch': 3.0}


In [16]:
# Save the model
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json',
 './sentiment_model/tokenizer.json')

In [17]:
import torch

def predict_sentiment(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available
    model.to(device)  # Move model to the same device

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform inference
    with torch.no_grad():  # No gradient calculation needed for prediction
        outputs = model(**inputs)
    
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()  # Get the class with highest probability

    return "Positive" if prediction == 1 else "Negative"




In [18]:
# Test the function
print(predict_sentiment("I absolutely loved this movie!"))
print(predict_sentiment("The movie was terrible, I hated it."))


Positive
Negative
