In [None]:
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch, os, re

In [None]:
# Fetch the IMDB dataset through the `datasets` library
dataset = load_dataset(path="imdb")

In [None]:
# Turn off the tokenizers library's own parallelism
# (presumably to avoid clashes with the DataLoader worker processes -- confirm)
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Name of the pretrained checkpoint; the tokenizer is loaded from it here
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    """Tokenize the ``"text"`` field of ``example`` with the module-level tokenizer.

    The text is truncated and padded to ``max_length`` so that every encoded
    sequence is 512 tokens long.

    Args:
        example: Mapping with a ``"text"`` entry (a single string or a batch
            of strings, depending on how the function is mapped).

    Returns:
        Whatever the tokenizer returns for the given text.
    """
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(example["text"], **encoding_options)

# Run `tokenize` over the dataset in batches and keep the result as `tokenized`
tokenized = dataset.map(function=tokenize, batched=True)

In [None]:
# Build a sequence-classification model from the same checkpoint, with two output labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

In [None]:
"""
Training Config
- Saves a checkpoint at the end of every epoch
- Checkpoints are written to ./results
- Keeps only the most recent checkpoint
- Trains for 2 epochs
- Uses half precision (fp16) when a GPU is available
- Batch size of 16 per device for both training and evaluation
- Uses 4 worker processes to load data in parallel
- Accumulates gradients over 2 batches before each optimizer step
"""

# fp16 is switched on only when a CUDA device is available.
cuda = torch.cuda.is_available()
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    save_strategy="epoch",
    save_total_limit=1,
    fp16=cuda,
    dataloader_num_workers=4,
    gradient_accumulation_steps=2,
)

# Seed the shuffles with the training seed so that Restart & Run All always
# produces the same ordering (and, on CPU, the same subset of examples).
shuffle_seed = args.seed
training_data = tokenized["train"].shuffle(seed=shuffle_seed)
evaluation_data = tokenized["test"].shuffle(seed=shuffle_seed)

if not cuda:
    # Without a GPU, train and evaluate on a small subset only.
    # The subset is taken after shuffling (presumably because the raw splits
    # are ordered by label -- verify against the dataset card).
    cpu_train_samples = 700
    cpu_eval_samples = 300
    training_data = training_data.select(range(cpu_train_samples))
    evaluation_data = evaluation_data.select(range(cpu_eval_samples))

# Bundle the model, training config, tokenizer and both data splits.
# NOTE(review): recent transformers releases may prefer `processing_class=`
# over `tokenizer=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=training_data,
    eval_dataset=evaluation_data
)

# Starts the training
trainer.train()

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along its last axis.

    Returns:
        ``{"accuracy": <score>}`` where the score comes from
        ``accuracy_score(labels, predictions)``.
    """
    scores, gold_labels = eval_pred
    predicted_labels = scores.argmax(axis=-1)
    accuracy = accuracy_score(gold_labels, predicted_labels)
    return {"accuracy": accuracy}

# The trainer was built without a metric function, so attach it now,
# run the evaluation, and display the returned metrics
trainer.compute_metrics = compute_metrics
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
def get_latest_checkpoint(results_dir="./results"):
    """Return the path of the highest-numbered ``checkpoint-<step>`` directory.

    Args:
        results_dir: Directory to scan for checkpoint sub-directories.

    Returns:
        ``os.path.join(results_dir, name)`` for the entry with the largest
        numeric step, or ``None`` when ``results_dir`` does not exist or
        contains no ``checkpoint-<step>`` directory.
    """
    # A missing output directory simply means nothing has been saved yet.
    if not os.path.isdir(results_dir):
        return None

    def is_checkpoint_dir(entry):
        # Require the exact name pattern and skip stray files that merely
        # look like a checkpoint name.
        return (
            re.fullmatch(r"checkpoint-\d+", entry) is not None
            and os.path.isdir(os.path.join(results_dir, entry))
        )

    checkpoint_dirs = [d for d in os.listdir(results_dir) if is_checkpoint_dir(d)]
    if not checkpoint_dirs:
        return None

    # Compare steps numerically so that checkpoint-10 beats checkpoint-9.
    latest = max(checkpoint_dirs, key=lambda name: int(name.split("-")[1]))
    return os.path.join(results_dir, latest)

In [None]:
# Reload the tokenizer and model from the most recent checkpoint on disk.
model_dir = get_latest_checkpoint()
if model_dir is None:
    # Fail with a clear message instead of an obscure from_pretrained(None) error.
    raise FileNotFoundError(
        "No checkpoint-<step> directory found in ./results; run the training cell first."
    )
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Text to classify, typed in by the user.
text = input()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# A single sequence needs no padding; it is still truncated to the 512-token
# limit used during training.
inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# Inference only: gradients are not needed.
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)
predicted_class = torch.argmax(probabilities, dim=1).item()

# `Result` is True when class index 1 wins
# (presumably the positive-sentiment label -- confirm against the dataset).
print(f"Result: {bool(predicted_class)} \nProbabilities: {probabilities.tolist()[0]}")