In [1]:
from datasets import load_dataset

In [2]:
# Load the IMDB dataset; later cells inspect its splits and columns.
dataset = load_dataset("imdb")

In [3]:
# Show the underlying table of each split (columns: text, label).
dataset.data

{'train': MemoryMappedTable
 text: string
 label: int64
 ----
  "The Unseen". I picked up a used copy of this film because I was interested in seeing more of Bach, whom I'd just viewed in "The Spy Who Loved Me." I love really classically beautiful actresses and appreciate them even more if they can act a little. So: we start with a nice fresh premise. TV reporter Bach walks out on boyfriend and goes to cover a festival in a California town, Solvang, that celebrates its Swedish ancestry by putting on a big folk festival. She brings along a camerawoman, who happens to be her sister, and another associate. (The late Karen Lamm plays Bach's sister, and if you know who the celebrities are that each of these ladies is married to, it is just too funny watching Bach (Mrs. Ringo Starr) and Lamm (Mrs. Dennis Wilson) going down the street having a sisterly quarrel.)) Anyway 
  under-the-weather Vicki slips off her clothes and gets into a nice hot tub, not realizing that Keller has crept into her 

Next, we'll look at how the data is processed.

In [4]:
# Identity map: every example is returned unchanged, so the displayed result
# only shows the splits, their features and their row counts.
dataset.map(lambda x: x)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
from transformers import (
        AutoTokenizer,
        AutoModelForTokenClassification,
        DataCollatorForTokenClassification,
        Trainer,
        TrainingArguments,
    )

In [6]:
# Tokenizer for the pretrained "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Same checkpoint with a 2-label token-classification head on top.
model = AutoModelForTokenClassification.from_pretrained(
    "gpt2",
    num_labels=2,
    ignore_mismatched_sizes=True,
)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import modal
import wandb
from modal import SharedVolume

In [None]:
# Persistent Modal volume that `train` mounts at /data to store the model.
# `create_if_missing=True` lets a fresh workspace run this notebook without
# first creating the volume by hand.
save_weights = modal.Volume.from_name("save_weights", create_if_missing=True)

# Define the Modal App
app = modal.App(name="sentiment-classification")

# Define the image with necessary dependencies
image = (
    modal.Image.debian_slim()
    .pip_install(
        "torch",
        "transformers",
        "pandas",
        "datasets",
        "wandb",
        "scikit-learn",
    )
)


@app.function(
    image=image,
    volumes={"/data": save_weights},
    secrets=[modal.Secret.from_name("wandb-secret")],
    gpu="any",  # Use GPU for training
    timeout=86400,  # 24 hours
)
def train():
    """Fine-tune GPT-2 with a token-classification head on a subset of IMDB.

    Every non-padding token of a review is labelled with the review's
    sentiment (0 or 1); padding positions are labelled -100. Training runs on
    2000 shuffled training reviews and evaluates on 500 shuffled test reviews.
    Metrics are reported to Weights & Biases, and the model and tokenizer are
    written to ``/data/sentiment_model`` on the mounted volume.

    Raises:
        KeyError: if ``WANDB_API_KEY`` is not present in the environment.
    """
    import inspect
    import os

    from transformers import (
        AutoTokenizer,
        AutoModelForTokenClassification,
        DataCollatorForTokenClassification,
        Trainer,
        TrainingArguments,
    )
    from datasets import load_dataset
    import wandb

    max_length = 512
    ignore_index = -100  # label value used here for padded positions
    model_dir = "/data/sentiment_model"

    # Initialize Weights & Biases
    wandb.login(key=os.environ["WANDB_API_KEY"])
    wandb.init(project="sentiment-classification", name="gpt2-token-classification")

    # Load the IMDB dataset (labels are already 0 or 1) and sub-sample BEFORE
    # tokenizing, so only the rows that are actually used get processed and
    # the unused "unsupervised" split is never touched.
    dataset = load_dataset("imdb")
    train_split = dataset["train"].shuffle(seed=42).select(range(2000))
    eval_split = dataset["test"].shuffle(seed=42).select(range(500))

    # Initialize the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    # GPT-2 ships without a padding token; reuse EOS so that
    # padding="max_length" below does not raise.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForTokenClassification.from_pretrained(
        "gpt2", num_labels=2, ignore_mismatched_sizes=True
    )
    model.config.pad_token_id = tokenizer.pad_token_id

    # Tokenize the inputs
    def tokenize_function(examples):
        """Tokenize a batch of reviews and build per-token labels."""
        tokens = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )
        # `examples["label"]` holds one label per review in the batch: repeat
        # each review's own label across its tokens, masking out padding.
        tokens["labels"] = [
            [label if is_real_token else ignore_index for is_real_token in mask]
            for label, mask in zip(examples["label"], tokens["attention_mask"])
        ]
        return tokens

    # Drop the raw columns; "label" is replaced by the per-token "labels".
    tokenized_train = train_split.map(
        tokenize_function,
        batched=True,
        remove_columns=["text", "label"],
    )
    tokenized_eval = eval_split.map(
        tokenize_function,
        batched=True,
        remove_columns=["text", "label"],
    )

    # Set format for PyTorch
    for split in (tokenized_train, tokenized_eval):
        split.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "labels"],
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # `evaluation_strategy` was renamed to `eval_strategy` in newer
    # transformers releases; use whichever the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        eval_steps=500,
        logging_steps=100,
        save_steps=500,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=1,
        weight_decay=0.01,
        report_to="wandb",
        **{eval_strategy_key: "steps"},
    )

    # Initialize Trainer. The tokenizer is not passed here (that keyword is
    # deprecated in newer transformers); it is saved explicitly below instead.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    # Save the model and its tokenizer side by side on the mounted volume
    trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)

    # Finish the wandb run
    wandb.finish()


In [None]:
# Train the model: start an ephemeral run of the app and execute `train`
# remotely in the Modal container rather than in the notebook kernel.
with app.run():
    train.remote()