In [None]:
!pip install datasets

In [None]:
# Mount Google Drive inside the Colab runtime. Checkpoints are later read from
# and written to a folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader
import random
from datasets import load_dataset, concatenate_datasets, ClassLabel
from datasets import load_dataset, concatenate_datasets
import random

# Load the IMDB dataset; its 'train' and 'test' splits are filtered and
# re-split below.
dataset = load_dataset('imdb')

# Predicate used with `Dataset.filter` to drop overly long reviews.
def filter_long_text(example, max_chars=1000):
    """Return True when the example's text is at most `max_chars` characters.

    Args:
        example: Mapping with a 'text' entry (one dataset row).
        max_chars: Maximum allowed length of the text, in characters
            (not tokens). Defaults to 1000.

    Returns:
        bool: True if the example should be kept, False otherwise.
    """
    return len(example['text']) <= max_chars

# Apply the length filter to the train and test splits
filtered_train_dataset = dataset['train'].filter(filter_long_text)
filtered_test_dataset = dataset['test'].filter(filter_long_text)

# Fixed seed for the re-split below. Without it every kernel restart produces a
# different train/test partition, so a run resumed from a checkpoint could be
# evaluated on examples it was trained on in the previous session.
SPLIT_SEED = 42

# 90% of the filtered test split is moved into training; 10% is held out.
test_size = len(filtered_test_dataset)
num_to_add = int(0.9 * test_size)

# Shuffle the row indices with a private, seeded RNG (the global `random`
# state is left untouched) and cut them at the 90% mark.
indices = list(range(test_size))
random.Random(SPLIT_SEED).shuffle(indices)
train_indices = indices[:num_to_add]
test_indices = indices[num_to_add:]

# Select the corresponding examples from the filtered test set
additional_train_set = filtered_test_dataset.select(train_indices)
new_test_set = filtered_test_dataset.select(test_indices)

# Concatenate 90% of the filtered test set with the filtered train set
new_train_set = concatenate_datasets([filtered_train_dataset, additional_train_set])

# Sanity check: the two new sets must account for every filtered example.
assert len(new_train_set) + len(new_test_set) == len(filtered_train_dataset) + test_size

# Report the sizes at each stage
print("Original train set size:", len(dataset['train']))
print("Original test set size:", len(dataset['test']))

print("Filtered train set size:", len(filtered_train_dataset))
print("Filtered test set size:", len(filtered_test_dataset))

print("New train set size after adding 90% of the test set:", len(new_train_set))
print("New test set size after removing 90% of the test set:", len(new_test_set))

In [None]:
# Hub identifier shared by the tokenizer and the model weights.
model_name = "google/flan-t5-base"

# Instantiate the pretrained FLAN-T5 seq2seq model and its matching tokenizer.
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Preprocessing function
def preprocess_function(examples):
    """Turn a batch of labelled reviews into text-to-text training features.

    Each review is prefixed with "Classify sentiment: " and tokenized to a
    fixed length of 512; each integer label is rendered as the target word
    "positive" (label == 1) or "negative" (any other value) and tokenized to
    a fixed length of 10.

    Args:
        examples: Batched mapping with a "text" list and a "label" list.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids. Padding positions in "labels" are set
        to -100 so that the cross-entropy loss skips them.
    """
    # Convert the classification task into a text-to-text task
    inputs = [f"Classify sentiment: {text}" for text in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Map the sentiment labels to target text ("positive" or "negative")
    label_texts = ["positive" if label == 1 else "negative" for label in examples["label"]]
    target_ids = tokenizer(label_texts, max_length=10, truncation=True, padding="max_length").input_ids

    # Targets are padded to 10 tokens although the label word is much shorter.
    # Left as pad ids, those positions would be scored by the loss and dominate
    # it; -100 is the index the loss ignores.
    # NOTE: losses computed with masked padding are not comparable with values
    # logged while padding was still included in the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_ids
    ]
    return model_inputs

# Tokenize the filtered, re-split data prepared earlier (`new_train_set` /
# `new_test_set`). Mapping the raw `dataset` instead would ignore the length
# filter and the 90/10 re-split, and would also tokenize the unlabeled
# 'unsupervised' split. The result is a plain dict keyed by "train" / "test".
tokenized_datasets = {
    split_name: split.map(preprocess_function, batched=True, remove_columns=["text", "label"])
    for split_name, split in (("train", new_train_set), ("test", new_test_set))
}

In [None]:
def collate_fn(batch):
    """Collate a list of tokenized examples into one batch of tensors.

    Args:
        batch: Sequence of mappings, each with "input_ids", "attention_mask"
            and "labels" entries of equal length across the batch.

    Returns:
        dict: The same three keys, each mapped to a tensor whose first
        dimension indexes the examples in `batch`.
    """
    def _stack(field):
        # One tensor per example, stacked along a new leading dimension.
        return torch.stack([torch.tensor(example[field]) for example in batch])

    return {field: _stack(field) for field in ("input_ids", "attention_mask", "labels")}

In [None]:
BATCH_SIZE = 32

# The full tokenized splits are used (no sub-sampling). No dataset-level
# `.shuffle()` is needed: the training DataLoader reshuffles every epoch, and
# the order of evaluation examples does not matter for an averaged loss.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["test"]

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
eval_dataloader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [None]:
# Number of epochs the learning-rate schedule is sized for. The training cells
# below also run up to 10 epochs; keep the two in sync.
NUM_EPOCHS = 10

optimizer = AdamW(model.parameters(), lr=1e-5)

# Linear warmup over the first 10% of all optimizer steps, then linear decay
# to zero at `total_steps`.
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

# Train on the GPU when one is available. The trailing semicolon keeps the
# notebook from printing the whole module tree as the cell output.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device);

In [None]:
def save_model(model, optimizer, scheduler, epoch, train_losses, eval_losses, filepath):
    """Write a training checkpoint to `filepath`.

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict', 'scheduler_state_dict', 'train_losses' and
    'eval_losses'.

    When `filepath` is a path, missing parent directories are created and the
    file is written to a temporary sibling first, then renamed over the
    target. An interrupted save therefore never leaves a truncated file in
    place of a previously saved checkpoint.

    Args:
        model: Object whose `state_dict()` is stored.
        optimizer: Optimizer whose `state_dict()` is stored.
        scheduler: LR scheduler whose `state_dict()` is stored.
        epoch: Epoch value to record in the checkpoint.
        train_losses: Training-loss history to record.
        eval_losses: Validation-loss history to record.
        filepath: Destination path (str or os.PathLike) or a writable
            file-like object accepted by `torch.save`.
    """
    import os  # local import so the function is self-contained within its cell

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'train_losses': train_losses,
        'eval_losses': eval_losses,
    }

    if not isinstance(filepath, (str, os.PathLike)):
        # File-like destination: there is nothing to rename, write directly.
        torch.save(checkpoint, filepath)
        print(f"Checkpoint saved at {filepath}")
        return

    target = os.fspath(filepath)
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = target + ".tmp"
    try:
        torch.save(checkpoint, tmp_path)
        # Rename only after the temp file is complete.
        os.replace(tmp_path, target)
    except BaseException:
        # Do not leave a partial temp file behind; the old checkpoint is intact.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Checkpoint saved at {filepath}")

In [None]:
def load_model(model, optimizer, scheduler, filepath, map_location="cpu"):
    """Restore training state from a checkpoint file.

    Loads the checkpoint at `filepath` and applies its 'model_state_dict',
    'optimizer_state_dict' and 'scheduler_state_dict' entries to the given
    objects in place.

    Args:
        model: Model to load the saved weights into.
        optimizer: Optimizer to load the saved state into.
        scheduler: LR scheduler to load the saved state into.
        filepath: Path of the checkpoint file.
        map_location: Passed to `torch.load`. The default "cpu" lets a
            checkpoint saved on a GPU be opened on a CPU-only runtime and
            avoids holding a second copy of the state on the GPU while the
            `load_state_dict` calls copy the values into the existing objects.

    Returns:
        tuple: (model, optimizer, scheduler, epoch, train_losses, eval_losses),
        where the last three are the checkpoint's 'epoch', 'train_losses' and
        'eval_losses' entries.
    """
    checkpoint = torch.load(filepath, map_location=map_location)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch = checkpoint['epoch']
    train_losses = checkpoint['train_losses']
    eval_losses = checkpoint['eval_losses']
    print(f"Checkpoint loaded from {filepath}")
    return model, optimizer, scheduler, epoch, train_losses, eval_losses

In [None]:
# Restore the model, optimizer and scheduler state plus the loss histories
# from the best checkpoint stored on Drive.
checkpoint_path = r'/content/drive/MyDrive/LLM Models/FLAN-T5-base x IMDB/best_model.pth'
(model, optimizer, scheduler,
 epochs, train_losses, eval_losses) = load_model(model, optimizer, scheduler, checkpoint_path)

In [None]:
# Show the 'epoch' value restored from the checkpoint (the training loops pass
# their 0-based loop index as `epoch` when saving).
print(epochs)

In [None]:
# Resume training from the checkpoint loaded above.

# Total number of epochs for the run; the LR schedule was sized for 10 epochs.
NUM_EPOCHS = 10

# `epochs` is the 0-based index of the epoch that had just finished when the
# checkpoint was written (the loops save with `epoch=epoch` after evaluation),
# so training continues with the next epoch. Restarting at `epochs` would
# repeat a finished epoch and step the scheduler past its planned total.
start_epoch = epochs + 1

# Carry over the best validation loss from the restored history. Starting from
# infinity would let the first resumed epoch overwrite the best checkpoint even
# when its validation loss is worse.
best_eval_loss = min(eval_losses) if eval_losses else float('inf')

for epoch in range(start_epoch, NUM_EPOCHS):
    torch.cuda.empty_cache()

    # ---- Training pass ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per optimizer step

        torch.cuda.empty_cache()

        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    avg_train_loss = train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {avg_train_loss}")

    # ---- Evaluation pass on the held-out set (no gradient tracking) ----
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            torch.cuda.empty_cache()

            eval_loss += outputs.loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    eval_losses.append(avg_eval_loss)

    # Save the best model based on validation loss
    if avg_eval_loss < best_eval_loss:
        best_eval_loss = avg_eval_loss
        save_model(
            epoch=epoch,
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            train_losses=train_losses,
            eval_losses=eval_losses,
            filepath = r'/content/drive/MyDrive/LLM Models/FLAN-T5-base x IMDB/best_model.pth'
        )
        print(f"Best model saved with eval loss {avg_eval_loss:.4f} at epoch {epoch+1}")

    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Validation Loss: {avg_eval_loss}")

In [None]:
# Train from scratch for a fixed number of epochs, checkpointing whenever the
# validation loss improves.
epochs = 10

# Per-epoch loss histories and the lowest validation loss seen so far.
train_losses, eval_losses = [], []
best_eval_loss = float('inf')

for epoch in range(epochs):
    torch.cuda.empty_cache()

    # ---- Training pass ----
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()

        # Move the batch to the training device and run the forward pass;
        # passing `labels` makes the model return the loss.
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device),
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        torch.cuda.empty_cache()

        train_loss += loss.item()

    # Mean of the per-batch training losses
    avg_train_loss = train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss}")

    # ---- Evaluation pass (gradients disabled) ----
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )
            torch.cuda.empty_cache()

            eval_loss += outputs.loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    eval_losses.append(avg_eval_loss)

    # Checkpoint only when this epoch beats the best validation loss so far.
    improved = avg_eval_loss < best_eval_loss
    if improved:
        best_eval_loss = avg_eval_loss
        save_model(
            model=model,
            optimizer=optimizer,
            scheduler=scheduler,
            epoch=epoch,
            train_losses=train_losses,
            eval_losses=eval_losses,
            filepath=r'/content/drive/MyDrive/LLM Models/FLAN-T5-base x IMDB/best_model.pth',
        )
        print(f"Best model saved with eval loss {avg_eval_loss:.4f} at epoch {epoch+1}")

    print(f"Epoch {epoch+1}/{epochs}, Validation Loss: {avg_eval_loss}")