# Setup

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
import torch
from torch.nn.functional import softmax
import datasets
from datasets import load_dataset, Dataset
import random
from torch.utils.data import DataLoader
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
from torchmetrics import F1Score
import pandas as pd


# Run

### Try the model

In [3]:
tokenizer = AutoTokenizer.from_pretrained("yash3056/Llama-3.2-1B-imdb")
model = AutoModelForSequenceClassification.from_pretrained("yash3056/Llama-3.2-1B-imdb", num_labels=3, ignore_mismatched_sizes=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at yash3056/Llama-3.2-1B-imdb and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([2, 2048]) in the checkpoint and torch.Size([3, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Fine tune the model

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import random
import torch
from torch.utils.data import DataLoader

# Set a fixed random seed for reproducibility
random.seed(42)

# Load the dataset from Hugging Face
dataset = load_dataset("kisejin/finance_sentiment_dataset")

# Convert string labels to integers
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}

# Split data by sentiment label
negative_samples = [sample for sample in dataset["train"] if sample["output"] == "negative"]
positive_samples = [sample for sample in dataset["train"] if sample["output"] == "positive"]
neutral_samples = [sample for sample in dataset["train"] if sample["output"] == "neutral"]

# Get the number of samples to balance
num_samples = 3000

# Randomly select the same number of positive and neutral samples
balanced_negative_samples = random.sample(negative_samples, num_samples)
balanced_positive_samples = random.sample(positive_samples, num_samples)
balanced_neutral_samples = random.sample(neutral_samples, num_samples)

# Combine all into a new dataset
balanced_dataset = balanced_negative_samples + balanced_positive_samples + balanced_neutral_samples

# Convert 'output' field to integers
for sample in balanced_dataset:
    sample['output'] = label_mapping[sample['output']]

# Convert it to a Hugging Face dataset for compatibility
balanced_dataset = Dataset.from_dict({k: [d[k] for d in balanced_dataset] for k in balanced_dataset[0].keys()})

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("yash3056/Llama-3.2-1B-imdb")
model = AutoModelForSequenceClassification.from_pretrained("yash3056/Llama-3.2-1B-imdb", num_labels=3, ignore_mismatched_sizes=True)

# Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["input"], padding="max_length", truncation=True)

# Tokenize the dataset
tokenized_datasets = balanced_dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["instruction", "input"])
tokenized_datasets = tokenized_datasets.rename_column("output", "labels")
tokenized_datasets.set_format("torch")

# Split into training and evaluation datasets
shuffled_train_dataset = tokenized_datasets.shuffle(seed=42)
train_size = int(0.8 * len(shuffled_train_dataset))  # 80% for training
small_train_dataset = shuffled_train_dataset.select(range(train_size))
small_eval_dataset = shuffled_train_dataset.select(range(train_size, len(shuffled_train_dataset)))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=1
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# Fine-tune the model
trainer.train()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at yash3056/Llama-3.2-1B-imdb and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([2, 2048]) in the checkpoint and torch.Size([3, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


  0%|          | 0/1350 [00:00<?, ?it/s]

RuntimeError: Invalid buffer size: 16.00 GB

configure.json backup:
{
  "_name_or_path": "checkpoint-2/fine_tuned_model",
  "architectures": [
    "LlamaForSequenceClassification"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pad_token_id": 128001,
  "pretraining_tp": 1,
  "problem_type": "single_label_classification",
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 128256
}

In [None]:
# # Set a fixed random seed for reproducibility
# random.seed(42)

# # Load the dataset from Hugging Face
# dataset = load_dataset("kisejin/finance_sentiment_dataset")

# # Convert string labels to integers
# label_mapping = {"negative": 0, "neutral": 1, "positive": 2}

# # Split data by sentiment label
# negative_samples = [sample for sample in dataset["train"] if sample["output"] == "negative"]
# positive_samples = [sample for sample in dataset["train"] if sample["output"] == "positive"]
# neutral_samples = [sample for sample in dataset["train"] if sample["output"] == "neutral"]

# # Get the number of negative samples to balance
# num_samples = 3000 # len(negative_samples)

# # Randomly select same number of positive and neutral samples
# balanced_negative_samples = random.sample(negative_samples, num_samples)
# balanced_positive_samples = random.sample(positive_samples, num_samples)
# balanced_neutral_samples = random.sample(neutral_samples, num_samples)

# # Combine all into a new dataset
# balanced_dataset = balanced_negative_samples + balanced_positive_samples + balanced_neutral_samples

# # Convert 'output' field to integers
# for sample in balanced_dataset:
#     sample['output'] = label_mapping[sample['output']]

# # Optional: Convert it to a Hugging Face dataset for compatibility
# from datasets import Dataset
# balanced_dataset = Dataset.from_dict({k: [d[k] for d in balanced_dataset] for k in balanced_dataset[0].keys()})

In [None]:
# # Load the tokenizer and model
# tokenizer = AutoTokenizer.from_pretrained("yash3056/Llama-3.2-1B-imdb")
# model = AutoModelForSequenceClassification.from_pretrained(
#     "yash3056/Llama-3.2-1B-imdb",
#     num_labels=3,
#     ignore_mismatched_sizes=True,
#     low_cpu_mem_usage=True
# )

# # Tokenization function
# def tokenize_function(examples):
#     return tokenizer(examples["input"], padding="max_length", truncation=True)

# # Tokenize the dataset
# tokenized_datasets = balanced_dataset.map(tokenize_function, batched=True)
# tokenized_datasets = tokenized_datasets.remove_columns(["instruction", "input"])
# tokenized_datasets = tokenized_datasets.rename_column("output", "labels")
# tokenized_datasets.set_format("torch")

# # Split into training and evaluation datasets
# shuffled_train_dataset = tokenized_datasets.shuffle(seed=42)
# train_size = int(0.8 * len(shuffled_train_dataset))  # 80% for training
# small_train_dataset = shuffled_train_dataset.select(range(train_size))
# small_eval_dataset = shuffled_train_dataset.select(range(train_size, len(shuffled_train_dataset)))

# # Dataloaders
# train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
# eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

# # Set up optimizer and scheduler
# optimizer = AdamW(model.parameters(), lr=1e-5)
# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
# )

# # Move model to device (GPU if available)
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)

# # Initialize lists to track training/validation losses and accuracies
# train_losses = []
# val_losses = []
# val_accuracies = []

# # Training loop
# progress_bar = tqdm(range(num_training_steps))

# for epoch in range(num_epochs):
#     epoch_train_loss = 0
#     epoch_val_loss = 0
#     correct_predictions = 0
#     total_predictions = 0
#     model.train()

#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()

#         epoch_train_loss += loss.item()
#         progress_bar.update(1)

#     # Record training loss for the epoch
#     train_losses.append(epoch_train_loss / len(train_dataloader))

#     # Evaluate the model
#     model.eval()
#     for batch in eval_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         with torch.no_grad():
#             outputs = model(**batch)

#         logits = outputs.logits
#         predictions = torch.argmax(logits, dim=-1)
#         loss = F.cross_entropy(logits, batch["labels"])

#         epoch_val_loss += loss.item()
#         correct_predictions += (predictions == batch["labels"]).sum().item()
#         total_predictions += batch["labels"].size(0)

#     # Record validation loss and accuracy
#     val_losses.append(epoch_val_loss / len(eval_dataloader))
#     val_accuracy = correct_predictions / total_predictions
#     val_accuracies.append(val_accuracy)

#     print(f"Epoch {epoch + 1}/{num_epochs}: train loss {train_losses[-1]:.4f}, val loss {val_losses[-1]:.4f}, val accuracy {val_accuracies[-1]:.4f}")

# # Plotting function
# def eval_plot(train_losses, val_losses, val_accuracies):
#     epochs = range(1, len(train_losses) + 1)

#     plt.figure(figsize=(12, 6))

#     # Training and validation loss
#     plt.subplot(1, 2, 1)
#     plt.plot(epochs, train_losses, label="Training Loss")
#     plt.plot(epochs, val_losses, label="Validation Loss")
#     plt.xlabel('Epoch')
#     plt.ylabel('Loss')
#     plt.legend()
#     plt.title('Training and Validation Loss')

#     # Validation accuracy
#     plt.subplot(1, 2, 2)
#     plt.plot(epochs, val_accuracies, label="Validation Accuracy")
#     plt.xlabel('Epoch')
#     plt.ylabel('Accuracy')
#     plt.legend()
#     plt.title('Validation Accuracy')

#     plt.tight_layout()
#     plt.show()

# # Plot train loss, validation loss, validation accuracy
# eval_plot(train_losses, val_losses, val_accuracies)

# # Print final validation accuracy
# print(f"Final validation accuracy: {val_accuracies[-1]:.4f}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at yash3056/Llama-3.2-1B-imdb and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([2, 2048]) in the checkpoint and torch.Size([3, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]



NotImplementedError: Cannot copy out of meta tensor; no data! Please use torch.nn.Module.to_empty() instead of torch.nn.Module.to() when moving module from meta to a different device.