In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
project_path = '/content/drive/MyDrive/NLP Projects/Sentiment analysis'

In [3]:
import os
import torch
import pandas as pd

# Function to load all saved batches and return separate DataFrames for texts and labels
def load_and_separate_batches(file_prefix):
    directory = f"{project_path}/Dataset/processed_{file_prefix}_data/"

    combined_texts = []
    combined_labels = []

    # Iterate through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.pt'):  # Check if the file is a .pt file
            # Load the batch
            batch_data = torch.load(os.path.join(directory, filename), weights_only=True)
            # Append input_ids (texts) and labels to their respective lists
            combined_texts.append(batch_data['input_ids'])
            combined_labels.append(batch_data['labels'])

    # Concatenate all batches into a single tensor for texts and labels
    combined_texts_tensor = torch.cat(combined_texts, dim=0)
    combined_labels_tensor = torch.cat(combined_labels, dim=0)

    # Convert to DataFrames
    texts_df = pd.DataFrame(combined_texts_tensor.numpy(), columns=[f'input_id_{i}' for i in range(combined_texts_tensor.size(1))])
    labels_df = pd.DataFrame(combined_labels_tensor.numpy(), columns=['labels'])

    return texts_df, labels_df

# Load and separate train, validation, and test datasets
train_texts, train_labels = load_and_separate_batches(file_prefix="train")
validation_texts, validation_labels = load_and_separate_batches(file_prefix="validation")
test_texts, test_labels = load_and_separate_batches(file_prefix="test")

In [11]:
import torch
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    def __init__(self, texts_df, labels_df):
        self.texts = texts_df.values  # Convert DataFrame to numpy array
        self.labels = labels_df.values.flatten()  # Convert to 1D array

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.texts[idx]  # Get the input_ids for the sample
        label = self.labels[idx]  # Get the corresponding label
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),  # Convert to tensor
            'labels': torch.tensor(label, dtype=torch.long)  # Convert to tensor
        }

In [12]:
# Create instances of the SentimentDataset for training and validation
train_dataset = SentimentDataset(train_texts, train_labels)
validation_dataset = SentimentDataset(validation_texts, validation_labels)
test_dataset = SentimentDataset(test_texts, test_labels)

In [13]:
# Example: Accessing a sample from the training dataset
sample = train_dataset[0]
print("Sample Input IDs:", sample['input_ids'])
print("Sample Label:", sample['labels'])

Sample Input IDs: tensor([  101,  6986, 24266,  2038,  2070,  2307, 18012,  1999,  2023,  2338,
         1998,  1045,  4342,  1037,  2843,  2013,  2009,  2174,  2009,  2003,
         2763,  1996,  2087, 16267, 10634,  3191,  1045,  2031,  3191,  1999,
         2086,  2005,  2296,  2048,  5530,  1997,  2524,  2000,  3191,  3430,
         2017,  2131,  2055,  2028,  2030,  2048, 11746,  1997,  2995, 20296,
         1045,  2371,  2066,  1045,  2001,  2067,  1999,  5624, 11818,  3752,
        11771,  2808,  2000,  3046,  1998,  2131,  1996,  2261,  2204,  5167,
         2041,  1997,  2068,  2077,  1996,  3231,  1996,  2168, 16289, 27958,
         2140,  2071,  2031,  2042,  3139,  1999,  2055,  5530,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,    

In [14]:
print(f"Train data DataFrame: {len(train_texts)} - {len(train_labels)}")
print(f"Validation data DataFrame: {len(validation_texts)} - {len(validation_labels)}")
print(f"Test data DataFrame: {len(test_texts)} - {len(test_labels)}")

Train data DataFrame: 950000 - 950000
Validation data DataFrame: 50000 - 50000
Test data DataFrame: 100000 - 100000


In [None]:
import os
import torch
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer

# Disable NCCL P2P and InfiniBand for RTX 4000 series compatibility
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Load pre-trained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir=f'{project_path}/results',
    num_train_epochs=1,  # Set epochs to 1
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'{project_path}/logs',
    logging_steps=2000,  # Change step size from 500 to 2000
    logging_first_step=True,
    eval_strategy="steps",
    eval_steps=2000,  # Change eval step size from 500 to 2000
    save_steps=2000,  # Change save step size from 500 to 2000
    fp16=True if torch.cuda.is_available() else False,
    save_total_limit=2,
    report_to="none",
    learning_rate=5e-5,  # Set learning rate
    gradient_accumulation_steps=2,  # Gradient accumulation to simulate larger batch size
    load_best_model_at_end=True,  # Load the best model based on evaluation metric
    metric_for_best_model="eval_loss"  # Monitor validation loss
)

# Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

# Train the model
trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss
2000,0.3286,0.228453
4000,0.2434,0.230495
6000,0.227,0.282382


In [None]:
trainer.evaluate(eval_dataset=test_dataset)