In [7]:
import torch    # type: ignore
import pandas as pd  # type: ignore
from transformers import BertTokenizer, BertForSequenceClassification       # type: ignore
from torch.utils.data import DataLoader, Dataset                                  # type: ignore
from tqdm import tqdm           # type: ignore

In [8]:
# Set the file path and chunk size for large dataset processing
file_path = "multiclass_dataset.csv"  # Replace with your actual file path
chunk_size = 10000  # Define an appropriate chunk size for memory management

# Set device for GPU usage and enable mixed precision if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.float16 if torch.cuda.is_available() else torch.float32  # Use float16 if GPU is available

In [9]:
# Load BERT tokenizer and sentiment model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = torch.nn.DataParallel(model)  # Enable multi-GPU if available
model.to(device).to(dtype)
model.eval()

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(105879, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSdpaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=

In [10]:
# Function to tokenize texts in a batch
def batch_tokenize(texts):
    return tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device).to(dtype)

# Function to predict sentiment for a batch of texts
def batch_predict_sentiment(batch_texts):
    inputs = batch_tokenize(batch_texts)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)
    predicted_classes = torch.argmax(probabilities, dim=1).tolist()

    return predicted_classes  # Return predicted classes directly

In [13]:
# Process the dataset in chunks and save results incrementally
output_file = "updating_" + file_path  # Output file name
for chunk_id, df_chunk in enumerate(pd.read_csv(file_path, chunksize=chunk_size)):
    # Define a dataset and dataloader for the current chunk
    class TextDataset(Dataset):
        def __init__(self, texts):
            self.texts = texts

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            return self.texts[idx]

    dataset = TextDataset(df_chunk['Text'].tolist())
    batch_size = 8  # Smaller batch size for large datasets and FP16 mode
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # Lists to collect labels and sentiment for this chunk
    all_labels = []
    all_sentiments = []

    # Process each batch within the current chunk
    for batch_texts in tqdm(dataloader, desc=f"Processing chunk {chunk_id + 1}", unit="batch"):
        predicted_classes = batch_predict_sentiment(batch_texts)

        # Adjust labels based on the predictions
        for index, pred_class in enumerate(predicted_classes):
            original_label = df_chunk['Label'].iloc[index]  # Get the original label
            if original_label == 1:  # If the original label is Neutral (1)
                if pred_class == 0:  # If the model predicts it as Negative (0)
                    all_labels.append(0)  # Change to Negative (0)
                    all_sentiments.append("Negative")
                else:
                    all_labels.append(1)  # Keep as Neutral (1)
                    all_sentiments.append("Neutral")
            else:
                all_labels.append(original_label)  # Keep other labels (0 or 2)
                if original_label == 0:
                    all_sentiments.append("Negative")
                elif original_label == 2:
                    all_sentiments.append("Positive")
    # Append processed sentiment results to the current DataFrame chunk
    df_chunk['Sentiment'] = all_sentiments
    df_chunk['Label'] = all_labels

    # Save each chunk to the output CSV incrementally
    if chunk_id == 0:
        df_chunk.to_csv(output_file, index=False, mode='w')  # Write header for the first chunk
    else:
        df_chunk.to_csv(output_file, index=False, mode='a', header=False)  # Append without header

print("Processing complete. All chunks saved to", output_file)

Processing chunk 1:   0%|          | 0/1250 [00:00<?, ?batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.


Processing chunk 1:   0%|          | 1/1250 [00:00<05:23,  3.86batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.
Processing chunk 1:   0%|          | 2/1250 [00:00<06:00,  3.46batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.
Processing chunk 1:   0%|          | 3/1250 [00:00<05:46,  3.60batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.
Processing chunk 1:   0%|          | 4/1250 [00:01<07:21,  2.82batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.
Processing chunk 1:   0%|          | 5/1250 [00:01<07:33,  2.74batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.
Processing chunk 1:   0%|          | 6/1250 [00:01<06:39,  3.12batch/s]Attempting to cast a BatchEncoding to type torch.float32. This is not supported.
Processing chunk 1:   1%|          | 7/1250 [00:02<05:48,  3.57batch/s]Attempting to cas

Processing complete. All chunks saved to updating_multiclass_dataset.csv



