In [1]:
 import argparse
 
 import torch
 from torch.optim import AdamW
 from torch.utils.data import DataLoader
 from torch import nn
 
 
 import evaluate
 from accelerate import Accelerator, DistributedType
 from datasets import load_dataset
 from transformers import AutoModelForSequenceClassification, BertForSequenceClassification, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
 
 import numpy as np
 import pandas as pd
 

In [10]:
def process_data(input_file, output_file, class_limit=694):
  """
  Processes a text file with comma-separated strings and labels, keeping 694 strings per class.

  Args:
    input_file: Path to the input text file.
    output_file: Path to the output text file.
    class_limit: Maximum number of strings to keep per class (default: 694).

  Returns:
    The path to the output file.
  """
  class_counts = {str(i): 0 for i in range(5)}

  with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
    for line in infile:
      # Remove leading/trailing whitespace (including newlines)
      line = line.strip()
      if line:  # Check if line is not empty after stripping
        string, label = line.rsplit(',', 1)
        label = str(int(label))

        if class_counts[label] < class_limit:
          outfile.write(f"{string},{label}\n")
          class_counts[label] += 1

  return output_file




In [11]:
if __name__ == "__main__":
  input_file = "../raw_data/final_data/labeled.txt"  # Replace with your input file path
  output_file = "../raw_data/final_data/balanced_labeled.txt"  # Replace with your desired output file path
  processed_file = process_data(input_file, output_file)
  print(f"Data processed and saved to: {processed_file}")

Data processed and saved to: ../raw_data/final_data/balanced_labeled.txt


In [18]:
import random

def split_dataset(input_file, output_file1, output_file2):
  """
  Splits a dataset text file into two new files with an 80/20 split.

  Args:
    input_file: Path to the input text file containing the dataset.
    output_file1: Path to the first output file (80% of the data).
    output_file2: Path to the second output file (20% of the data).
  """
  with open(input_file, 'r') as infile, open(output_file1, 'w') as outfile1, open(output_file2, 'w') as outfile2:
    for line in infile:
      # Split the line based on the last comma (assuming class label)
      text, label = line.rsplit(",", 1)
      # Randomly choose output file with 80% probability for file1
      if random.random() < 0.8:
        outfile1.write(f"{text},{label}")
      else:
        outfile2.write(f"{text},{label}")

# Example usage
input_file = "../raw_data/final_data/balanced_labeled.txt" # Replace with your input file path
train_file = "../raw_data/final_data/labeled/train_labeled.txt"  # Replace with your desired output training file path
test_file = "../raw_data/final_data/labeled/test_labeled.txt"   # Replace with your desired output test file path
split_dataset(input_file, train_file, test_file)
print(f"Dataset split into {train_file} and {test_file}")

Dataset split into ../raw_data/final_data/train_labeled.txt and ../raw_data/final_data/test_labeled.txt


In [19]:
 class TextClassificationDataset(torch.utils.data.Dataset):
 
     def __init__(self, filename):
         self.text = []
         self.labels = []
         with open(filename, 'r') as f:
           for line in f:
             text, label = self.split_by_last_comma(line)
             self.text.append(text)
             self.labels.append(torch.tensor(int(label[0])))
         self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
         self.max_length = 100
 
     def __len__(self):
         return len(self.text)
 
     def __getitem__(self, idx):
         text = self.text[idx]
         label = self.labels[idx]
         encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', 
                                   truncation=True)
         return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(),
                 'label': label}
 
     def split_by_last_comma(self, text):
       return text.rsplit(",", 1) if "," in text else None

In [21]:
train_file = "../raw_data/final_data/labeled/train_labeled.txt"  # Replace with your desired output training file path
test_file = "../raw_data/final_data/labeled/test_labeled.txt"   # Replace with your desired output test file path
train_dataset = TextClassificationDataset(train_file)
eval_dataset = TextClassificationDataset(test_file)

In [29]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=32, shuffle=True)

In [30]:
device = "mps"
model_name = "bert-base-uncased"  # Replace with your desired BERT model
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)  # Adjust num_labels for your classification task
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)  # Adjust learning rate (lr)
model.device

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


device(type='mps', index=0)

In [31]:
from tqdm.auto import tqdm

from sklearn.metrics import f1_score

def evaluate(model, eval_dataloader):
  model.eval()  # Set model to evaluation mode
  num_correct = 0
  num_samples = 0
  f1_scores = torch.zeros(4)
  with torch.no_grad():  # Disable gradient calculation for evaluation
    for batch in eval_dataloader:
      #input_ids, attention_mask, labels = batch
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["label"].to(device) 
      outputs = model(input_ids, attention_mask=attention_mask)
      predictions = torch.argmax(outputs.logits, dim=1)  # Get predicted class labels
      print(labels)
      print(predictions)

      num_correct += torch.sum(predictions == labels).item()
      num_samples += labels.size(0)  # Count total samples
      for i in range(4):  # Assuming 4 classes (0 to 3)
        f1_scores[i] += f1_score(labels.cpu() == i, predictions.cpu() == i, average='binary')
  eval_acc = num_correct / num_samples
  f1_scores /= len(eval_dataloader)
  print(f"Evaluation Accuracy: {eval_acc:.4f}")
  print(f"F1 Scores: {f1_scores.tolist()}")
  return eval_acc  # Optional: Return accuracy for potential early stopping

# eval_acc = evaluate(model, train_dataloader)  # Evaluate on validation set
# # Optional: Early stopping based on evaluation metric (e.g., eval_acc)

for epoch in range(1):
  with tqdm(train_dataloader, unit="batch") as pbar:
    model.train()
    for batch in pbar:
      #print(batch["input_ids"])
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["label"].to(device)
      outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
      predictions = torch.argmax(outputs.logits, dim=1)
      #print(len(labels))
      print("labels " + str(labels))
      #print(len(predictions))
      print("predictions: " + str(predictions))
      loss = outputs.loss
      loss.backward()
      optimizer.step()
      optimizer.zero_grad()

      pbar.set_description(f"Epoch {epoch+1} - Loss: {loss.item():.4f}")

  # Evaluation step
  eval_acc = evaluate(model, train_dataloader)  # Evaluate on validation set
  # Optional: Early stopping based on evaluation metric (e.g., eval_acc)
  pbar.set_description(f"Epoch {epoch+1} - Loss: {loss.item():.4f} - Eval Acc: {eval_acc:.4f}")

  0%|          | 0/139 [00:00<?, ?batch/s]

labels tensor([0, 0, 1, 1, 2, 0, 2, 2, 3, 1, 1, 1, 0, 3, 3, 0], device='mps:0')
predictions: tensor([0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2], device='mps:0')
labels tensor([0, 3, 0, 2, 2, 2, 0, 2, 2, 1, 0, 2, 3, 2, 1, 2], device='mps:0')
predictions: tensor([2, 2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2], device='mps:0')
labels tensor([3, 3, 3, 2, 0, 3, 2, 2, 1, 1, 0, 1, 2, 2, 1, 0], device='mps:0')
predictions: tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 2], device='mps:0')
labels tensor([3, 3, 0, 0, 0, 0, 0, 3, 1, 3, 3, 0, 2, 1, 2, 2], device='mps:0')
predictions: tensor([2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 1, 2, 1, 2, 2], device='mps:0')
labels tensor([3, 1, 1, 0, 3, 0, 3, 0, 0, 1, 1, 3, 1, 3, 1, 3], device='mps:0')
predictions: tensor([2, 1, 1, 2, 0, 2, 2, 2, 2, 2, 3, 3, 1, 3, 1, 2], device='mps:0')
labels tensor([2, 0, 2, 0, 1, 2, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2], device='mps:0')
predictions: tensor([2, 2, 2, 0, 1, 3, 1, 1, 3, 1, 2, 3, 2, 3, 1, 2], device='mps:0')
labe

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


tensor([0, 1, 0, 3, 3, 3, 3, 3, 2, 1, 1, 2, 2, 2, 2, 1], device='mps:0')
tensor([0, 1, 0, 3, 3, 3, 2, 3, 2, 1, 1, 2, 2, 2, 2, 1], device='mps:0')
tensor([2, 0, 3, 3, 3, 2, 0, 2, 2, 1, 2, 3, 0, 1, 0, 0], device='mps:0')
tensor([2, 0, 3, 3, 3, 2, 0, 2, 2, 1, 2, 3, 0, 1, 0, 0], device='mps:0')
tensor([0, 2, 0, 0, 0, 1, 1, 0, 2, 2, 3, 1, 3, 2, 1, 1], device='mps:0')
tensor([0, 2, 0, 0, 0, 1, 1, 0, 2, 2, 3, 1, 3, 2, 1, 1], device='mps:0')
tensor([3, 3, 2, 0, 3, 2, 3, 2, 2, 0, 0, 1, 1, 2, 1, 0], device='mps:0')
tensor([3, 3, 2, 0, 3, 2, 3, 2, 3, 0, 0, 1, 1, 2, 1, 0], device='mps:0')
tensor([2, 1, 0, 3, 2, 0, 1, 2, 1, 3, 2, 3, 3, 2, 1, 2], device='mps:0')
tensor([2, 1, 0, 3, 2, 0, 1, 2, 1, 3, 2, 3, 3, 2, 1, 3], device='mps:0')
tensor([1, 1, 1, 2, 0, 0, 2, 2, 3, 3, 1, 1, 3, 2, 3, 3], device='mps:0')
tensor([1, 1, 1, 2, 0, 0, 2, 2, 3, 3, 1, 1, 3, 2, 3, 3], device='mps:0')
tensor([1, 0, 2, 1, 1, 3, 1, 3, 1, 1, 1, 0, 0, 0, 2, 3], device='mps:0')
tensor([1, 0, 2, 1, 1, 3, 1, 3, 1, 1, 1, 0, 0, 0, 2

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


tensor([2, 2, 2, 1, 1, 0, 3, 1, 2, 0, 0, 1, 1, 1, 3, 0], device='mps:0')
tensor([2, 2, 2, 1, 1, 0, 3, 1, 2, 0, 0, 1, 1, 1, 3, 0], device='mps:0')
tensor([0, 2, 2, 2, 3, 2, 0, 2, 2, 1, 1, 3, 0, 3, 2, 2], device='mps:0')
tensor([0, 2, 2, 2, 3, 2, 0, 2, 2, 1, 1, 3, 0, 3, 2, 2], device='mps:0')
tensor([0, 2, 3, 0, 0, 0, 3, 2, 3, 0, 2, 1, 1, 2, 0, 0], device='mps:0')
tensor([0, 2, 3, 0, 0, 0, 3, 2, 3, 0, 2, 1, 1, 2, 0, 0], device='mps:0')
tensor([1, 1, 1, 3, 0, 3, 0, 2, 1, 2, 2, 1, 2, 3, 2, 1], device='mps:0')
tensor([1, 1, 1, 3, 0, 3, 0, 2, 1, 3, 2, 1, 2, 3, 2, 1], device='mps:0')
tensor([1, 1, 2, 1, 3, 1, 1, 2, 1, 3, 3, 0, 2, 0, 2, 2], device='mps:0')
tensor([1, 1, 2, 1, 3, 1, 1, 2, 1, 3, 3, 0, 2, 0, 2, 2], device='mps:0')
tensor([1, 2, 2, 2, 2, 0, 0, 1, 0, 3, 2, 2, 3, 1, 3, 2], device='mps:0')
tensor([1, 2, 2, 2, 2, 0, 0, 1, 0, 3, 2, 2, 3, 1, 3, 2], device='mps:0')
tensor([1, 2, 3, 2, 3, 3, 1, 1, 1, 2, 2, 1, 3, 1, 0, 1], device='mps:0')
tensor([1, 2, 3, 2, 3, 3, 1, 1, 1, 2, 2, 1, 3, 1, 0

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


tensor([0, 0, 3, 3, 1, 3, 3, 1, 2, 3, 2, 2, 1, 0, 3, 1], device='mps:0')
tensor([0, 0, 3, 3, 1, 3, 3, 1, 2, 3, 2, 2, 1, 0, 3, 1], device='mps:0')
tensor([3, 2, 1, 3, 1, 0, 0, 0, 2, 3, 1, 3, 0, 3, 3, 3], device='mps:0')
tensor([3, 2, 1, 3, 1, 0, 0, 0, 2, 3, 1, 3, 0, 3, 3, 3], device='mps:0')
tensor([3, 2, 2, 1, 0, 0, 0, 3, 0, 1, 2, 2, 2, 3, 1, 0], device='mps:0')
tensor([3, 2, 2, 1, 0, 0, 0, 3, 0, 1, 3, 2, 2, 3, 1, 0], device='mps:0')
tensor([0, 0, 2, 2, 2, 3, 3, 3, 2, 1, 0, 2, 1, 2, 3, 2], device='mps:0')
tensor([0, 0, 2, 2, 2, 3, 3, 3, 2, 1, 0, 2, 1, 2, 3, 2], device='mps:0')
tensor([0, 1, 1, 2, 3, 3, 2, 2, 0, 0, 1, 3, 1, 2, 0, 1], device='mps:0')
tensor([0, 1, 1, 2, 3, 3, 2, 2, 0, 0, 1, 3, 1, 2, 0, 1], device='mps:0')
tensor([1, 0, 1, 1, 3, 3, 2, 3, 3, 0, 0, 1, 2, 0, 2, 0], device='mps:0')
tensor([1, 0, 1, 1, 3, 3, 2, 3, 3, 0, 0, 1, 2, 0, 2, 0], device='mps:0')
tensor([0, 0, 2, 0, 0, 1, 0, 3, 1, 2, 0, 3, 3, 2, 1, 2], device='mps:0')
tensor([0, 0, 2, 0, 0, 1, 0, 3, 1, 2, 0, 3, 3, 2, 1