✅ Step 1: Assume you have email texts, each labeled with one of four tone classes:

0 → Neutral

1 → Passive-Aggressive

2 → Aggressive

3 → Polite

✅ Step 2: Load, Clean & Tokenize Data

In [5]:
from datasets import load_dataset
from transformers import BertTokenizer
import re
import pandas as pd



In [6]:
def clean_text(text):
    """Normalize one email text before tokenization.

    The text is lowercased, every character other than ASCII letters, digits
    and the punctuation ``, . ! ? '`` is replaced by a space, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as missing; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Missing cells must not crash `.apply(clean_text)`; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # Convert to lowercase
    text = text.lower()
    # Remove special characters except basic punctuation
    text = re.sub(r"[^a-zA-Z0-9,.!?']", " ", text)
    # Remove extra whitespace
    return re.sub(r"\s+", " ", text).strip()

In [9]:
from datasets import Dataset 
def load_and_clean_data(filepath):
    """Read a CSV of emails and return it as a HuggingFace ``Dataset``.

    The CSV is loaded with pandas, its ``text`` column is run through
    ``clean_text`` and the result is stored in a new ``cleaned_text`` column;
    the original columns are kept as they are.

    Args:
        filepath: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A ``Dataset`` built from the augmented DataFrame.
    """
    # pandas makes the per-row cleaning a one-liner before handing over
    # to the datasets library.
    emails = pd.read_csv(filepath).assign(
        cleaned_text=lambda frame: frame['text'].apply(clean_text)
    )
    return Dataset.from_pandas(emails)

# Build the cleaned dataset from the labeled email CSV
dataset = load_and_clean_data(filepath='email_tone.csv')

In [10]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def tokenize(batch, max_length=128):
    """Tokenize the ``cleaned_text`` entries of a batch with the BERT tokenizer.

    Args:
        batch: Mapping with a ``cleaned_text`` entry holding the texts to
            encode (this is what ``Dataset.map(..., batched=True)`` passes in).
        max_length: Fixed sequence length. Shorter texts are padded up to it
            and longer ones are truncated down to it. Defaults to 128.

    Returns:
        Whatever the tokenizer returns for the batch; the sample printed by
        this notebook shows ``input_ids``, ``token_type_ids`` and
        ``attention_mask``.
    """
    return tokenizer(
        batch['cleaned_text'],
        padding='max_length',   # Pad every sequence to exactly `max_length` tokens
        truncation=True,        # Cut longer sequences down to `max_length`
        max_length=max_length,
    )

# Process dataset: tokenize every row in batches
tokenized_dataset = dataset.map(tokenize, batched=True)

# Hold out 20% for evaluation. The seed is fixed so the same rows land in
# each split on every run, which keeps the reported metrics comparable.
tokenized_data = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Verify
print(tokenized_data['train'][0])

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

{'text': 'Were you even paying attention?', 'label': 2, 'cleaned_text': 'were you even paying attention?', 'input_ids': [101, 2020, 2017, 2130, 7079, 3086, 1029, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0

✅ Step 4: Load Model

In [11]:
from transformers import BertForSequenceClassification

# Human-readable names for the class ids listed in Step 1. Storing them in
# the model config makes saved checkpoints report these names instead of
# generic LABEL_0..LABEL_3.
ID2LABEL = {0: "Neutral", 1: "Passive-Aggressive", 2: "Aggressive", 3: "Polite"}
LABEL2ID = {name: class_id for class_id, name in ID2LABEL.items()}

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Step 5: Training Setup

In [12]:

from transformers import (TrainingArguments, 
                         Trainer,)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``accuracy`` and ``f1``.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    metrics['f1'] = f1_score(gold, predicted, average='weighted')
    return metrics

# 6. Training Configuration
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Use whichever keyword the installed TrainingArguments accepts, so this cell
# runs on both older and newer versions.
_accepted_args = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _accepted_args else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./bert-tone-results",        # Directory to save results
    save_strategy="epoch",                   # Save a checkpoint after each epoch
    learning_rate=2e-5,                      # Learning rate for fine-tuning
    per_device_train_batch_size=8,           # Batch size for training
    per_device_eval_batch_size=8,            # Batch size for evaluation
    num_train_epochs=3,                      # Number of training epochs
    weight_decay=0.01,                       # Regularization
    load_best_model_at_end=True,             # Reload the best checkpoint when done
    metric_for_best_model='f1',              # 'f1' key from compute_metrics picks the best
    **{_eval_strategy_key: "epoch"},         # Evaluate after each epoch
)

# 7. Trainer Initialization
# Pull the two splits out first so the constructor call reads cleanly.
train_split, eval_split = tokenized_data['train'], tokenized_data['test']

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# 8. Start Training
trainer.train()

RuntimeError: Failed to import transformers.trainer because of the following error (look up to see its traceback):
Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [13]:
pip install tf-keras

^C
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\ALBIN JOHN\\AppData\\Roaming\\Python\\Python312\\site-packages\\tensorflow\\compiler\\mlir\\lite\\python\\_pywrap_converter_api.pyd'
Check the permissions.



Defaulting to user installation because normal site-packages is not writeable
Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl.metadata (4.1 kB)
Collecting tensorboard~=2.19.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting ml-dtypes<1.0.0,>=0.5.1 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading ml_dtypes-0.5.1-cp312-cp312-win_amd64.whl.metadata (22 kB)
Downloading tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------ --------------------- 0.8/1.7 MB 8.3 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 11.7 MB/s eta 0:00:00
Downloading tensorflow-2.19.0-cp312-cp312-win_amd64.whl (376.0 MB)
   ---------------------------------------- 0.0/376.0 MB ? eta -:--:--
   -------