In [6]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pickle

# Set random seed for reproducibility
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import pickle

# Set random seed for reproducibility
seed_val = 42
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the training and test datasets
train_data = pd.read_csv('/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/cleaned_dataset.csv')
test_data = pd.read_csv('/Users/devshah/Documents/WorkSpace/University/year 3/CSC493/emphatic-AI-Winter2025/ambiguity_model/cleaned_dataset_test.csv')

# Remove any rows with NaN values
train_data = train_data.dropna(subset=['sentence', 'label'])
test_data = test_data.dropna(subset=['sentence', 'label'])

# Basic data exploration
print("Training dataset shape:", train_data.shape)
print("Test dataset shape:", test_data.shape)

# Check the distribution of labels
print("\nTraining label distribution:")
print(train_data['label'].value_counts())
print(train_data['label'].value_counts(normalize=True))

# Map labels to integers
label_dict = {'INNOCUOUS': 0, 'NOCUOUS': 1}
train_data['label_id'] = train_data['label'].map(label_dict)
test_data['label_id'] = test_data['label'].map(label_dict)

# Prepare features (X) and target (y)
X_train = train_data['sentence'].values
y_train = train_data['label_id'].values
X_test = test_data['sentence'].values
y_test = test_data['label_id'].values

print("\nTraining set size:", len(X_train))
print("Testing set size:", len(X_test))

# Rest of your code remains the same, starting from:
# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Your existing encode_sentences function and the rest of the code...

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize and encode sequences
def encode_sentences(texts, tokenizer, max_length=128):
    input_ids = []
    attention_masks = []
    
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    
    return input_ids, attention_masks

# Encode training and testing sets
train_input_ids, train_attention_masks = encode_sentences(X_train, tokenizer)
test_input_ids, test_attention_masks = encode_sentences(X_test, tokenizer)

# Convert targets to tensors - no need to modify the shape here
train_labels = torch.tensor(y_train, dtype=torch.long)
test_labels = torch.tensor(y_test, dtype=torch.long)

# Create data loaders
batch_size = 16

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(
    test_dataset,
    sampler=SequentialSampler(test_dataset),
    batch_size=batch_size
)

# Modified training function
def train_model(model, dataloader, optimizer, scheduler, device):
    total_loss = 0
    model.train()
    
    for step, batch in enumerate(dataloader):
        if step % 40 == 0 and step != 0:
            print(f'  Batch {step}  of  {len(dataloader)}')
        
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        model.zero_grad()
        
        # Forward pass - removing token_type_ids parameter
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_attention_mask,
            labels=b_labels
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        loss.backward()
        
        # Clip the norm of the gradients to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        optimizer.step()
        scheduler.step()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Modified evaluation function
def evaluate_model(model, dataloader, device):
    model.eval()
    predictions = []
    true_labels = []
    
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_attention_mask, b_labels = batch
        
        with torch.no_grad():
            # Forward pass - removing token_type_ids parameter
            outputs = model(
                input_ids=b_input_ids,
                attention_mask=b_attention_mask
            )
        
        logits = outputs.logits
        
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids.flatten())
    
    return predictions, true_labels

# Load pre-trained BERT model with correct configuration
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Binary classification
    output_attentions=False,
    output_hidden_states=False,
)

model.to(device)

# Set up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 20
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Train the model
print("Training BERT model...")
for epoch in range(epochs):
    print(f'======== Epoch {epoch + 1} / {epochs} ========')
    print('Training...')
    
    avg_train_loss = train_model(model, train_dataloader, optimizer, scheduler, device)
    print(f"  Average training loss: {avg_train_loss:.2f}")
    
    print("Running Validation...")
    predictions, true_labels = evaluate_model(model, test_dataloader, device)
    
    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predictions)
    print(f"  Accuracy: {accuracy:.4f}")

# Final evaluation
print("\nEvaluating on test set...")
predictions, true_labels = evaluate_model(model, test_dataloader, device)

# Convert numeric predictions back to text labels
label_map = {v: k for k, v in label_dict.items()}
pred_labels = [label_map[pred] for pred in predictions]
true_labels_text = [label_map[label] for label in true_labels]

# Print metrics
print("\nAccuracy:", accuracy_score(true_labels, predictions))
print("\nClassification Report:")
print(classification_report(true_labels_text, pred_labels))


print("Model, tokenizer, and label dictionary saved successfully.")

Using device: cpu
Training dataset shape: (140, 4)
Test dataset shape: (74, 4)

Training label distribution:
label
NOCUOUS        73
INNOCUOUS      66
Detected as     1
Name: count, dtype: int64
label
NOCUOUS        0.521429
INNOCUOUS      0.471429
Detected as    0.007143
Name: proportion, dtype: float64

Training set size: 140
Testing set size: 74


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training BERT model...
Training...
  Average training loss: 0.68
Running Validation...
  Accuracy: 0.6081
Training...
  Average training loss: 0.64
Running Validation...
  Accuracy: 0.5946
Training...
  Average training loss: 0.57
Running Validation...
  Accuracy: 0.6351
Training...
  Average training loss: 0.48
Running Validation...
  Accuracy: 0.6081
Training...
  Average training loss: 0.39
Running Validation...
  Accuracy: 0.6216
Training...
  Average training loss: 0.31
Running Validation...
  Accuracy: 0.5946
Training...
  Average training loss: 0.20
Running Validation...
  Accuracy: 0.6216
Training...
  Average training loss: 0.12
Running Validation...
  Accuracy: 0.6216
Training...
  Average training loss: 0.07
Running Validation...
  Accuracy: 0.6216
Training...
  Average training loss: 0.06
Running Validation...
  Accuracy: 0.6351
Training...
  Average training loss: 0.05
Running Validation...
  Accuracy: 0.6081
Training...
  Average training loss: 0.05
Running Validation...
