In [None]:
# Install necessary packages
!pip install transformers torch scikit-learn

import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# Custom dataset class for the incident descriptions
class IncidentDataset(Dataset):
    """Map-style dataset pairing incident descriptions with labels.

    Each item is tokenized on access and returned as a dict containing the
    raw text, the flattened ``input_ids`` and ``attention_mask`` tensors
    produced by the tokenizer, and the label as a float tensor.

    Args:
        texts: Indexable, sized collection of description strings.
        labels: Indexable, sized collection of labels aligned with ``texts``.
        tokenizer: Callable tokenizer that accepts the Hugging Face keyword
            arguments used in ``__getitem__`` and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length each example is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail fast: a mismatch would otherwise only show up as an
        # IndexError somewhere in the middle of an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize and return the example stored at ``index``."""
        text = self.texts[index]
        label = self.labels[index]
        # Call the tokenizer directly; ``encode_plus`` is deprecated in
        # favour of ``__call__`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            # return_tensors='pt' adds a leading batch axis; flatten it away
            # so the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float labels, as required by BCEWithLogitsLoss.
            'labels': torch.tensor(label, dtype=torch.float),
        }

# Pretrained SciBERT checkpoint shared by the tokenizer and the encoder
MODEL_NAME = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

max_len = 256  # Adjust based on your text length

# Features (descriptions) and targets taken from the dataframe
X = df['Processed_Description'].values
y = df['MI_Incident'].values

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

train_dataset = IncidentDataset(X_train, y_train, tokenizer, max_len)
test_dataset = IncidentDataset(X_test, y_test, tokenizer, max_len)

# Only the training batches are shuffled
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define a classifier based on SciBERT
class SciBERTClassifier(torch.nn.Module):
    """Linear classification head on top of a BERT-style encoder.

    The hidden state at sequence position 0 (the CLS token) is passed
    through dropout and a single linear layer to produce raw logits.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an
            object with a ``last_hidden_state`` attribute when called with
            ``input_ids`` and ``attention_mask``.
        num_labels: Number of output logits per example.
        dropout_prob: Dropout probability applied to the CLS representation
            before the linear layer.
    """

    def __init__(self, bert_model, num_labels=1, dropout_prob=0.3):
        super().__init__()
        self.bert = bert_model
        self.dropout = torch.nn.Dropout(dropout_prob)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return logits with one row per example and ``num_labels`` columns."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # CLS token output
        cls_output = self.dropout(cls_output)
        return self.classifier(cls_output)

# Wrap the pretrained encoder with the classification head
model = SciBERTClassifier(model).to(device)

# Use weighted BinaryCrossEntropy for imbalanced classes.
# class_weights is interpreted here as [negative-class, positive-class] weights.
class_weights = torch.tensor([1.0, 5.0]).to(device)  # Adjust the weights based on class imbalance
# pos_weight must have one entry per output logit (a single one for this
# binary head). Passing the 2-element tensor directly would broadcast the
# loss to (batch, 2) instead of up-weighting positives, so reduce it to the
# positive/negative ratio.
pos_weight = (class_weights[1] / class_weights[0]).reshape(1)
criterion = BCEWithLogitsLoss(pos_weight=pos_weight)

optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

# Fine-tune the classifier, reporting the mean batch loss after every epoch
for epoch in range(epochs):
    model.train()
    batch_losses = []

    for batch in tqdm(train_loader):
        # Move the batch to the training device; labels gain a trailing
        # axis so they line up with the (batch, 1) logits.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).unsqueeze(1)

        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        batch_losses.append(loss.item())

        # Backward pass on freshly cleared gradients, then update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f}')

# Define the evaluation function for unique misclassification checking
def evaluate(loader, set_name="Test"):
    """Print accuracy and the unique misclassified texts for ``loader``.

    Uses the module-level ``model`` and ``device``. The model is put into
    evaluation mode for the duration of the call, so dropout is disabled,
    and its previous train/eval mode is restored afterwards.

    Args:
        loader: Iterable of batches with ``'text'``, ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.
        set_name: Label used in the printed report.
    """
    y_preds = []
    y_true = []
    misclassified = {}

    # Without eval mode the dropout layers stay active, which makes the
    # predictions stochastic and the reported accuracy unreliable.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                logits = model(input_ids=input_ids, attention_mask=attention_mask)
                # Apply sigmoid to get probabilities
                probs = torch.sigmoid(logits).cpu().numpy().flatten()
                # Threshold at 0.5 for binary classification
                preds = (probs > 0.5).astype(int)
                true_labels = batch['labels'].cpu().numpy().flatten()

                y_preds.extend(preds)
                y_true.extend(true_labels)

                # Identify misclassified samples; keying by text keeps a
                # single entry per distinct description.
                for text, pred, true, prob in zip(batch['text'], preds, true_labels, probs):
                    if pred != true:
                        misclassified[text] = (pred, true, prob)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    accuracy = accuracy_score(y_true, y_preds)
    print(f'{set_name} Accuracy: {accuracy * 100:.2f}%')

    # Print unique misclassified samples
    if misclassified:
        print(f"\nUnique Misclassified Samples in {set_name} Set:")
        for text, (pred, true, prob) in misclassified.items():
            print(f"Text: {text}\nPredicted Label: {pred}, True Label: {true}, Probability: {prob:.4f}\n")

# Evaluate on test set
evaluate(test_loader, set_name="Test")

# Evaluate on training set
evaluate(train_loader, set_name="Train")

# Save the fine-tuned encoder and the tokenizer
model.bert.save_pretrained('scibert_cls_model')
tokenizer.save_pretrained('scibert_cls_tokenizer')

# save_pretrained on model.bert stores only the encoder. Persist the trained
# linear head as well, otherwise the classifier cannot be restored.
torch.save(model.classifier.state_dict(), 'scibert_cls_model/classifier_head.pt')
