In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification, pipeline
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Device index handed to the NER pipeline below: 0 when CUDA is available,
# otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Token budget per sequence: process_file sizes its chunks from this value and
# tokenize_texts uses it as the default max_length.
MAX_SEQ_LENGTH = 128

# Load the NER tokenizer/model ("dslim/bert-base-NER") once and wrap them in a
# pipeline; extract_entities_and_anonymize calls this module-level pipeline.
ner_tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
ner_model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
ner_pipeline = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, device=device)

# NER preprocessing function
def extract_entities_and_anonymize(text):
    """Replace every entity found by the NER pipeline with its bracketed tag.

    Args:
        text: Raw input string.

    Returns:
        A ``(anonymized_text, extracted_entities)`` tuple.  ``anonymized_text``
        is ``text`` with each detected span replaced by ``"[<entity tag>] "``
        (note the trailing space); ``extracted_entities`` lists the ``word``
        of every pipeline result, in the order the pipeline returned them.
    """
    ner_results = ner_pipeline(text)
    extracted_entities = [result['word'] for result in ner_results]

    # The start/end offsets index into the ORIGINAL text.  Every replacement
    # changes the string length, so spans are applied right-to-left: editing
    # the tail first leaves the offsets of all earlier spans valid.
    # Results without character offsets are skipped, because slicing with
    # None would duplicate the whole text instead of replacing a span.
    spans = [
        (result['start'], result['end'], result['entity'])
        for result in ner_results
        if result['start'] is not None and result['end'] is not None
    ]
    for start, end, label in sorted(spans, key=lambda span: span[0], reverse=True):
        text = text[:start] + f"[{label}] " + text[end:]

    return text, extracted_entities

# Function to process a single text file and return sequences with labels for a specific tokenizer
def process_file(file_path, label, tokenizer):
    """Read one text file, anonymize it and cut it into labeled token chunks.

    Args:
        file_path: Path of a UTF-8 text file.
        label: Class label attached to every chunk of this file.
        tokenizer: Tokenizer used to turn the text into input IDs; it must be
            callable with ``add_special_tokens=False`` and provide
            ``num_special_tokens_to_add``.

    Returns:
        A list of ``(chunk_input_ids, label, file_path)`` tuples; empty when
        the file yields no tokens.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Apply NER anonymization (the extracted entity list is not needed here).
    # NOTE(review): the whole file is passed to the NER pipeline in a single
    # call - confirm the pipeline copes with inputs longer than the NER
    # model's maximum sequence length.
    text, _ = extract_entities_and_anonymize(text)

    # Tokenize the entire text without special tokens
    input_ids = tokenizer(text, add_special_tokens=False)['input_ids']

    # The chunks are later decoded and re-encoded by tokenize_texts with
    # max_length=MAX_SEQ_LENGTH and truncation enabled, where the tokenizer
    # adds its special tokens again.  Reserve room for them so the tail of a
    # full chunk is not truncated away.
    reserved = tokenizer.num_special_tokens_to_add(pair=False)
    chunk_size = max(1, MAX_SEQ_LENGTH - reserved)

    # Attach label and file name to each chunk
    return [
        (input_ids[offset:offset + chunk_size], label, file_path)
        for offset in range(0, len(input_ids), chunk_size)
    ]

# Function to process multiple files for a specific category
def process_category(file_paths, label, tokenizer):
    """Run process_file over every path of one category.

    Args:
        file_paths: Iterable of file paths that all share ``label``.
        label: Class label forwarded to process_file.
        tokenizer: Tokenizer forwarded to process_file.

    Returns:
        ``(all_sequences, file_sequence_counts)``: the concatenated labeled
        sequences in file order, and a dict mapping each path to the number of
        sequences it produced.
    """
    # Process the files one after another, remembering which path produced what
    per_file = [(path, process_file(path, label, tokenizer)) for path in file_paths]

    # Flatten in file order
    all_sequences = [sequence for _, produced in per_file for sequence in produced]

    # Per-file chunk counts (a repeated path keeps the count of its last run)
    file_sequence_counts = {path: len(produced) for path, produced in per_file}

    return all_sequences, file_sequence_counts

# Function to convert sequences into a DataFrame
def sequences_to_dataframe(sequences, tokenizer):
    """Decode labeled token sequences into a two-column DataFrame.

    Args:
        sequences: Indexable entries whose element 0 holds the token IDs and
            element 1 the label (any further elements are ignored).
        tokenizer: Object whose ``decode`` turns token IDs back into text.

    Returns:
        DataFrame with the decoded text in ``content`` and the label in
        ``category``, one row per sequence.
    """
    decoded_chunks = list(map(lambda entry: tokenizer.decode(entry[0]), sequences))
    chunk_labels = list(map(lambda entry: entry[1], sequences))
    return pd.DataFrame(dict(content=decoded_chunks, category=chunk_labels))

# File lists for the test, validation and training splits.  The _0 / _1 suffix
# is the class label that prepare_data passes to process_category for that list.
# NOTE(review): these lists, not the directory names, decide which split a file
# belongs to - e.g. the label-0 test file is stored under ./data/train/ and
# some training files under ./data/test/ and ./data/valid/.  Confirm this is
# intentional.
test_files_0 = ["./data/train/0_The_Hound_of_the_Baskervilles_CD.txt"]
test_files_1 = ["./data/test/1_POIROT_INVESTIGATES.txt"]

valid_files_0 = ["./data/train/0_The_Man_Who_Was_Thursday_GKC.txt"]
valid_files_1 = ["./data/train/1_THE_BIG_FOUR.txt"]

train_files_0 = [
    "./data/train/0_The_Mystery_of_the_Yellow_Room_GL.txt",
    "./data/train/0_The_Middle_Temple_Murder_JF.txt",
    "./data/train/0_JOHN_THORNDYKE'S_CASES_RF.txt",
    "./data/train/0_A_Study_in_Scarlet_CD.txt",
    "./data/train/0_Tremendous_Trifles_GKC.txt",
    "./data/test/0_Caught_in_the_Net_EG.txt",
    "./data/valid/0_The_Red_House_Mystery_AM.txt"
]

train_files_1 = [
    "./data/train/1_The_Secret_of_Chimneys.txt",
    "./data/train/1_The_Mystery_of_the_Blue_Train.txt",
    "./data/train/1_The_Mysterious_Affair_at_Styles.txt",
    "./data/train/1_The_Murder_on_the_Links.txt",
    "./data/train/1_The_Murder_of_Roger_Ackroyd.txt",
    "./data/train/1_THE_MAN_IN_THE_BROWN_SUIT.txt",
    "./data/valid/1_THE_SECRET_ADVERSARY.txt"
]

# One tokenizer per model under comparison; each is used both to chunk the raw
# files (prepare_data) and to encode the decoded chunks (tokenize_texts).
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
xlm_tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Prepare data for each model
def prepare_data(tokenizer):
    """Build the train / validation / test DataFrames for one tokenizer.

    The module-level ``*_files_0`` lists are processed with label 0 and the
    ``*_files_1`` lists with label 1; within each split the label-0 sequences
    come first.

    Args:
        tokenizer: Tokenizer used for chunking and for decoding the chunks.

    Returns:
        ``(train_data, valid_data, test_data)`` DataFrames as produced by
        sequences_to_dataframe.
    """
    def gather(label0_files, label1_files):
        # Only the sequences are needed; the per-file counts are discarded.
        label0_sequences = process_category(label0_files, label=0, tokenizer=tokenizer)[0]
        label1_sequences = process_category(label1_files, label=1, tokenizer=tokenizer)[0]
        return label0_sequences + label1_sequences

    # Files are read in the order test -> valid -> train
    test_sequences = gather(test_files_0, test_files_1)
    valid_sequences = gather(valid_files_0, valid_files_1)
    train_sequences = gather(train_files_0, train_files_1)

    # Frames are built (and returned) in the order train, valid, test
    return tuple(
        sequences_to_dataframe(split_sequences, tokenizer)
        for split_sequences in (train_sequences, valid_sequences, test_sequences)
    )

# Build the (train, valid, test) DataFrames once per tokenizer.
# Each prepare_data call re-reads every file and runs the NER anonymization
# again (prepare_data -> process_category -> process_file), so that pass is
# repeated for each of the three tokenizers.
train_data_bert, valid_data_bert, test_data_bert = prepare_data(bert_tokenizer)
train_data_distilbert, valid_data_distilbert, test_data_distilbert = prepare_data(distilbert_tokenizer)
train_data_xlm, valid_data_xlm, test_data_xlm = prepare_data(xlm_tokenizer)

# Tokenization
def tokenize_texts(texts, tokenizer, max_length=MAX_SEQ_LENGTH):
    """Encode a collection of texts as fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialized into a list first.
        tokenizer: Callable tokenizer that accepts the keyword options below.
        max_length: Length every sequence is truncated / padded to.

    Returns:
        Whatever the tokenizer returns for the batch when called with
        ``truncation=True``, ``padding='max_length'`` and
        ``return_tensors='pt'``.
    """
    batch = list(texts)
    encode_options = {
        'max_length': max_length,
        'truncation': True,
        'padding': 'max_length',
        'return_tensors': 'pt',
    }
    return tokenizer(batch, **encode_options)

# Encode the 'content' column of every split with the tokenizer that produced
# it.  tokenize_texts pads/truncates to MAX_SEQ_LENGTH and requests PyTorch
# tensors (return_tensors='pt').

# Tokenize for BERT
bert_train_encodings = tokenize_texts(train_data_bert['content'], bert_tokenizer)
bert_val_encodings = tokenize_texts(valid_data_bert['content'], bert_tokenizer)
bert_test_encodings = tokenize_texts(test_data_bert['content'], bert_tokenizer)

# Tokenize for DistilBERT
distilbert_train_encodings = tokenize_texts(train_data_distilbert['content'], distilbert_tokenizer)
distilbert_val_encodings = tokenize_texts(valid_data_distilbert['content'], distilbert_tokenizer)
distilbert_test_encodings = tokenize_texts(test_data_distilbert['content'], distilbert_tokenizer)

# Tokenize for XLM-RoBERTa
xlm_train_encodings = tokenize_texts(train_data_xlm['content'], xlm_tokenizer)
xlm_val_encodings = tokenize_texts(valid_data_xlm['content'], xlm_tokenizer)
xlm_test_encodings = tokenize_texts(test_data_xlm['content'], xlm_tokenizer)

# Prepare Dataset for PyTorch
# (The class and the nine datasets below are defined exactly once; a verbatim
# second copy of this whole section was removed because it only re-created
# identical objects and shadowed the class with an identical definition.)
class CustomDataset(torch.utils.data.Dataset):
    """Pairs tokenizer encodings with their labels for use by the Trainer.

    Args:
        encodings: Mapping from field name to an indexable batch, as returned
            by tokenize_texts.
        labels: Labels aligned with the encodings; must support ``len()`` and
            positional ``.iloc`` indexing (a pandas Series in this notebook).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # One example: every encoding field at position idx, plus its label
        # under the 'labels' key.
        item = {key: val[idx] for key, val in self.encodings.items()}
        # .iloc keeps the lookup positional even when the Series index is not
        # 0..n-1 (e.g. the fold subsets created in cross_validate).
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

# Create PyTorch datasets for BERT
bert_train_dataset = CustomDataset(bert_train_encodings, train_data_bert['category'])
bert_val_dataset = CustomDataset(bert_val_encodings, valid_data_bert['category'])
bert_test_dataset = CustomDataset(bert_test_encodings, test_data_bert['category'])

# Create PyTorch datasets for DistilBERT
distilbert_train_dataset = CustomDataset(distilbert_train_encodings, train_data_distilbert['category'])
distilbert_val_dataset = CustomDataset(distilbert_val_encodings, valid_data_distilbert['category'])
distilbert_test_dataset = CustomDataset(distilbert_test_encodings, test_data_distilbert['category'])

# Create PyTorch datasets for XLM-RoBERTa
xlm_train_dataset = CustomDataset(xlm_train_encodings, train_data_xlm['category'])
xlm_val_dataset = CustomDataset(xlm_val_encodings, valid_data_xlm['category'])
xlm_test_dataset = CustomDataset(xlm_test_encodings, test_data_xlm['category'])

# Plot metrics (loss and accuracy)
def plot_metrics(train_losses, eval_losses, eval_accuracies, model_name, fold):
    """Plot per-epoch loss and validation accuracy for one fold.

    Args:
        train_losses: Training loss values, one per logged epoch.
        eval_losses: Validation loss values, one per evaluated epoch.
        eval_accuracies: Validation accuracy values, one per evaluated epoch.
        model_name: Name shown in the plot titles.
        fold: Zero-based fold index; displayed as ``fold + 1``.
    """
    def epoch_axis(values):
        # Each series gets an x axis of its own length, so a run that logged
        # a different number of training and evaluation points still plots
        # instead of raising on mismatched x/y lengths.
        return range(1, len(values) + 1)

    fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Plot loss
    loss_ax.plot(epoch_axis(train_losses), train_losses, label='Training Loss')
    loss_ax.plot(epoch_axis(eval_losses), eval_losses, label='Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title(f'{model_name} Loss Per Epoch (Fold {fold + 1})')
    loss_ax.legend()

    # Plot accuracy
    accuracy_ax.plot(epoch_axis(eval_accuracies), eval_accuracies, label='Validation Accuracy')
    accuracy_ax.set_xlabel('Epoch')
    accuracy_ax.set_ylabel('Accuracy')
    accuracy_ax.set_title(f'{model_name} Validation Accuracy Per Epoch (Fold {fold + 1})')
    accuracy_ax.legend()

    fig.tight_layout()
    plt.show()
    # This function runs once per fold and per model; close the figure so
    # finished figures do not accumulate in memory.
    plt.close(fig)

def _eval_strategy_kwarg():
    """Return the TrainingArguments keyword that selects the evaluation schedule."""
    import inspect

    # Newer transformers releases call the option `eval_strategy`; older ones
    # only know `evaluation_strategy`.  Ask the installed class which one it
    # accepts instead of hard-coding either spelling.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    return 'eval_strategy' if 'eval_strategy' in accepted else 'evaluation_strategy'


def _compute_classification_metrics(eval_pred):
    """Turn the Trainer's (logits, labels) pair into accuracy, precision, recall and F1."""
    logits, true_labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    # zero_division=0 keeps the value sklearn already falls back to when a
    # metric is undefined, without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='binary', zero_division=0
    )
    acc = accuracy_score(true_labels, predictions)
    return {'eval_accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}


def cross_validate(model_class, model_name_or_path, tokenizer, train_data, num_folds=3, model_name="Model"):
    """Fine-tune and evaluate a sequence classifier with stratified k-fold CV.

    For every fold a fresh model is loaded, trained for 4 epochs, its
    per-epoch curves are plotted and it is evaluated on the held-out part.

    Args:
        model_class: Class providing ``from_pretrained`` (called with
            ``num_labels=2``).
        model_name_or_path: Checkpoint identifier passed to ``from_pretrained``.
        tokenizer: Tokenizer used to encode the fold texts via tokenize_texts.
        train_data: DataFrame with a ``content`` (text) and a ``category``
            (label) column.
        num_folds: Number of stratified folds.
        model_name: Name used in output/log directory names and plot titles.

    Returns:
        Dict mapping every key of the per-fold evaluation results to its mean
        over all folds.
    """
    # NOTE(review): folds are stratified by label only, and train_data carries
    # no source-file column, so chunks cut from the same file can end up on
    # both the training and the validation side of a fold.  Confirm that this
    # is acceptable for the experiment.
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    texts, labels = train_data['content'], train_data['category']
    fold_metrics = []
    strategy_kwarg = {_eval_strategy_kwarg(): 'epoch'}

    for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
        print(f"\nFold {fold + 1}/{num_folds}")

        # Prepare fold datasets
        train_texts, val_texts = texts.iloc[train_idx], texts.iloc[val_idx]
        train_labels, val_labels = labels.iloc[train_idx], labels.iloc[val_idx]
        train_dataset = CustomDataset(tokenize_texts(train_texts, tokenizer), train_labels)
        val_dataset = CustomDataset(tokenize_texts(val_texts, tokenizer), val_labels)

        # Initialize a fresh model so no weights carry over between folds
        model = model_class.from_pretrained(model_name_or_path, num_labels=2)

        # TrainingArguments: evaluate, save and log once per epoch so the
        # best checkpoint can be restored at the end of training
        training_args = TrainingArguments(
            output_dir=f'./results_{model_name}_fold{fold}',
            save_strategy='epoch',
            logging_strategy='epoch',
            learning_rate=2e-5,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=32,
            num_train_epochs=4,
            weight_decay=0.01,
            logging_dir=f'./logs_{model_name}_fold{fold}',
            load_best_model_at_end=True,
            report_to="none",
            **strategy_kwarg
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=_compute_classification_metrics
        )

        # Train the model
        trainer.train()

        # Extract the per-epoch curves from the trainer's log history
        epoch_logs = [log for log in trainer.state.log_history if 'epoch' in log]
        train_losses = [log['loss'] for log in epoch_logs if 'loss' in log]
        eval_losses = [log['eval_loss'] for log in epoch_logs if 'eval_loss' in log]
        eval_accuracies = [log['eval_accuracy'] for log in epoch_logs if 'eval_accuracy' in log]

        # Plot metrics for this fold
        plot_metrics(train_losses, eval_losses, eval_accuracies, model_name, fold)

        # Evaluate the model on validation set
        eval_results = trainer.evaluate(val_dataset)
        fold_metrics.append(eval_results)
        print(f"Metrics for Fold {fold + 1}: {eval_results}")

        # Drop this fold's model before the next one is loaded so successive
        # folds do not pile up on the GPU.
        del trainer, model
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    avg_metrics = {
        key: np.mean([results[key] for results in fold_metrics])
        for key in fold_metrics[0]
    }
    print(f"\nAverage Metrics Across {num_folds} Folds: {avg_metrics}")
    return avg_metrics

In [None]:
# Cross-validate BERT ('bert-base-uncased') on its training DataFrame with
# 3 stratified folds; avg_metrics_bert receives the per-key mean of the fold
# evaluation results returned by cross_validate.
avg_metrics_bert = cross_validate(
    model_class=BertForSequenceClassification,
    model_name_or_path='bert-base-uncased',
    tokenizer=bert_tokenizer,
    train_data=train_data_bert,
    num_folds=3,
    model_name="BERT"
)

In [None]:
# Cross-validate DistilBERT ('distilbert-base-cased') on its training
# DataFrame with 3 stratified folds; avg_metrics_distilbert receives the
# per-key mean of the fold evaluation results returned by cross_validate.
avg_metrics_distilbert = cross_validate(
    model_class=DistilBertForSequenceClassification,
    model_name_or_path='distilbert-base-cased',
    tokenizer=distilbert_tokenizer,
    train_data=train_data_distilbert,
    num_folds=3,
    model_name="DistilBERT"
)

In [None]:
# Cross-validate XLM-RoBERTa ('xlm-roberta-base') on its training DataFrame
# with 3 stratified folds; avg_metrics_xlm receives the per-key mean of the
# fold evaluation results returned by cross_validate.
avg_metrics_xlm = cross_validate(
    model_class=AutoModelForSequenceClassification,
    model_name_or_path='xlm-roberta-base',
    tokenizer=xlm_tokenizer,
    train_data=train_data_xlm,
    num_folds=3,
    model_name="XLM-RoBERTa"
)