# Assignment 4 Ransomware Classification Pipeline

This notebook reproduces the end-to-end workflow delivered in the Assignment 4 coding tasks. Run the cells sequentially to fine-tune the transformer models, evaluate them on the UGR and PM datasets, and generate the diagnostic artefacts (metrics, plots, SHAP/LIME reports).

## 1. Environment preparation

Uncomment the installation command below if you are running in a fresh environment that does not already have the required dependencies. The command expects to be executed from the `notebooks/` directory so that the relative path `../requirements.txt` resolves to the file in the repository root.

In [None]:
# !pip install -r ../requirements.txt

## 2. Configure paths and imports

The cell below detects the project root, adds the `src` directory to `sys.path`, and imports the helpers that power the assignment pipeline.

In [None]:
from __future__ import annotations

import json
import random
from dataclasses import replace
from pathlib import Path
import sys

import pandas as pd

# Start from the working directory; if it has no `src` folder (e.g. the notebook
# was opened from the `notebooks/` directory), fall back to its parent.
PROJECT_ROOT = Path.cwd().resolve()
PROJECT_ROOT = (
    PROJECT_ROOT if PROJECT_ROOT.joinpath('src').exists() else PROJECT_ROOT.parent
)
SRC_DIR = PROJECT_ROOT.joinpath('src')

# Put `src` at the front of the import path exactly once so that re-running
# this cell does not add duplicate entries.
if sys.path.count(str(SRC_DIR)) == 0:
    sys.path.insert(0, str(SRC_DIR))

print(f'Project root: {PROJECT_ROOT}')
print(f'Source directory added to sys.path: {SRC_DIR}')

from assignment4.config import AssignmentConfig, DEFAULT_MODELS
from assignment4.data_utils import (
    build_datasets,
    load_dataframe,
    save_split_metadata,
    stratified_split,
)
from assignment4.explainability import generate_lime_reports, generate_shap_plots
from assignment4.modeling import load_model_components
from assignment4.training import train_model
from assignment4.visualisation import plot_attention_heatmap, plot_training_history

print('Imports complete.')

## 3. Helper utilities

A few small helpers mirror the orchestration script so that the notebook can drive the same workflow.

In [None]:
def select_models(model_names, num_epochs=None):
    """Pick the configured transformer models and optionally override epochs.

    Args:
        model_names: Iterable of model names to select. Each requested name is
            lower-cased before being compared with ``cfg.name`` of every entry
            in ``DEFAULT_MODELS``. A single string is treated as one name
            rather than being iterated character by character.
        num_epochs: When not ``None``, every selected config whose
            ``num_train_epochs`` differs is replaced by a copy carrying this
            value; the entries in ``DEFAULT_MODELS`` are not modified.

    Returns:
        The matching configs as a list, in ``DEFAULT_MODELS`` order.

    Raises:
        ValueError: If none of the requested names matches a configured model.
    """
    requested = model_names
    if isinstance(model_names, str):
        model_names = [model_names]
    # Build the lookup set once: rebuilding it per config is wasted work and
    # would silently exhaust a one-shot iterator after the first config.
    wanted = {name.lower() for name in model_names}

    selected = []
    for cfg in DEFAULT_MODELS:
        if cfg.name not in wanted:
            continue
        if num_epochs is not None and cfg.num_train_epochs != num_epochs:
            # dataclasses.replace returns a modified copy of the config.
            cfg = replace(cfg, num_train_epochs=num_epochs)
        selected.append(cfg)
    if not selected:
        raise ValueError(f'No matching models found for {requested!r}')
    return selected


def sample_texts(texts, k, seed):
    """Return a deterministic sample of at most *k* items from *texts*.

    The input is always copied into a new list. When *k* is ``None``, not
    positive, or at least as large as the number of texts, that full copy is
    returned in its original order. Otherwise *k* items are drawn without
    replacement from a ``random.Random`` seeded with *seed*, so the same
    arguments always yield the same selection.
    """
    pool = list(texts)
    take_everything = k is None or k <= 0 or len(pool) <= k
    return pool if take_everything else random.Random(seed).sample(pool, k)


def display_summary(metrics):
    """Turn the collected evaluation metrics into a pandas DataFrame.

    *metrics* maps a run key to that run's metric mapping. The result has one
    row per run key (index named ``dataset_model``) and one column per metric.
    A falsy *metrics* value yields an empty DataFrame.
    """
    if metrics:
        # DataFrame(metrics) puts run keys on the columns; transpose so each
        # run becomes a row.
        table = pd.DataFrame(metrics).transpose()
        table.index.name = 'dataset_model'
        return table
    return pd.DataFrame()


## 4. Configure the experiment

Adjust the parameters in the next cell to control which models run, how many samples are used, and whether explainability artefacts are produced.

In [None]:
# Input CSV files, keyed by the short dataset name that the training cell uses
# in its log output and in the keys of the collected metrics.
DATASETS = {
    'ugr': PROJECT_ROOT / 'UGR_text.csv',
    'pm': PROJECT_ROOT / 'PM_text.csv',
}

# Directory handed to the pipeline helpers as their output location; created
# up front (including parents) if it does not exist yet.
OUTPUT_DIR = PROJECT_ROOT / 'notebook_artifacts'
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed shared by the optional dataframe subsampling, the text sampling for the
# attention/SHAP/LIME steps, and the AssignmentConfig below.
RANDOM_SEED = 42
MAX_SAMPLES = None  # e.g. set to 500 for a quicker dry run
# Names are matched by select_models() against the entries of DEFAULT_MODELS.
SELECTED_MODELS = ['bert', 'roberta', 'deberta']
EPOCH_OVERRIDE = 1  # reduce epochs to keep runtime manageable in a notebook
# When True, the training cell calls plot_attention_heatmap() for each run.
GENERATE_ATTENTION = True
GENERATE_EXPLAINABILITY = True  # toggle off if SHAP/LIME should be skipped

assignment_cfg = AssignmentConfig(random_seed=RANDOM_SEED, output_dir=str(OUTPUT_DIR))
# Clamp the configured sample counts to small upper bounds (10 SHAP texts,
# 5 LIME texts, 3 attention examples); values already below a bound are kept.
# NOTE(review): presumably this keeps the explainability steps fast in a
# notebook session - confirm against the AssignmentConfig defaults.
assignment_cfg.shap_sample_size = min(assignment_cfg.shap_sample_size, 10)
assignment_cfg.lime_sample_size = min(assignment_cfg.lime_sample_size, 5)
assignment_cfg.attention_plot_examples = min(assignment_cfg.attention_plot_examples, 3)
# Passed as `cache_dir` to load_model_components() in the training cell.
assignment_cfg.model_cache_dir = str(PROJECT_ROOT / 'model_cache')

# Resolve the requested model names to config objects, applying the epoch override.
model_configs = select_models(SELECTED_MODELS, num_epochs=EPOCH_OVERRIDE)
print('Configured models:', [cfg.name for cfg in model_configs])
print('Artifacts will be written to:', OUTPUT_DIR)

## 5. Run the training and evaluation loop

Executing the next cell performs the complete workflow for each dataset/model combination.

In [None]:
# Evaluation metrics for every run, keyed by '<dataset>_<model>'; written to
# JSON once all dataset/model combinations have finished.
summary_metrics = {}

for dataset_name, csv_path in DATASETS.items():
    print(f'\n=== Processing dataset: {dataset_name.upper()} ===')
    df = load_dataframe(csv_path)
    if MAX_SAMPLES is not None:
        # Optional dry-run mode: work on a seeded random subset of the rows.
        df = df.sample(n=min(MAX_SAMPLES, len(df)), random_state=RANDOM_SEED).reset_index(drop=True)
    # The split is made once per dataset and shared by every model below.
    train_df, val_df, test_df = stratified_split(df, assignment_cfg)
    save_split_metadata(OUTPUT_DIR, dataset_name, train_df, val_df, test_df, assignment_cfg)

    for model_cfg in model_configs:
        # The leading newline must be an escape sequence: a literal line break
        # inside a single-quoted string is a syntax error.
        print(f'\nTraining model: {model_cfg.name}')
        components = load_model_components(
            model_cfg,
            num_labels=len(assignment_cfg.label_names),
            cache_dir=assignment_cfg.model_cache_dir,
        )
        # Tokenisation depends on the model's tokenizer, so the datasets are
        # rebuilt for each model from the shared dataframe splits.
        train_ds, val_ds, test_ds = build_datasets(
            components.tokenizer,
            train_df,
            val_df,
            test_df,
            model_cfg,
        )
        training_result = train_model(
            components,
            train_ds,
            val_ds,
            test_ds,
            dataset_name,
            model_cfg,
            assignment_cfg,
            OUTPUT_DIR,
        )
        summary_key = f'{dataset_name}_{model_cfg.name}'
        summary_metrics[summary_key] = training_result.eval_metrics
        plot_training_history(training_result.train_history, OUTPUT_DIR, dataset_name, model_cfg.name)

        if GENERATE_ATTENTION:
            attention_samples = sample_texts(test_df['text'].tolist(), assignment_cfg.attention_plot_examples, RANDOM_SEED)
            plot_attention_heatmap(
                training_result.trainer.model,
                components.tokenizer,
                attention_samples,
                OUTPUT_DIR,
                dataset_name,
                model_cfg,
                max_examples=assignment_cfg.attention_plot_examples,
            )

        if GENERATE_EXPLAINABILITY:
            # Class names ordered by their sorted keys in label_names; SHAP and
            # LIME receive the same ordering, so it is built once per run.
            class_names = [assignment_cfg.label_names[i] for i in sorted(assignment_cfg.label_names)]

            shap_texts = sample_texts(test_df['text'].tolist(), assignment_cfg.shap_sample_size, RANDOM_SEED)
            generate_shap_plots(
                training_result.trainer.model,
                components.tokenizer,
                shap_texts,
                OUTPUT_DIR,
                dataset_name,
                model_cfg,
                class_names,
            )

            lime_texts = sample_texts(test_df['text'].tolist(), assignment_cfg.lime_sample_size, RANDOM_SEED)
            generate_lime_reports(
                training_result.trainer.model,
                components.tokenizer,
                lime_texts,
                OUTPUT_DIR,
                dataset_name,
                model_cfg,
                class_names,
            )

# Persist the collected metrics next to the other artefacts.
summary_path = OUTPUT_DIR / 'summary_metrics.json'
summary_path.write_text(json.dumps(summary_metrics, indent=2))
print('\nSaved summary metrics to', summary_path)
# Trailing expression so the notebook renders the metrics as the cell output.
summary_metrics

## 6. Inspect evaluation metrics

Convert the aggregated metrics into a table for convenient inspection.

In [None]:
# Build a table with one row per '<dataset>_<model>' run from the metrics
# gathered by the training cell; the bare name below renders it as cell output.
metrics_df = display_summary(summary_metrics)
metrics_df

## 7. Explore generated artefacts

All artefacts (metrics, plots, SHAP/LIME outputs) are written to the directory configured above.

In [None]:
# List the names of the top-level entries in OUTPUT_DIR, sorted alphabetically.
sorted(path.name for path in OUTPUT_DIR.glob('*'))