In [1]:
import os
# GPUs this notebook is allowed to use. The value must be exported through
# os.environ: a plain Python variable is invisible to CUDA. It also has to be
# set before any library initialises CUDA, so keep this cell first.
CUDA_VISIBLE_DEVICES = "0,1,2"
os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES

# Make CUDA kernel launches synchronous so device-side errors surface at the
# call that caused them. This is a debugging aid and slows training down.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
from datasets import load_dataset 

# Hub repository with the pre-encoded graphs used in this run.
# (Previously tried alternative: "VincentPai/for-graphormer-new2".)
DATASET_REPO = "VincentPai/encoded-MITRE-small"

# Fixed seed so the shuffled order is reproducible across runs.
SHUFFLE_SEED = 87

# Graphormer's input format expects the target column to be named 'y'.
# If a dataset stores it as 'label' instead, rename it first, e.g.:
#   dataset['train'] = dataset['train'].rename_column('label', 'y')

# Load every split from the Hub and shuffle it with the fixed seed.
dataset = load_dataset(DATASET_REPO).shuffle(seed=SHUFFLE_SEED)

Downloading and preparing dataset json/VincentPai--encoded-MITRE-small to /workdir/home/euni/.cache/huggingface/datasets/VincentPai___json/VincentPai--encoded-MITRE-small-9efe53e78820692b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.70M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /workdir/home/euni/.cache/huggingface/datasets/VincentPai___json/VincentPai--encoded-MITRE-small-9efe53e78820692b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
from datasets import load_metric
# Load the "accuracy" metric object.
# NOTE(review): `metric` is not referenced by any other cell visible in this
# notebook; accuracy is computed with sklearn's accuracy_score in
# compute_accuracy instead. The cell output also looks like a warning echo;
# `load_metric` is presumably deprecated in favour of the separate `evaluate`
# library in recent `datasets` releases. TODO confirm and drop this cell if so.
metric = load_metric("accuracy")

  metric = load_metric("accuracy")


In [5]:
from transformers.models.graphormer.collating_graphormer import preprocess_item, GraphormerDataCollator
# Run Graphormer's preprocess_item over every example, one example at a time
# (batched=False), for all splits. The result is stored under a new name so the
# raw `dataset` stays available; its columns are listed in the next cell.
dataset_processed = dataset.map(preprocess_item, batched=False)

Map:   0%|          | 0/29595 [00:00<?, ? examples/s]

Map:   0%|          | 0/9865 [00:00<?, ? examples/s]

Map:   0%|          | 0/9866 [00:00<?, ? examples/s]

In [6]:
# Display the processed DatasetDict to check its splits, columns and row counts.
dataset_processed

DatasetDict({
    train: Dataset({
        features: ['y', 'num_nodes', 'node_feat', 'edge_attr', 'edge_index', 'input_nodes', 'attn_bias', 'attn_edge_type', 'spatial_pos', 'in_degree', 'out_degree', 'input_edges', 'labels'],
        num_rows: 29595
    })
    validation: Dataset({
        features: ['y', 'num_nodes', 'node_feat', 'edge_attr', 'edge_index', 'input_nodes', 'attn_bias', 'attn_edge_type', 'spatial_pos', 'in_degree', 'out_degree', 'input_edges', 'labels'],
        num_rows: 9865
    })
    test: Dataset({
        features: ['y', 'num_nodes', 'node_feat', 'edge_attr', 'edge_index', 'input_nodes', 'attn_bias', 'attn_edge_type', 'spatial_pos', 'in_degree', 'out_degree', 'input_edges', 'labels'],
        num_rows: 9866
    })
})

In [7]:
# The dataset already ships with separate splits, so nothing is split here:
# simply pick the preprocessed training and validation splits.
train_ds, val_ds = dataset_processed['train'], dataset_processed['validation']

### Training the model


Calling the `from_pretrained` method on our model downloads and caches the weights for us. As the number of classes (for prediction) is dataset dependent, we pass the new `num_classes` as well as `ignore_mismatched_sizes` alongside the `model_checkpoint`. This makes sure a custom classification head is created, specific to our task, hence likely different from the original decoder head. 

(When using a pretrained model, you must make sure the embeddings of your data have the same shape as the ones used to pretrain your model.)

In [8]:
from transformers import TrainerCallback

class PrintInfoCallback(TrainerCallback):
    """Trainer callback that prints epoch, global step and training loss on each log event.

    Only log events that carry a ``"loss"`` entry are printed. Other events
    (for example evaluation metrics, which have no ``"loss"`` key) are
    skipped instead of raising ``KeyError``.
    """

    def on_log(self, args, state, control, model=None, logs=None, **kwargs):
        """Print a one-line training progress summary.

        Args:
            args: Training arguments passed by the Trainer (unused).
            state: Trainer state; ``state.epoch`` and ``state.global_step`` are printed.
            control: Trainer control object (unused).
            model: Model being trained (unused); optional so the hook can be
                called without it.
            logs: Dict of logged values, or ``None``.
            **kwargs: Any further keyword arguments supplied by the Trainer.
        """
        # Not every log event is a training-loss report; ignore the ones that
        # have no "loss" entry (and the case where no logs were passed at all).
        if not logs or "loss" not in logs:
            return
        print(f"Epoch: {state.epoch}, Step: {state.global_step}, Loss: {logs['loss']}")


In [9]:
from sklearn.metrics import accuracy_score
from transformers import EvalPrediction

def compute_accuracy(p: EvalPrediction):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        p: Evaluation output holding ``predictions`` (class scores, or a
            tuple/list whose first element is the class scores) and
            ``label_ids`` (the reference labels).

    Returns:
        A dict ``{"accuracy": value}`` as computed by sklearn's ``accuracy_score``.
    """
    predictions = p.predictions
    # When the model returns more than the logits, the Trainer hands over a
    # tuple and the logits come first. A tuple has no `.argmax`, so unwrap it.
    # NOTE(review): Graphormer appears to return hidden states alongside the
    # logits, which would make this the normal case here. TODO confirm.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]
    preds = predictions.argmax(-1)
    return {"accuracy": accuracy_score(p.label_ids, preds)}

In [10]:
from transformers import GraphormerForGraphClassification

# Pre-trained checkpoint from which to fine-tune.
model_checkpoint = "clefourrier/graphormer-base-pcqm4mv2"

# 167 attack patterns plus 1 benign class.
NUM_CLASSES = 168

# `ignore_mismatched_sizes` lets the checkpoint load even though its
# classification head has a different shape from the one needed here (useful
# when fine-tuning an already fine-tuned checkpoint); the mismatched weights
# are newly initialised, as the load message below reports.
model = GraphormerForGraphClassification.from_pretrained(
    model_checkpoint,
    num_classes=NUM_CLASSES,
    ignore_mismatched_sizes=True,
)

Some weights of GraphormerForGraphClassification were not initialized from the model checkpoint at clefourrier/graphormer-base-pcqm4mv2 and are newly initialized because the shapes did not match:
- classifier.classifier.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([168, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from tqdm import tqdm
from transformers import TrainingArguments

# Checkpoints and logs are both written to the same directory.
OUTPUT_DIR = "graph-classification"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    logging_dir=OUTPUT_DIR,

    # Schedule: evaluate and log once per epoch.
    num_train_epochs=5,
    evaluation_strategy="epoch",
    logging_strategy="epoch",

    # Batching: 16 examples per device, gradients accumulated over 10 batches.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=10,
    auto_find_batch_size=True,  # batch size can be changed automatically to prevent OOMs
    dataloader_num_workers=4,

    # Keep the tqdm progress bar visible and do not upload to the Hub.
    disable_tqdm=False,
    push_to_hub=False,
)

In [12]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

import math

# Size the schedule in optimizer updates, counted the way the Trainer counts them:
#   * `train_batch_size` is the per-device batch size multiplied by the number
#     of GPUs in use, so the count stays right when several GPUs are visible;
#   * the last, partially filled batch of an epoch still counts as a batch;
#   * one optimizer update happens every `gradient_accumulation_steps` batches.
# NOTE(review): `auto_find_batch_size=True` may shrink the batch size at run
# time, in which case this precomputed total no longer matches. TODO confirm
# whether that matters for this run.
batches_per_epoch = math.ceil(len(train_ds) / training_args.train_batch_size)
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = math.ceil(training_args.num_train_epochs * updates_per_epoch)

# Warm up linearly over the first 10% of the updates.
num_warmup_steps = num_training_steps // 10

optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)

In [13]:
from transformers import Trainer


# Collator that assembles preprocessed graph examples into batches.
graph_collator = GraphormerDataCollator()

# Pass the optimizer and learning-rate scheduler in when creating the Trainer
# instance, so it uses them instead of building its own.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=graph_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_accuracy,
    callbacks=[PrintInfoCallback()],
    optimizers=(optimizer, scheduler),
)


We can now train our model!

In [14]:
# Run the fine-tuning loop.
train_results = trainer.train()
train_metrics = train_results.metrics

# Optional but nice to have: save the model, then log and save the training
# metrics, and finally save the trainer state.
trainer.save_model()
trainer.log_metrics("train", train_metrics)
trainer.save_metrics("train", train_metrics)
trainer.save_state()

You can now upload the result of the training to the Hub with the following cell:
- You need to log in to the Hugging Face Hub first (add the login code in a cell near the top of the notebook, before training).

In [None]:
# Upload the trained model to the Hugging Face Hub. This needs an authenticated
# Hub session; it is a manual step because training was configured with
# push_to_hub=False.
trainer.push_to_hub()