In [0]:
# Install necessary libraries
%pip install transformers openai torch tqdm numpy datasets accelerate matplotlib wandb

In [0]:
# Imports
import logging
logging.basicConfig(level='ERROR')
import numpy as np
from pathlib import Path
# import openai
import torch
import zlib
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, AutoModel
from tqdm import tqdm
import numpy as np
from datasets import load_dataset
from eval import *
from experiment_utils import *
from model_utils import *
from unlearning import *
from accelerate import init_empty_weights, infer_auto_device_map

In [0]:
# --- Experiment configuration -------------------------------------------
# Hugging Face model identifier; it also determines the output sub-folder.
model = 'EleutherAI/pythia-2.8b'
output_dir = f"experiment_output/{model}"

# Make sure the output folder (and any missing parents) exists before the
# experiment objects are built. Safe to re-run: existing folders are kept.
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Settings describing which model/dataset to use and where results go.
experiment_args = ExperimentArgs(
    model=model,
    output_dir=output_dir,
    # Local model cache, so weights are not downloaded for every experiment.
    model_dir_prefix="/dbfs/mnt/ds-data-apps/maris/base_models_dupe/",
    input_name='input',
    label_name='label',
    data='swj0419/WikiMIA',
    data_name='WikiMIA',
    length=128,
)

# Settings for the unlearning procedure itself.
unlearning_args = UnlearningArgs(
    lr=3e-6,
    steps=4,
    batch_size=16,
    include_learning=False,
    metric='All',  # one of: PPL, Min_K, Min_K++ or All
    num_repeats=1,
)

In [0]:
# Candidate hyperparameter values handed to the grid search
# (a single value per axis in this run).
lrs, steps, batch_sizes = [3e-6], [3], [16]

# Build the experiment from the shared configuration objects and launch
# the grid search over the candidate values above.
experiment2 = Experiment(experiment_args, unlearning_args)
experiment2.run_gridsearch(lrs, steps, batch_sizes)

In [0]:
import json
import matplotlib.pyplot as plt

# Folder (inside the experiment's output directory) that receives figures.
graph_dir = output_dir + '/graphs/'
# exist_ok=True keeps the cell idempotent and avoids the check-then-create
# race of testing os.path.exists() before calling makedirs().
os.makedirs(graph_dir, exist_ok=True)

# Index of the grid-search results file to plot.
i = 1
# Load JSON data from the file
results_path = f'{output_dir}/{experiment_args.data_name}_length{experiment_args.length}_grid_search_{i}.json'
with open(results_path, 'r') as file:
    data = json.load(file)

# The code below indexes data[0]; fail with a clear message instead of a
# bare IndexError when the results file holds no entries.
if not data:
    raise ValueError(f'No grid-search results found in {results_path}')

# Every key of an entry other than 'params' is treated as a metric name.
metrics = [metric for metric in data[0].keys() if metric != 'params']

# x-axis: the number of steps recorded in each entry's parameters.
steps = [entry['params']['steps'] for entry in data]

# y-axis and error bars: per-metric mean and standard deviation, in the
# same order as `steps`.
all_metric_means = {metric: [entry[metric]['mean'] for entry in data] for metric in metrics}
all_metric_stds = {metric: [entry[metric]['std'] for entry in data] for metric in metrics}

# Plot one error-bar series per metric on an explicit figure/axes pair so
# the saved figure is unambiguous.
fig, ax = plt.subplots(figsize=(10, 6))
for metric, result in all_metric_means.items():
    ax.errorbar(steps, result, yerr=all_metric_stds[metric], label=metric, marker='o')

# Add titles and labels
ax.set_title('Gradient ascent-based membership inference')
ax.set_xlabel('Steps')
ax.set_ylabel('MIM AUC')
ax.legend()
ax.grid(True)

# Save the figure
# fig.savefig(graph_dir+f'metrics_vs_ULsteps_grid_search_{i}.png')
fig.savefig(graph_dir+f'{experiment_args.data_name}_length{experiment_args.length}_{unlearning_args.metric}.png')

In [0]:
# sentence = "The quick brown fox jumped over the lazy dog"
# sentence_encode = tokenizer.encode(sentence)
# PPL = calculatePerplexity(sentence_encode, base_model, tokenizer, unlearning_args.device)
# print(f"PPL: {PPL}")