In [1]:
import torch
import torch.nn as nn
import transformers
import numpy as np
from copy import deepcopy

In [71]:
models = ["gpt2-xl", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]
dataset = ["xsum", "squad", "writing"] # squad needs dataset key "context"

template = """#!/bin/bash
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --job-name={}-{}-bl
#SBATCH --output=/scratch/bc3088/nlu/detect-gpt/log/%j_%x.out
#SBATCH --error=/scratch/bc3088/nlu/detect-gpt/log/%j_%x.err
#SBATCH --time=12:00:00
#SBATCH --gres=gpu:1
#SBATCH --mem=60G
#SBATCH --requeue

#SBATCH --mail-type=ALL
#SBATCH --mail-user=bale.chen@nyu.edu

module purge

singularity exec --nv --bind /scratch/bc3088/ --overlay /scratch/bc3088/nlu/detect-gpt/overlay-25GB-500K.ext3:ro /scratch/work/public/singularity/cuda11.4.2-cudnn8.2.4-devel-ubuntu20.04.3.sif /bin/bash -c "
source /ext3/env.sh;
conda activate; 
{}\"
"""

for m in models:
    for d in dataset:
        s = f"python run-small.py --dataset {d} --n_samples 500 --n_perturbation_list 50 --base_model_name {m} --mask_filling_model_name t5-3b --perturb_method detectgpt --pct_words_masked 0.3 --span_length 2 --batch_size 4 --half"
        if d == "squad":
            s += " --dataset_key context"
        m_ = m.replace("/", "_")
        fname = f"baseline-slurms/{m_}-{d}-baseline.sh"
        with open(fname, "w") as f:
            f.write(template.format(m_, d, s))

In [72]:
dataset = ["squad", "writing"]
models = ["EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]
for m in models:
    for d in dataset:
        m = m.replace("/", "_")
        fname = f"baseline-slurms/{m}-{d}-baseline.sh"
        print("sbatch", fname)

sbatch baseline-slurms/EleutherAI_gpt-neo-2.7B-squad-baseline.sh
sbatch baseline-slurms/EleutherAI_gpt-neo-2.7B-writing-baseline.sh
sbatch baseline-slurms/EleutherAI_gpt-j-6B-squad-baseline.sh
sbatch baseline-slurms/EleutherAI_gpt-j-6B-writing-baseline.sh
sbatch baseline-slurms/EleutherAI_gpt-neox-20b-squad-baseline.sh
sbatch baseline-slurms/EleutherAI_gpt-neox-20b-writing-baseline.sh


In [None]:
python run-small.py 
--dataset xsum 
--n_samples 500 
--n_perturbation_list 50
--base_model_name gpt2-xl
--perturb_method noise_embed 
--noisy_level 0.3 
--skip_baselines 
--batch_size 16 
--pct_words_masked 0.3 
--span_length 2 
--half

In [5]:
models = ["gpt2-xl", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]
dataset = ["xsum", "squad", "writing"] # squad needs dataset key "context"

template = """#!/bin/bash
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --job-name={}-{}-cross
#SBATCH --output=/scratch/bc3088/nlu/detect-gpt/log/%j_%x.out
#SBATCH --error=/scratch/bc3088/nlu/detect-gpt/log/%j_%x.err
#SBATCH --time=32:00:00
#SBATCH --gres=gpu:rtx8000:1
#SBATCH --mem=80G
#SBATCH --requeue

#SBATCH --mail-type=ALL
#SBATCH --mail-user=bale.chen@nyu.edu

module purge

singularity exec --nv --bind /scratch/bc3088/ --overlay /scratch/bc3088/nlu/detect-gpt/overlay-25GB-500K.ext3:ro /scratch/work/public/singularity/cuda11.4.2-cudnn8.2.4-devel-ubuntu20.04.3.sif /bin/bash -c "
source /ext3/env.sh;
conda activate; 
{}\"
"""

for m in models:
    ensemble = []
    for m2 in models:
        if m2 != m:
            ensemble.append(m2)
    ensemble = ",".join(ensemble)
    for d in dataset:
        s = f"python run_v2.py --dataset {d} --n_samples 200 --n_perturbation_list 50 --base_model_name {m} --mask_filling_model_name t5-3b --ensemble_scoring {ensemble} --skip_baselines --pct_words_masked 0.3 --span_length 2 --batch_size 4 --half"
        if d == "squad":
            s += " --dataset_key context"
        m_ = m.replace("/", "_")
        fname = f"cross-slurms/{m_}-{d}-cross.sh"
        with open(fname, "w") as f:
            f.write(template.format(m_, d, s))

In [3]:
dataset = ["xsum", "squad", "writing"]
models = ["gpt2-xl", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]
for m in models:
    for d in dataset:
        m = m.replace("/", "_")
        fname = f"cross-slurms/{m}-{d}-cross.sh"
        print("sbatch", fname)

sbatch cross-slurms/gpt2-xl-xsum-cross.sh
sbatch cross-slurms/gpt2-xl-squad-cross.sh
sbatch cross-slurms/gpt2-xl-writing-cross.sh
sbatch cross-slurms/EleutherAI_gpt-neo-2.7B-xsum-cross.sh
sbatch cross-slurms/EleutherAI_gpt-neo-2.7B-squad-cross.sh
sbatch cross-slurms/EleutherAI_gpt-neo-2.7B-writing-cross.sh
sbatch cross-slurms/EleutherAI_gpt-j-6B-xsum-cross.sh
sbatch cross-slurms/EleutherAI_gpt-j-6B-squad-cross.sh
sbatch cross-slurms/EleutherAI_gpt-j-6B-writing-cross.sh
sbatch cross-slurms/EleutherAI_gpt-neox-20b-xsum-cross.sh
sbatch cross-slurms/EleutherAI_gpt-neox-20b-squad-cross.sh
sbatch cross-slurms/EleutherAI_gpt-neox-20b-writing-cross.sh


In [6]:
dataset = ["squad"]
models = ["gpt2-xl", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]
for d in dataset:
    for m in models:
        print("sbatch", "generate_data.sh", m)

sbatch generate_data.sh gpt2-xl
sbatch generate_data.sh EleutherAI/gpt-neo-2.7B
sbatch generate_data.sh EleutherAI/gpt-j-6B
sbatch generate_data.sh EleutherAI/gpt-neox-20b


In [8]:
models = ["gpt2-xl", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]
model_short = ["gpt2", "gpt-neo", "gpt-j", "neox"]
for d in dataset:
    for m, ms in zip(models, model_short):
        print("sbatch -J", f"opt-ft-{ms}", "run_opt_ft.sh", m)

sbatch -J opt-ft-gpt2 run_opt_ft.sh gpt2-xl
sbatch -J opt-ft-gpt-neo run_opt_ft.sh EleutherAI/gpt-neo-2.7B
sbatch -J opt-ft-gpt-j run_opt_ft.sh EleutherAI/gpt-j-6B
sbatch -J opt-ft-neox run_opt_ft.sh EleutherAI/gpt-neox-20b


## OPT Scripts

In [9]:
templete = """#!/bin/bash
#SBATCH --nodes=1
#SBATCH --cpus-per-task=4
#SBATCH --job-name=fo-{}-{}
#SBATCH --output=/scratch/bc3088/nlu/detect-gpt/log/%j_%x.out
#SBATCH --error=/scratch/bc3088/nlu/detect-gpt/log/%j_%x.err
#SBATCH --time=5:00:00
#SBATCH --gres=gpu:1
#SBATCH --mem=50G
#SBATCH --requeue

#SBATCH --mail-type=ALL
#SBATCH --mail-user=bc3088@nyu.edu

module purge

singularity exec --nv --bind /scratch/bc3088/ --overlay /scratch/bc3088/nlu/detect-gpt/overlay-25GB-500K.ext3:ro /scratch/work/public/singularity/cuda11.4.2-cudnn8.2.4-devel-ubuntu20.04.3.sif /bin/bash -c "source /ext3/env.sh;
conda activate;
python run_v5.py --output_name fo --base_model_name {} --scoring_model_name facebook/opt-1.3b --mask_filling_model_name t5-3b --n_perturbation_list 50 --n_samples 200 --pct_words_masked 0.3 --span_length 2 --checkpoint_dir {} --ft-epoch {} --skip_baselines
"
"""

In [4]:
import json

with open("checkpoint_dict.json", "rb") as f:
    cp_d = json.load(f)

In [11]:
models = ["gpt2-xl", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-j-6B", "EleutherAI/gpt-neox-20b"]

for m in models:
    m_ = m.replace("/", "_")
    checkpoint_list = cp_d[m_]
    for idx, cp in enumerate(checkpoint_list):
        epoch = 2*(idx+1)
        cp_path = f"ft-opt/{m_}/{cp}"
        temp_templete = templete.format(epoch, m_, m, cp_path, epoch)
        fname = f"ft-opt-slurms/{m_}-ep{epoch}.sh"
        with open(fname, "w") as f:
            f.write(temp_templete)

In [12]:
for m in models:
    m_ = m.replace("/", "_")
    for idx, cp in enumerate(checkpoint_list):
        epoch = 2*(idx+1)
        fname = f"ft-opt-slurms/{m_}-ep{epoch}.sh"
        print("sbatch", fname)

sbatch ft-opt-slurms/gpt2-xl-ep2.sh
sbatch ft-opt-slurms/gpt2-xl-ep4.sh
sbatch ft-opt-slurms/gpt2-xl-ep6.sh
sbatch ft-opt-slurms/gpt2-xl-ep8.sh
sbatch ft-opt-slurms/gpt2-xl-ep10.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neo-2.7B-ep2.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neo-2.7B-ep4.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neo-2.7B-ep6.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neo-2.7B-ep8.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neo-2.7B-ep10.sh
sbatch ft-opt-slurms/EleutherAI_gpt-j-6B-ep2.sh
sbatch ft-opt-slurms/EleutherAI_gpt-j-6B-ep4.sh
sbatch ft-opt-slurms/EleutherAI_gpt-j-6B-ep6.sh
sbatch ft-opt-slurms/EleutherAI_gpt-j-6B-ep8.sh
sbatch ft-opt-slurms/EleutherAI_gpt-j-6B-ep10.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neox-20b-ep2.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neox-20b-ep4.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neox-20b-ep6.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neox-20b-ep8.sh
sbatch ft-opt-slurms/EleutherAI_gpt-neox-20b-ep10.sh


In [21]:
import os
path = "results/fo"
for f in os.listdir(path):
    source = f.split("-")[0]
    model = "-".join(f.split("-")[:-4])
    for sf in os.listdir(os.path.join(path, f)):
        epoch = sf.split("-")[-1]
        for dataset in ["xsum", "squad", "writing"]:
            with open(os.path.join(path, f, sf, dataset, "perturbation_50_z_results.json"), "rb") as file:
                acc = json.load(file)["accuracy"]
            print(", ".join([model, epoch, dataset, str(acc)]))
            

EleutherAI_gpt-j-6B, 6, xsum, 0.529
EleutherAI_gpt-j-6B, 6, squad, 0.5112179487179487
EleutherAI_gpt-j-6B, 6, writing, 0.571
EleutherAI_gpt-j-6B, 2, xsum, 0.562
EleutherAI_gpt-j-6B, 2, squad, 0.5480769230769231
EleutherAI_gpt-j-6B, 2, writing, 0.643
EleutherAI_gpt-j-6B, 8, xsum, 0.534
EleutherAI_gpt-j-6B, 8, squad, 0.5192307692307693
EleutherAI_gpt-j-6B, 8, writing, 0.552
EleutherAI_gpt-j-6B, 4, xsum, 0.544
EleutherAI_gpt-j-6B, 4, squad, 0.530448717948718
EleutherAI_gpt-j-6B, 4, writing, 0.573
EleutherAI_gpt-j-6B, 10, xsum, 0.539
EleutherAI_gpt-j-6B, 10, squad, 0.5192307692307693
EleutherAI_gpt-j-6B, 10, writing, 0.55
EleutherAI_gpt-neox-20b, 6, xsum, 0.539
EleutherAI_gpt-neox-20b, 6, squad, 0.530448717948718
EleutherAI_gpt-neox-20b, 6, writing, 0.551
EleutherAI_gpt-neox-20b, 4, xsum, 0.544
EleutherAI_gpt-neox-20b, 4, squad, 0.5208333333333334
EleutherAI_gpt-neox-20b, 4, writing, 0.556
EleutherAI_gpt-neox-20b, 2, xsum, 0.539
EleutherAI_gpt-neox-20b, 2, squad, 0.5464743589743589
Eleuthe