In [None]:
#@markdown # Easy Softprompt Tuner
# Environment setup for this Colab cell: mount Google Drive, install
# mtj-softtuner, then import the trainer class used by the rest of the cell.
# The lines starting with "!" are IPython/Colab shell escapes, not Python.

# Mount Google Drive so the /content/drive/MyDrive/... paths used later in
# this cell (save file, exported soft prompt) are reachable
from google.colab import drive
drive.mount('/content/drive/')

# Fetch mtj-softtuner and run the install script shipped in its repository
!git clone https://github.com/ve-forbryderne/mtj-softtuner
!bash mtj-softtuner/install.sh

# Install the aria2 download utility
# NOTE(review): presumably needed by mtj-softtuner for model downloads - confirm
!apt install aria2 -y

# Must stay below the install steps: the package is only importable once
# install.sh has run
from mtj_softtuner import BasicTrainer

# Change this to an integer (e.g. 1) if you want trainer.data to persist after
# the Colab runtime is restarted
# (with None, as set here, presumably nothing is persisted - confirm against
# the mtj-softtuner documentation)
universe = None

# Changing this to True causes traceback of certain error messages to be hidden
quiet = False

# Create the trainer; every later step in this cell configures it through
# trainer.data or calls one of its methods
trainer = BasicTrainer(universe, quiet=quiet)

# Path to a Mesh Transformer JAX model, or the model ID of a Hugging Face model
# such as "KoboldAI/fairseq-dense-13B"
# The value set below is not one of the dropdown suggestions; that is accepted
# because the form field is declared with allow-input
#@markdown Select a model or pick one from Hugging Face (GPT-Neo, GPT-J and XGLM based models are supported)
trainer.data.ckpt_path = "togethercomputer/GPT-JT-6B-v1" #@param ["EleutherAI/gpt-j-6B", "KoboldAI/fairseq-dense-13B", "EleutherAI/gpt-neo-2.7B", "EleutherAI/gpt-neo-1.3B", "facebook/opt-13b", "facebook/opt-6.7b"] {allow-input: true}
# NOTE(review): presumably loads the metadata of the Hugging Face checkpoint
# selected above; called right after ckpt_path is set - confirm
trainer.get_hf_checkpoint_metadata()

# Location of the save file (if the file does not exist it will be created), you
# can specify the path to an existing save file created by mtj-softtuner to
# continue from an earlier point in the training
# The default lives on the mounted Google Drive (/content/drive/MyDrive)
#@markdown Select a save file for your prompt. You can leave this at the default, but if you wish to start a new softprompt the file should not exist yet.
trainer.data.save_file = "/content/drive/MyDrive/newsoftprompt.mtjsp" #@param["/content/drive/MyDrive/softprompt.mtjsp"] {allow-input: true}

# Set the initial soft prompt string, this will be ignored if we are continuing
# from an existing save file
#
# Two ways to initialise the soft prompt:
#   * trainingprompt left empty -> use the "vocab_sample" prompt method with a
#     fixed number of soft prompt tokens (soft_in_dim)
#   * trainingprompt is a path  -> read that text file, tokenize it and use the
#     resulting token ids as the initial soft prompt
#@markdown Optionally add the location of a training prompt txt containing a high quality snippet of your dataset around 70 words.
trainingprompt = "" #@param ["/content/drive/MyDrive/prompt.txt", ""] {allow-input: true}
if not trainingprompt:
  trainer.data.prompt_method = "vocab_sample"
  #@markdown In case you left the prompt blank you can specify here how many tokens your prompt takes up (Larger is more knowledge but takes up more space from the story context when you use your softprompt)
  trainer.data.soft_in_dim = 128 #@param ["20", "40", "60", "80"] {type:"raw", allow-input: true}
else:
  # Read the prompt explicitly as UTF-8 so decoding does not depend on the
  # runtime's locale settings
  with open(trainingprompt, encoding="utf-8") as f:
    initial_softprompt = f.read()

  tokenizer = trainer.get_tokenizer()
  if trainer.data.newlinemode == "s":  # Handle fairseq-style newlines if required
    initial_softprompt = initial_softprompt.replace("\n", "</s>")
  # max_length is set far beyond any realistic prompt, so truncation never
  # actually shortens the text
  # NOTE(review): presumably passed only to avoid the tokenizer's length
  # warning - confirm
  trainer.data.initial_softprompt = tokenizer.encode(
    initial_softprompt, max_length=int(2e9), truncation=True
  )

# Do this to generate an NPY file for your dataset if you haven't already done so
# The tokenized dataset is written to output_file inside the Colab runtime
# (/content), not to Google Drive
# NOTE(review): the default dataset_path is a single .txt file while the
# dropdown suggestion is a directory; presumably both are accepted - confirm
#@markdown You will need a dataset that contains your stories in plain text .txt files, adjust the location if necessary (Unicode not supported, unix line endings recommended).
dataset_path = "/content/data.txt"  #@param ["/content/drive/MyDrive/dataset/"] {allow-input: true}
output_file = "/content/dataset.npy"
# NOTE(review): the value configured below (224) matches neither of the values
# suggested in the form text (2048 / 400) - confirm it is intentional
#@markdown For 13B adjust the batch size to 400, everything else can remain 2048
batch_size = 224 #@param ["2048", "400"] {type:"raw", allow-input: true}
# NOTE(review): 256 epochs is configured here although the form text says one
# epoch is usually enough - only appropriate for a very small dataset
#@markdown For most use cases one epoch is enough, increase if you use a very small dataset.
epochs =  256#@param {type:"raw", allow-input: true}
trainer.tokenize_dataset(dataset_path, output_file, batch_size, epochs)

# Point the trainer at the NPY file that was just generated
dataset_file = output_file
trainer.data.dataset_file = dataset_file
#@markdown If you picked a small batch size increase this value accordingly (Suggested values are in the dropdown)
trainer.data.gradient_accumulation_steps = 16 #@param ["16", "64"] {type:"raw", allow-input: true}

# Set training hyperparameters here; see the demo notebook for explanation of
# what these mean
# Only the learning rate is exposed as a form field; the remaining values are
# fixed in this cell
#@markdown Adjusting the learning rate affects how strongly the AI learns from the data, 3e-5 is a safe default. Only adjust if your softprompt breaks during training.
learning_rate = 3e-5 #@param ["3e-5"] {type:"raw", allow-input: true}
trainer.data.stparams = {
    "lr": learning_rate,
    "max_grad_norm": 10.0,
    "weight_decay": 0.1,
    "warmup": 0.1,
    "end_lr_multiplier": 0.1,
    "save_every": 10,
}

# Now, begin training!
# Uses everything configured on trainer.data above
trainer.train()

# Export to KoboldAI/mkultra format
# The trained soft prompt is written twice: as a KoboldAI .zip (path chosen in
# the form below) and as an mkultra .json at a fixed path on Google Drive.
import os  # local to this step: only used to prepare the export directories

#@markdown And finally pick a name and description for your softprompt
output_file = "/content/drive/MyDrive/KoboldAI/softprompts/my_softprompt.zip" #@param ["/content/drive/MyDrive/my_softprompt.zip"] {allow-input: true}
name = "NSFW Content Warning" #@param ["Your Softprompt Name Here"] {allow-input: true}
author = "Julian Herrera/Puffy310@BirdL" #@param [""] {allow-input: true}
# The "supported" field is taken from the model selected earlier; the trailing
# comment is not an active form field (it lacks the "@")
supported = trainer.data.ckpt_path #param ["Generic 2.7B", "Generic 6B", "Generic 13B"] {allow-input: true}
description = "Serves as a base for content warning systems" #@param ["Your Softprompt Description Here"] {allow-input: true}
# Make sure the destination folder exists before exporting, so a missing
# folder on Drive cannot make the export fail after training has finished.
# "or '.'" covers a bare file name, whose dirname is the empty string.
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
trainer.export_to_kobold(output_file, name, author, supported, description)
output_file = "/content/drive/MyDrive/my_softprompt.json"
os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
soft_prompt_name = name
# The mkultra description is the supported-model string and the description,
# joined by " - "
soft_prompt_description = supported + " - " + description
trainer.export_to_mkultra(output_file, soft_prompt_name, soft_prompt_description)

