**Welcome to the KoboldAI Colab Service GPT-J-6B Rev 2 Notebook!**<br/>
*Note: This colab is intended to be used with the KoboldAI Client, [which can be downloaded from GitHub here](https://github.com/KoboldAI/KoboldAI-Client).*

**Things you will need:**<br/>
- A Google Drive account
- A copy of the Torch-converted [GPT-J-6B checkpoint from here](https://drive.google.com/file/d/1NXP75l1Xa5s9K18yf3qLoZcR6p4Wced1/view?usp=sharing)

**Preparation:**<br/>
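Click the link to the checkpoint archive and make a shortcut to it using the "Add shortcut to Drive" button in the top right corner. Place the shortcut in the top level of "My Drive" and keep the name `j6b_ckpt.tar`, because Step 3 reads the archive from `/content/drive/MyDrive/j6b_ckpt.tar`.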

In [None]:
#@title <b>Step 1 - Install Dependencies</b>
#@markdown Press the Play button and wait for the script to finish.
from IPython.display import clear_output
from termcolor import colored

!pip install flask-ngrok
!pip install git+https://github.com/finetuneanon/transformers@gpt-neo-localattention3
!pip install termcolor
!pip install flask_cloudflared
clear_output()
print(colored("Installing DONE!", "green"))

In [None]:
#@title <b>Step 2 - Adjust Your Settings</b>
#@markdown 1. Connect via Ngrok or Cloudflare?
# Tunnel provider chosen in the form. The Step 3 and Step 4 cells compare this
# value against "Cloudflare" and "Ngrok" to decide which tunnel helper to
# import and apply, so this cell has to run before them.
connect_method = "Ngrok" #@param ["Ngrok", "Cloudflare"]
#@markdown 2. Press Play button to lock in settings <b>(Do not skip!)</b>

In [None]:
#@title <b>Step 3 - Initialize Model</b> { display-mode: "form" }
#@markdown Press the Play button. You will be asked to link your 
#@markdown Google Drive account to this colab. Click the link that 
#@markdown pops up, give Drive permission at the prompt, then copy 
#@markdown the access code that's presented back into this notebook.
#@markdown <br/><br/>
#@markdown This notebook will then extract the tar file and initialize 
#@markdown the AI model. <b>This will take several minutes.</b>
#@markdown When the word DONE! is displayed, you can move on to
#@markdown the next Step.

from flask import Flask, redirect, url_for, request
import json
import torch
import requests
import subprocess
from transformers import AutoConfig, GPTNeoForCausalLM, AutoTokenizer
import tarfile
from google.colab import drive
import os
import re
import time
from threading import Timer

if connect_method == "Cloudflare":
   from flask_cloudflared import run_with_cloudflared
elif connect_method == "Ngrok":
   from flask_ngrok import run_with_ngrok

# Placeholders; both names are rebound further down once loading succeeds.
model         = None
tokenizer     = None

# Only mount Drive and unpack when the checkpoint directory is not already
# present, so re-running this cell in the same session skips the slow path.
# NOTE(review): a failed or interrupted extraction can leave a partial
# "j6b_ckpt" directory behind, which makes this check skip the unpack on the
# next run - delete the directory manually in that case.
if not os.path.isdir("j6b_ckpt"):
    # Get connected to Google Drive
    print(colored("Requesting Google Drive access...", "magenta"))
    drive.mount('/content/drive/')
    # Set path to tar file and unpack it
    model_gdrive = "/content/drive/MyDrive/j6b_ckpt.tar"
    print(colored("Unpacking tar file, please wait...", "magenta"))
    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() writes wherever the member paths point, so
    # only use an archive obtained from a source you trust.
    with tarfile.open(model_gdrive, "r") as tar:
        tar.extractall()

# Initialize the model
print(colored("Initializing model, please wait...", "magenta"))

# Use the GPT-Neo 2.7B configuration as a template and overwrite the fields
# below with the values this notebook uses for its checkpoint.
_layer_count = 28
_head_count = 16
_width_per_head = 256

config = AutoConfig.from_pretrained("EleutherAI/gpt-neo-2.7B")
# Every layer uses global attention; both spellings of the setting are filled in.
config.attention_layers = ["global" for _ in range(_layer_count)]
config.attention_types = [["global"], _layer_count]
config.num_layers = _layer_count
config.num_heads = _head_count
config.hidden_size = _width_per_head * _head_count
config.vocab_size = 50400
# Options read by the patched transformers build installed in Step 1.
config.rotary = True
config.rotary_dim = 64
config.jax = True

try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping

class Checkpoint(MutableMapping):
    """Read-only, lazily loaded view of the checkpoint stored in ``j6b_ckpt``.

    ``j6b_ckpt/m.pt`` is loaded once into ``self.checkpoint``. Looking up a key
    passes the value stored under that key to ``torch.load`` and returns the
    result, so entries are read on demand instead of all at once.
    (Presumably each stored value is the path of a per-tensor file - confirm
    against the archive layout.)

    Writes and deletes are accepted but silently ignored.
    """

    def __init__(self):
        # Index object; loaded on the CPU.
        self.checkpoint = torch.load("j6b_ckpt/m.pt", map_location="cpu")

    def __len__(self):
        """Return the number of entries in the index."""
        return len(self.checkpoint)

    def __getitem__(self, key):
        """Load and return the entry for ``key`` (``KeyError`` if unknown)."""
        return torch.load(self.checkpoint[key], map_location="cpu")

    def __setitem__(self, key, value):
        """Ignore assignments; the mapping is read-only."""
        return

    def __delitem__(self, key):
        """Ignore deletions; the mapping is read-only.

        Takes only ``key``, as the mapping protocol requires, so that
        ``del ckpt[key]`` and the inherited ``pop``/``clear`` helpers do not
        fail with ``TypeError``.
        """
        return

    def keys(self):
        """Return the keys of the index without loading any entry."""
        return self.checkpoint.keys()

    def __iter__(self):
        # NOTE(review): this yields (key, loaded value) pairs rather than bare
        # keys, which differs from the usual mapping protocol. Left unchanged
        # because the loader this object is handed to may rely on it - confirm
        # before altering.
        for key in self.checkpoint:
            yield (key, self.__getitem__(key))

    def __copy__(self):
        """Return a fresh instance, which reloads the index from disk."""
        return Checkpoint()

    def copy(self):
        """Return a fresh instance, which reloads the index from disk."""
        return Checkpoint()

# Build the model from the lazily loaded checkpoint instead of a model name.
model = GPTNeoForCausalLM.from_pretrained(
    pretrained_model_name_or_path=None,
    config=config,
    state_dict=Checkpoint(),
)

# Initialize the tokenizer and set up the bad_words_ids to exclude Author's Note tags
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab = tokenizer.get_vocab()
vocab_keys = vocab.keys()


def find_keys(char):
    """Return every vocabulary entry whose text contains ``char``."""
    return [key for key in vocab_keys if char in key]


# Collect the matches for each pattern in turn, keeping the search order.
bad_words = [
    entry
    for pattern in ("[", " [", "<|endoftext|>")
    for entry in find_keys(pattern)
]
# Each banned entry becomes its own single-token id list.
bad_words_ids = [[vocab[entry]] for entry in bad_words]

clear_output()
print(colored("DONE!", "green"))

In [None]:
#@title <b>Step 4 - Run Web Service</b> { display-mode: "form" }
#@markdown Press the Play button. Flask will start and give you a 
#@markdown Cloudflare or Ngrok address which looks like this:<br/>
#@markdown <i>https://\<unique id\>.trycloudflare.com/</i><br/>
#@markdown <i>http://\<unique id\>.ngrok.io/</i><br/>
#@markdown You will need to right-click this and copy the address.
#@markdown Start the KoboldAI Client on your computer and choose 
#@markdown Google Colab as the model. You will be asked to paste 
#@markdown the address into the terminal.<br/><br/>
#@markdown If your session is interrupted, you can just restart
#@markdown this cell to get a new address without reinitializing
#@markdown the model.<br/><br/>
#@markdown <b>The first generation takes around a minute due to 
#@markdown compilation, but after that it should take less than 
#@markdown 10 seconds per sample.</b>

# Create the Flask application that serves the generation endpoint.
app = Flask(__name__)

# Wrap the app with the tunnel helper chosen in Step 2. Only the helper for
# the selected method was imported in Step 3, so each branch touches just its
# own name.
if connect_method == "Ngrok":
    run_with_ngrok(app)
elif connect_method == "Cloudflare":
    run_with_cloudflared(app)

@app.route("/")
def home():
    """Serve a static banner at the root URL to show the service is running."""
    banner = "<h1>KoboldAI Colab Service Running!</h1>"
    return banner

@app.route('/request',methods = ['POST'])
def koboldrequest():
    """Generate text for a client request and return it as a JSON response.

    The JSON request body must contain "text", "min", "max", "rep_pen",
    "temperature", "top_p", "top_k" and "tfs". Two optional keys keep older
    clients working: "numseqs" (default 1) and "retfultxt" (default True).

    Returns:
        A 200 response holding {"data": {"text": ...}} when "retfultxt" is
        true, or {"data": {"seqs": [...]}} otherwise. A 400 response holding
        {"error": {"extensions": {"code": ...}}} is returned when nothing was
        generated or when any exception is raised while handling the request.
    """
    def json_response(payload, status):
        # Single place that serialises a payload into a JSON HTTP response.
        return app.response_class(
            response=json.dumps(payload),
            status=status,
            mimetype='application/json'
        )

    try:
        clear_output()
        js      = request.json
        txt     = js["text"]
        # Named min_len/max_len so the builtins min() and max() are not shadowed.
        min_len = js["min"]
        max_len = js["max"]
        rep_pen = js["rep_pen"]
        temp    = js["temperature"]
        top_p   = js["top_p"]
        top_k   = js["top_k"]
        tfs     = js["tfs"]

        # Non-positive sampler values mean "disabled"; pass None in that case.
        top_p = top_p if top_p > 0.0 else None
        top_k = top_k if top_k > 0 else None
        tfs = tfs if tfs > 0.0 else None

        # Compatibility with un-updated clients
        numseqs   = js["numseqs"] if "numseqs" in js else 1
        retfultxt = js["retfultxt"] if "retfultxt" in js else True

        print(colored("Received Data: {0}".format(txt), "yellow"))

        torch.cuda.empty_cache()
        print(colored("Generating text, please wait...", "green"))

        tokens = tokenizer(txt, return_tensors="pt").input_ids.to("cpu")
        context_len = len(tokens[0])
        ids = tokens.long().cuda()

        gen_tokens = model.generate(
            ids,
            do_sample=True,
            min_length=min_len,
            max_length=max_len,
            temperature=temp,
            top_p=top_p,
            top_k=top_k,
            tfs=tfs,
            repetition_penalty=rep_pen,
            use_cache=True,
            bad_words_ids=bad_words_ids,
            num_return_sequences=numseqs
        ).long()

        genout = []
        for tkns in gen_tokens:
            if not retfultxt:
                # Strip context tokens out of returned sequences. Slicing from
                # the context length also yields an empty result when nothing
                # new was generated; a negative offset of 0 would instead
                # return the whole context.
                tkns = tkns[context_len:]
            # Drop token id 50256 before decoding.
            tkns = [tkn for tkn in tkns if tkn != 50256]
            genout.append(tokenizer.decode(tkns))
        torch.cuda.empty_cache()

        if len(genout) > 0 and genout[0] != "":
            if retfultxt:
                # Outdated client, send old JSON format
                print(colored("Generated Text: {0}".format(genout[0]), "cyan"))
                return json_response({"data": {"text": genout[0]}}, 200)

            # New client format with numseq support
            for i, seq in enumerate(genout):
                print(colored("[Result {0}]\n{1}".format(i, seq), "cyan"))
            return json_response({"data": {"seqs": genout}}, 200)

        # Nothing usable was generated: report it to the client instead of
        # falling through and returning None from the view.
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        return json_response(
            {"error": {"extensions": {"code": "Something went wrong during generation!"}}},
            400
        )

    except Exception as e:
        # Request boundary: convert any failure into a JSON error response so
        # the client receives the message.
        print(colored("[ERROR] Something went wrong during generation!", "red"))
        print(colored("{0}".format(e), "red"))
        return json_response(
            {"error": {"extensions": {"code": "Something went wrong during generation! {0}".format(e)}}},
            400
        )

# Announce readiness, then start the Flask app that was wrapped with the
# selected tunnel helper earlier in this cell.
print(colored("Startup complete! Running web service.", "green"))
app.run()