In [1]:
import time
import bitsandbytes
import sys
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig, AutoModelForCausalLM
from drugs.dgenerate import DRUGS

# Checkpoint identifier shared by the tokenizer and the model.
model_id = "NousResearch/Llama-2-7b-chat-hf"

# Tokenizer-side setup: the end-of-sequence token id doubles as the padding id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

# Text streamer built on the tokenizer; it is handed to generation in the chat cell.
streamer = TextStreamer(tokenizer)

# Load the unmodified ("sober") model in 8-bit with automatic device placement,
# and switch it to evaluation mode (Module.eval() returns the module itself).
sober_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,
    device_map="auto",
).eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



### Dosage
The dose ranges from 0 to pi, where pi is already way too much. You can go higher, but don't.

In [2]:
# Create the injector, set the "A" dose angle, then inject the sober model.
# The object returned by inject() is bound to `model` and is what the chat cell
# generates with.
drugs = DRUGS()
# 0.1 sits at the low end of the 0 - pi dosage range described above.
# You can also specify K_dose_theta, V_dose_theta, Q_dose_theta, or any
# combination of the four.
drugs.set_A_dose_theta(0.1)
model = drugs.inject(sober_model)

[Drugs] Injected drugs into 32 attention classes.


### DRµG profile
Advanced control. Lets you specify how much noise is injected at each depth of the network. You can use 'interpolate' as the mode to vary smoothly between the points you specify.


In [5]:
# Centre of the dosed region (how deep to shove the needle in).
injection_depth = 0.4
# Half-width of the dosed region on either side of the injection site.
spread = 0.1

# The profile is a (points, mode) pair. The four points put peakratio 1 across
# [injection_depth - spread, injection_depth + spread] and drop it to 0 just
# outside that window (at 1.01 * spread from the centre).
# NOTE(review): the exact meaning of the 'ceil' mode is defined by the DRUGS
# library and is not visible in this notebook.
drug_profile = (
    [
        {'depth': depth, 'peakratio': peakratio}
        for depth, peakratio in (
            (injection_depth - (spread * 1.01), 0),
            (injection_depth - spread, 1),
            (injection_depth + spread, 1),
            (injection_depth + (spread * 1.01), 0),
        )
    ],
    'ceil',
)

# Each profile (A, K, Q, or V) can be independently injected into different
# layers, if you are especially picky about what your noise is doing to which things.
drugs.set_A_dose_shape(drug_profile)

## Chat

By default this notebook prompts you for input. If viewed in a browser, a dialogue will pop up asking you to say something. 

Note that all variety in the model's responses is due purely to the noise being injected; the selected token is ALWAYS whatever the model thinks is the most likely one!

In [None]:
# Interactive chat loop.
#
# The first turn is built from a system persona plus the user's opening
# message; every later turn sends only the new user message through the chat
# template. The loop runs until the prompt is interrupted (kernel interrupt)
# or input reaches end-of-file, at which point it exits cleanly instead of
# surfacing a traceback.

# input() already returns str, so no str() wrapper is needed.
# NOTE(review): the leading "\b" (backspace) in the prompt is kept as-is;
# presumably a display workaround for the notebook prompt — confirm.
initial_input = input("\bAsk Something:")
tokenized_start = tokenizer.apply_chat_template([
    {'role': 'system',
     'content': 'You are Alan Watts.'},
    {'role': 'user',
     'content': initial_input}
], return_tensors='pt')

# Inference only: no gradients are needed while generating.
with torch.no_grad():
    while True:
        # Output is streamed through `streamer` as it is generated; the return
        # value is kept in `generated_tokens` for inspection after the loop.
        generated_tokens = model.Dgenerate(
            input_ids=tokenized_start,
            streamer=streamer,
        )
        print("\n\nAsk Something:", end="")
        model.cold_shower(True) #Sets the kv-cache back to theoretically pure baseline, if this is important to you.
        try:
            await_input = input(": ")
        except (EOFError, KeyboardInterrupt):
            # Interrupting the kernel or closing stdin at the prompt is the
            # only way out of this loop; treat it as "end of conversation".
            print("\n[chat ended]")
            break
        tokenized_start = tokenizer.apply_chat_template([{
            'role': 'user',
            'content': await_input}], return_tensors="pt")