In [8]:
from typing import Tuple
import os
import sys
import torch
import time
import json
import fire

from pathlib import Path

from fairscale.nn.model_parallel.initialize import initialize_model_parallel

sys.path.append("/coc/pskynet6/dhe83/mice/src")
import config
from utils import *

sys.path.append(config.llama)
from llama import ModelArgs, Transformer, Tokenizer, LLaMA


def setup_model_parallel() -> Tuple[int, int]:
    local_rank = int(os.environ.get("LOCAL_RANK", -1))
    world_size = int(os.environ.get("WORLD_SIZE", -1))

    torch.distributed.init_process_group("nccl")
    initialize_model_parallel(world_size)
    torch.cuda.set_device(local_rank)

    # seed must be the same in all processes
    torch.manual_seed(1)
    return local_rank, world_size


def load(
    ckpt_dir: str,
    local_rank: int,
    world_size: int,
    max_seq_len: int,
    max_batch_size: int,
) -> LLaMA:
    tokenizer_path = os.path.join(config.llama,"checkpoints",  "tokenizer.model")

    start_time = time.time()
    checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
    assert world_size == len(
        checkpoints
    ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {world_size}"
    ckpt_path = checkpoints[local_rank]
    print("Loading")
    checkpoint = torch.load(ckpt_path, map_location="cpu")
    with open(Path(ckpt_dir) / "params.json", "r") as f:
        params = json.loads(f.read())

    model_args: ModelArgs = ModelArgs(
        max_seq_len=max_seq_len, max_batch_size=max_batch_size, **params
    )
    tokenizer = Tokenizer(model_path=tokenizer_path)
    model_args.vocab_size = tokenizer.n_words
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    model = Transformer(model_args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model.load_state_dict(checkpoint, strict=False)

    generator = LLaMA(model, tokenizer)
    print(f"Loaded in {time.time() - start_time:.2f} seconds")
    return generator




In [10]:
ckpt_dir = os.path.join(config.llama, "checkpoints", "7B")

local_rank, world_size = setup_model_parallel()
if local_rank > 0:
    sys.stdout = open(os.devnull, "w")
model = load(ckpt_dir, local_rank, world_size, max_seq_len, max_batch_size)

ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable RANK expected, but not set

In [None]:
results = model.generate(
    [premise], max_gen_len=256, temperature=temperature, top_p=top_p
)

for result in results:
    print(result)
    print("\n==================================\n")

In [1]:
import os
import sys
import torch
import transformers
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

base_model = "decapoda-research/llama-7b-hf"


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /srv/nlprx-lab/share6/dhe83/miniconda3/envs/mice/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /srv/nlprx-lab/share6/dhe83/miniconda3/envs/mice/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /srv/nlprx-lab/share6/dhe83/miniconda3/envs/mice/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


In [2]:
tokenizer = LlamaTokenizer.from_pretrained(base_model, padding_side="left", use_fast=False)
model = LlamaForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
model = PeftModel.from_pretrained(
    model,
    "tloen/alpaca-lora-7b",
    torch_dtype=torch.float16,
)

# unwind broken decapoda-research config
model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2

model.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    model = torch.compile(model)

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [17]:
prompts = ["Tell me about alpacas"]
tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(device)

In [18]:
tokens

{'input_ids': tensor([[    0, 24948,   592,  1048,   394, 29886,   562,   294]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [42]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=tokens.input_ids,
        attention_mask=torch.zeros(tokens.input_ids.shape),
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)

In [43]:
print(tokenizer.batch_decode(outputs.sequences))

['<unk>Tell me about alpacas, and Below the Line, and the 2016 Academy Award-winning film Spotlight.\nThe film follows the story of a group of journalists from The Boston Globe who uncovered a massive cover-up of child abuse and the systemic failure of one of the most powerful institutions in the world: the Catholic Church.\nThe film stars Mark Ruffalo, Michael Keaton, Rachel McAdams, Liev Schreiber, John Sl']


In [40]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=tokens.input_ids[:, :-1],
        attention_mask=torch.zeros(tokens.input_ids[:, :-1].shape),
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)

In [41]:
print(tokenizer.batch_decode(outputs.sequences))

['<unk>Tell me about alpac, and Below the Line.\nThe first two are the most important. Above the Line is the part of the brain that is responsible for rational thought, logic, and problem solving. Below the Line is the part of the brain that is responsible for emotions, feelings, and instincts.\nThe Above the Line is the part of the brain that is responsible for rational thought, logic, and problem solving.\nThe Below the Line is the part of the brain that is responsible for']


In [63]:
prompts = [" ", " hi"]

In [64]:
tokenizer = LlamaTokenizer.from_pretrained(base_model, padding_side='left', use_fast=False)
model.config.pad_token_id = tokenizer.pad_token_id = 0  # unk
model.config.bos_token_id = 1
model.config.eos_token_id = 2
tokens = tokenizer(prompts, padding=True, return_tensors="pt").to(device)
tokens

normalizer.cc(51) LOG(INFO) precompiled_charsmap is empty. use identity normalization.


{'input_ids': tensor([[    0,     0,   259],
        [    0, 29871,  7251]], device='cuda:0'), 'attention_mask': tensor([[0, 1, 1],
        [1, 1, 1]], device='cuda:0')}

In [65]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b", use_fast=False)
tokens = tokenizer(prompts, padding=False, return_tensors="pt").to(device)
tokens

{'input_ids': tensor([[    2,  1437],
        [    2, 20280]], device='cuda:0'), 'attention_mask': tensor([[1, 1],
        [1, 1]], device='cuda:0')}

In [67]:
tokenizer.batch_decode(tokens.input_ids)

['</s> ', '</s> hi']

In [48]:
mask = torch.tensor(tokenizer.get_special_tokens_mask(tokens.input_ids))
zero_indices = mask == 0
non_zero_indices = mask != 0
mask[non_zero_indices] = 0
mask[zero_indices] = 1
mask

tensor([0, 1])

In [38]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=tokens.input_ids,
        temperature=0,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
    print(tokenizer.batch_decode(outputs.sequences))

[' ⁇    Below is an example of a valid JSON object.\n  \n{\n  "name": "John",\n  "age": 30\n}\n\n// Define a function that takes a JSON object and returns a string.\nfunction toString(obj) {\n  return JSON.stringify(obj);\n}\n\n// Create a new object with the given properties.\nfunction createObject(name, age) {\n  return {\n    name: name,\n']


In [33]:
with torch.no_grad():
    outputs = model.generate(
        **tokens,
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
    print(tokenizer.batch_decode(outputs.sequences))

[' ⁇   2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222']


In [7]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=tokens.input_ids,
        attention_mask=torch.zeros(tokens.input_ids.shape),
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
    print(tokenizer.batch_decode(outputs.sequences))

['<unk> \n Below is a code that prints out the number 10.\n#include <stdio.h>\n\nint main()\n{\n    printf("The number is %d\\n", 10);\n    return 0;\n}\n']


In [10]:
torch.tensor([[0,1]])

tensor([[0, 1]])

In [11]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=tokens.input_ids,
        attention_mask=torch.tensor([[0,1]]),
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
    print(tokenizer.batch_decode(outputs.sequences))

['<unk>  Below is an example of a valid JSON object.\n  \n{\n  "name": "John",\n  "age": 30\n}\n\n// Define a function that takes a JSON object and returns a string.\nfunction toString(obj) {\n  return JSON.stringify(obj);\n}\n\n// Create a new object with the given properties.\nfunction createObject(name, age) {\n  return {\n    name: name,\n']


In [8]:
with torch.no_grad():
    outputs = model.generate(
        input_ids=tokens.input_ids,
        attention_mask=torch.ones(tokens.input_ids.shape),
        max_new_tokens=100,
        return_dict_in_generate=True,
        output_scores=True)
    print(tokenizer.batch_decode(outputs.sequences))

['<unk> 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222']


In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-6.7b", padding_side="left", use_fast=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Exception ignored in: <function tqdm.__del__ at 0x7f8a0bfbfb80>
Traceback (most recent call last):
  File "/srv/nlprx-lab/share6/dhe83/miniconda3/envs/mice/lib/python3.9/site-packages/tqdm/std.py", line 1144, in __del__
    def __del__(self):
KeyboardInterrupt: 


In [15]:
from math import ceil
def batch_inference(model, tokenizer, prompts, batch_size):                                              
 output_tokens = torch.empty(0, dtype=torch.int64).to('cuda:0')                                       

 num_batches = ceil(len(prompts) / batch_size)                                                        

 for batch in range(num_batches):                                                                     
     start = batch * batch_size                                                                       
     end = min((batch + 1) * batch_size, len(prompts))                                                

     # tokenize by batch to mitigate effect of long outliers                                          
     tokens = tokenizer(prompts[start:end], padding=True, return_tensors="pt").to('cuda:0')           
     with torch.no_grad():                                                                            
         outputs = model.generate(                                                                    
             input_ids=tokens.input_ids, 
#              attention_mask=tokens.attention_mask,
             max_new_tokens=100,                                                                        
             temperature=0,                                                                           
             return_dict_in_generate=True,                                                            
             output_scores=True,                                                                      
             eos_token_id=198,  # special character 'ċ' (bytecode for new line?) note use this for    generation                                                                                               
         )                                                                                            
     output_tokens = torch.cat((output_tokens, outputs.sequences[:, -100:]))                      

 output_text = tokenizer.batch_decode(output_tokens)                                                  
 return output_text

In [16]:
batch_inference(model, tokenizer, prompts, 2)

/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [15,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [15,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [15,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [15,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/conda-bld/pytorch_1670525539683/work/aten/src/ATen/native/cuda/Indexing.cu:1088: indexSelectSmallIndex: block: [15,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/opt/conda/cond

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [1]:
from transformers import GenerationConfig
GenerationConfig.from_pretrained(base_model)

NameError: name 'base_model' is not defined