In [1]:
%%time
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

from dataclasses import dataclass, field
from typing import Optional
import contextlib

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
)


from trl import SFTTrainer

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)

# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="bfloat16",
#     bnb_4bit_use_double_quant=False,
# )

# device_map = {"": 0}
model = "bigcode/starcoderplus"
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model, quantization_config=None, 
    device_map=None, 
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16,
)

model_id = "smangrul/peft-lora-starcoderplus-chat-asst-A100-40GB-colab"
model = PeftModel.from_pretrained(model, model_id, adapter_name="assistant")

model_id = "smangrul/peft-lora-starcoderplus-personal-copilot-A100-40GB-colab"
_ = model.load_adapter(model_id, adapter_name="copilot")

if not hasattr(model, "hf_device_map"):
    model.cuda()

[2023-08-01 03:18:32,126] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

CPU times: user 6min 54s, sys: 52.6 s, total: 7min 47s
Wall time: 4min 59s


In [124]:
system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully \
as possible, while being safe. Your answers should not include any harmful, \
unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that \
your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why \
instead of answering something not correct. If you don’t know the answer to a \
question, please don’t share false information."""

def get_model_pred(query, disable=False):
    context = contextlib.nullcontext
    if disable:
        context = model.disable_adapter
    text = prompt = f"<|system|> {system_prompt} <|endoftext|> <|prompter|> {query} <|endoftext|> <|assistant|>"
    model.eval()
    with context():
        outputs = model.generate(input_ids=tokenizer(text, return_tensors="pt").input_ids.cuda(), 
                                 max_new_tokens=512,
                                 temperature=0.01,
                                 top_k=50,
                                 top_p=0.95,
                                 do_sample=True,
                                 repetition_penalty=1.2,
                                 eos_token_id = tokenizer.eos_token_id)
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]



### Disabled Adapters

In [28]:
query = """
How to use scrapy? Explain with an example.
"""
print(get_model_pred(query, disable=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
How to use scrapy? Explain with an example.
 <|endoftext|> <|assistant|> 
1) What's up?

2) How can I be useful for you today? 

3) Do you have anything else on mind? 
 
4) Is there anyone who needs my assistance right now?
 
5) Can i do it later?

6) No problem! Have fun :)

7) Bye bye! See ya soon :D


<|endoftext|>


In [29]:
query = """
Write a code snippet for using LoRA method from 🤗 PEFT.
"""
print(get_model_pred(query, disable=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
Write a code snippet for using LoRA method from 🤗 PEFT.
 <|endoftext|> <|assistant|> 
1. Create an instance of `PefT` with default parameters:
```python
    pef_t = PefT()
```

2. Call `.fit()` on it passing data (X) and labels(y):
```python
    # X - array-like shape=(n_samples,)
    # y - array-like shape=(n_samples,), dtype=int

    model = pef_t.fit(X, y)
```

3. Use trained classifier by calling its methods like this example below:
```python
  

### Enable `Assistant` Adapter

In [75]:
model.set_adapter("assistant")
query = """
How to use scrapy? Explain with an example.
"""
print(get_model_pred(query))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
How to use scrapy? Explain with an example.
 <|endoftext|> <|assistant|> Scrapy is a popular web crawling framework for Python developers. It allows users to extract data from websites using simple but powerful programming constructs such as selectors (XPath/CSS) and spiders. Here's how we can use it:

1. Install Scrapy on your system by running `pip install scrapy`. This will download all necessary dependencies automatically.
2. Create a new projec

In [31]:
query = """
Write a code snippet for using LoRA method from 🤗 PEFT.
"""
print(get_model_pred(query))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
Write a code snippet for using LoRA method from 🤗 PEFT.
 <|endoftext|> <|assistant|> Sure! Here's an example Python script that uses Fairseq's implementation of LoRa:

1. Install required packages by running pip install -r requirements.txt (you may need to use sudo if installing system-wide)
2. Download pre-trained model checkpoint files from https://dl.fbaipublicfiles.com/fairseq/models/lora_base.tar.gz into the same directory as this python file. 

### Enable `Copilot` Adapter


In [32]:
model.set_adapter("copilot")
query = """
How to use scrapy? Explain with an example.
"""
print(get_model_pred(query))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
How to use scrapy? Explain with an example.
 <|endoftext|> <|assistant|> I can't understand what you're asking me. Can you rephrase it?|
I'm trying to learn how to use Scrapy.|
Can someone tell me how to do this?|
What's the best way to get started using Scrapy?|
Is there anything else I need to know before starting?|
Do I have to install anything special on my computer?|
Will I be able to run everything from here without having to download addition

In [33]:
query = """
Write a code snippet for using LoRA method from 🤗 PEFT.
"""
print(get_model_pred(query))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
Write a code snippet for using LoRA method from 🤗 PEFT.
 <|endoftext|> <|assistant|>
A chatbot which can perform various tasks such as translating text into different languages, converting units between metric/imperial systems etc., solving mathematical equations (calculus), generating random numbers within specified ranges, playing chess against itself, returning weather forecasts based on user inputted location data, among others.|
<fim_middle># c

In [51]:
stop_list = ['\n', ':', tokenizer.eos_token]

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids


[[203], [44], [0]]

In [52]:
import torch

stop_token_ids = [torch.LongTensor(x).cuda() for x in stop_token_ids]
stop_token_ids

[tensor([203], device='cuda:0'),
 tensor([44], device='cuda:0'),
 tensor([0], device='cuda:0')]

In [53]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])


In [195]:
def get_code_completion(prefix, suffix, disable=False):
    context = contextlib.nullcontext
    if disable:
        context = model.disable_adapter
    text = prompt = f"""<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"""
    model.eval()
    with context():
        outputs = model.generate(input_ids=tokenizer(text, return_tensors="pt").input_ids.cuda(), 
                                 max_new_tokens=512,
                                 temperature=0.3,
                                 top_k=50,
                                 top_p=0.95,
                                 do_sample=True,
                                 repetition_penalty=1.2,
                                 #stopping_criteria=stopping_criteria
                                )
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)[0]

### Disabled Adapters

In [92]:
prefix = """
# Here is the correct implementation of the two sum code exercise
# time complexity: O(N)
# space complexity: O(N)
def two_sum(arr, target_sum):
"""

suffix = """"""
print(get_code_completion(prefix, suffix, disable=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>
# Here is the correct implementation of the two sum code exercise
# time complexity: O(N)
# space complexity: O(N)
def two_sum(arr, target_sum):
<fim_suffix><fim_middle>    dict = {}
    for i in range (len(arr)):
        if arr[i] not in dict.keys():
            dict[target_sum-arr[i]]=i
        else:
            return [dict[arr[i]], i+1]

<|endoftext|>


In [93]:
prefix = """from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = """

suffix = """"""
print(get_code_completion(prefix, suffix, disable=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = <fim_suffix><fim_middle>accelerator.prepare(
        model=model, 
        optimizer=optimizer,  
        train_loader=train_loader)<jupyter_output><empty_output><|endoftext|>


### Enable `Assistant` Adapter

In [94]:
prefix = """from peft import LoraConfig, TaskType

peft_config = LoraConfig("""

suffix = ")"
model.set_adapter("assistant")
print(get_code_completion(prefix, suffix))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>from peft import LoraConfig, TaskType

peft_config = LoraConfig(<fim_suffix>)<fim_middle>
    task=TaskType.SEND_PACKETS <|endoftext|>


In [95]:
prefix = """from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = """

suffix = """"""
print(get_code_completion(prefix, suffix))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = <fim_suffix><fim_middle>accelerator.prepare(
    model=model, 
    optimizer=optimizer, 
    dataloaders={'train': train_loader}, 
    lr_scheduler=scheduler)


for epoch in range(num_epochs):

    # Train for one epoch
    loss = train(model, optimizer, training_dataloader)
    
    # Evaluate the performance on validation set at end of each epoch
    valid_loss = evaluate(model, eval_loader)
    
    if not math.isfinite(valid_loss):
        print("Validation loss is non-finite! Skipping checkpoint.")
        continue
        
    scheduler.step()
    
# Save a trained model
torch.save({'epoch': epoch + 1,'state_dict': model.state_dict(), 'optim_dict' : optimizer.state_dict()}, save_path)<jupyter_output><empty_output><|endoftext|>


### Enable `Copilot` Adapter

In [96]:
prefix = """from peft import LoraConfig, TaskType

peft_config = LoraConfig("""

suffix = ")"
model.set_adapter("copilot")
print(get_code_completion(prefix, suffix))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>from peft import LoraConfig, TaskType

peft_config = LoraConfig(<fim_suffix>)<fim_middle>task_type=TaskType.SEQ_2_SEQ<|endoftext|>


In [98]:
prefix = """from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = """

suffix = """"""
print(get_code_completion(prefix, suffix))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = <fim_suffix><fim_middle>accelerator.prepare(
    model=model, optimizer=optimizer, train_dataloader=training_dataloader, lr_scheduler=scheduler)

<|endoftext|>


### Combining Multiple Adapters

In [187]:
%%time
#model.delete_adapter("code_buddy")
model.add_weighted_adapter(["copilot","assistant"], [0.9, 0.6], "code_buddy", combination_type="linear")
model.set_adapter("code_buddy")

CPU times: user 25.2 s, sys: 7.01 ms, total: 25.2 s
Wall time: 440 ms


### Test assistance and conversation ability

In [188]:
query = """
write a function to compute the quantiles of a normal distribution given number of buckets `n`.
"""
print(get_model_pred(query))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
write a function to compute the quantiles of a normal distribution given number of buckets `n`.
 <|endoftext|> <|assistant|> Here's an example using NumPy:
```python
import numpy as np


def get_quantile(arr, n):
    """Returns array containing values at each bucket."""

    # Sort input vector arr from smallest value to largest for numerical stability
    sorted = np.sort(arr)
    
    # Get cumulative sum of probabilities up until current index (i

In [189]:
query = """
Write a code snippet for using LoRA method from 🤗 PEFT.
"""
print(get_model_pred(query))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<|system|> You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don’t know the answer to a question, please don’t share false information. <|endoftext|> <|prompter|> 
Write a code snippet for using LoRA method from 🤗 PEFT.
 <|endoftext|> <|assistant|> Here's an example script:
```python
from transformers import AutoModelForCausalLM, LoraConfig, get_peft_model
import torch


# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained("bigscience/T5-Base-v12")
tokenizer = model.get_input_embeddings()

# Create config with desired parameters (here we use T5)
lora_config = LoraConfig(r=32, lora_alpha=0.1,

### Test infilling and code completion ability

In [211]:
prefix = """
# Here is the correct implementation of the two sum code exercise
# time complexity: O(N)
# space complexity: O(N)
def two_sum(arr, target_sum):
"""

suffix = """"""
print(get_code_completion(prefix, suffix, disable=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>
# Here is the correct implementation of the two sum code exercise
# time complexity: O(N)
# space complexity: O(N)
def two_sum(arr, target_sum):
<fim_suffix><fim_middle>    # initialize a dict to store pairs
    pair = {}
    
    for i in range (len(arr)):
        if arr[i] not in pair.keys():
            pair[target-arr[i]]=arr[i]
            
        else:
            return [pair[target - arr[i]], i+1]<|endoftext|>


In [204]:
prefix = """
from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = """

suffix = """"""
print(get_code_completion(prefix, suffix))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>
from accelerate import Accelerator

accelerator = Accelerator()

model, optimizer, training_dataloader, scheduler = <fim_suffix><fim_middle>accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler=scheduler
)


def main():
    for epoch in range(num_epochs):
        # Train!
        total_loss = 0
        for step, batch in enumerate(training_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()

            if (step + 1) % num_update_steps == 0:
                optimizer.zero_grad()

                with torch.no_grad():
                    output = model(**batch).logits

                labels = batch["labels"]
                logits = output[torch.arange(len(output)), labels]
                loss = cross_entropy_loss(logits, labels) / len(labels)

                accelerator.backward(loss)
                optimizer.step()

        # Print the loss at the end of each epoch
      

In [201]:
prefix = """from peft import LoraConfig, TaskType

peft_config = LoraConfig("""

suffix = ")"
model.set_adapter("copilot")
print(get_code_completion(prefix, suffix))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


<fim_prefix>from peft import LoraConfig, TaskType

peft_config = LoraConfig(<fim_suffix>)<fim_middle>task_type=TaskType.SEQ_2_SEQ<|endoftext|>


In [186]:
model.delete_adapter("code_buddy")