# Let us run sample inference by connecting to the public Petals swarm.


## Installing the dependencies


In [None]:
# Install Petals directly from its GitHub repository.
# NOTE(review): the install is unpinned, so a re-run may pick up a different upstream revision — consider pinning a tag or commit.
%pip install git+https://github.com/bigscience-workshop/petals

## To connect to the public swarm, we specify the model name here

In [None]:
import torch
from transformers import AutoTokenizer
from petals import AutoDistributedModelForCausalLM

# Checkpoint to request from the swarm; any other supported model from the 🤗 Model Hub works too
model_name = "petals-team/StableBeluga2"

# Slow tokenizer, configured not to prepend a BOS token automatically
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    add_bos_token=False,
)

# Build the distributed model and move it onto the GPU in one step
model = AutoDistributedModelForCausalLM.from_pretrained(model_name).cuda()

## We do a simple inference after connecting to the swarm

Let's try to generate something by calling __`model.generate()`__ method.

The first call to this method takes a few seconds to connect to the Petals swarm. Once we do that, you should expect generation speed of up to **5-6 tokens/sec**. If you don't have enough GPU memory to host the entire model, this is much faster than what you get with other methods, such as offloading or running the model on CPU.


In [None]:
# Tokenize the prompt and move the token ids onto the GPU
encoded = tokenizer('A cat in French is "', return_tensors="pt")
inputs = encoded["input_ids"].cuda()

# Ask for three new tokens, then print the decoded sequence
outputs = model.generate(inputs, max_new_tokens=3)
decoded_text = tokenizer.decode(outputs[0])
print(decoded_text)

## Let's make a chatbot

If you'd like to talk to the model interactively, you can use the inference session interface — it lets you print generated tokens on the fly or build a chatbot that responds to a human's inputs.

The inference session looks for a sequence of servers to run successive inference steps and store past attention caches. This way, you don't need to rerun previous tokens through the transformer to generate each phrase. If one of the servers disconnects or fails, Petals will automatically find a replacement and regenerate only a small part of the caches.

In [None]:
# Workaround so that tokenizer.decode() keeps the leading space of each streamed token:
# every new token is decoded right after this dummy token, and the dummy character
# is then stripped from the front of the decoded text.
fake_token = tokenizer("^")["input_ids"][0]

with model.inference_session(max_length=512) as sess:
    while True:
        prompt = input('Human: ')
        if prompt == "":
            break  # An empty line ends the chat
        prefix = f"Human: {prompt}\nFriendly AI:"
        prefix = tokenizer(prefix, return_tensors="pt")["input_ids"].cuda()
        print("Friendly AI:", end="", flush=True)

        while True:
            # Generate exactly one token per step, reusing the session passed in
            outputs = model.generate(prefix, max_new_tokens=1, session=sess,
                                     do_sample=True, temperature=0.9, top_p=0.6)
            outputs = tokenizer.decode([fake_token, outputs[0, -1].item()])[1:]

            # Now, let's print one new token at a time
            print(outputs, end="", flush=True)

            # A newline or the end-of-sequence marker ends the bot's turn
            if "\n" in outputs or "</s>" in outputs:
                break
            prefix = None  # Prefix is passed only for the 1st token of the bot's response

### How does it work?
The model you are running is equal to the original model, but only a part of it is loaded into your machine's GPU. Let's have a look under the hood:

In [None]:
# Bare expression: the notebook shows the model's repr as this cell's output
model

As you can see, word embeddings and some other layers are regular PyTorch modules hosted on your machine, but the rest of the model (e.g., transformer blocks) is encased in the RemoteSequential class. This is an advanced PyTorch module that runs on a distributed swarm of other machines.

Still, you can access individual layers and their outputs, as well as run forward/backward passes through them.



### Now let us run a benchmark such as MMLU to check the quality of our model

All the helper functions for the benchmarking test

In [None]:
import argparse
import json
import os
import time

import pandas as pd
import tensor_parallel as tp
import torch
from tqdm import tqdm

# Benchmark subjects evaluated by this notebook (one dev/test CSV pair per entry)
TASKS = [
    'global_facts',
    'high_school_computer_science',
    'high_school_mathematics',
    'high_school_physics',
    'machine_learning',
    'miscellaneous',
    'moral_disputes',
]

# Answer letters, in the same order as the option columns of each question
choices = list("ABCD")

def compute_metric(output_filename):
    """Print per-task and overall accuracy for a saved results file.

    Args:
        output_filename: Path to a JSON file that maps each task name to a
            dict holding a 'pred_answers' list and a 'gold_answers' list.

    Prints one "ACC-<task>" line per task, followed by an "ACC-all" line.
    A task without gold answers (or a file without any task) is reported as
    nan instead of raising ZeroDivisionError.
    """
    with open(output_filename, 'r') as f:
        run_results = json.load(f)

    def _ratio(correct, total):
        # Accuracy is undefined when there is nothing to score
        return correct / total if total else float("nan")

    total_acc = 0
    total_num = 0
    for task, result in run_results.items():
        gold_answers = result['gold_answers']
        # zip stops at the shorter list, so a missing prediction is never counted as correct
        acc = sum(1 for pred, gold in zip(result['pred_answers'], gold_answers) if pred == gold)
        print("ACC-%s: %.4f" % (task, _ratio(acc, len(gold_answers))))
        total_acc += acc
        total_num += len(gold_answers)
    print("ACC-all: %.4f" % _ratio(total_acc, total_num))


def format_subject(subject):
    """Turn an underscore-separated subject id into space-separated words.

    Every word is preceded by a space, so the result always starts with one
    (for example "machine_learning" becomes " machine learning").
    """
    return "".join(" " + word for word in subject.split("_"))

def format_example(df, idx, include_answer=True):
    """Render row ``idx`` of ``df`` as one multiple-choice question block.

    Column 0 holds the question text, the last column the answer, and the
    columns in between the options, which are labelled from ``choices``.

    Args:
        df: DataFrame holding one question per row.
        idx: Positional index of the row to render.
        include_answer: When True, append the answer and a blank line after
            "Answer:"; when False, stop right after "Answer:".

    Returns:
        The formatted prompt fragment as a string.
    """
    n_options = df.shape[1] - 2
    option_lines = "".join(
        "\n{}. {}".format(choices[col], df.iloc[idx, col + 1])
        for col in range(n_options)
    )
    text = df.iloc[idx, 0] + option_lines + "\nAnswer:"
    if not include_answer:
        return text
    return text + " {}\n\n".format(df.iloc[idx, n_options + 1])

def gen_prompt(train_df, subject, k=-1):
    """Build the few-shot header: an intro sentence plus solved examples.

    Args:
        train_df: DataFrame of solved example questions.
        subject: Underscore-separated subject id named in the intro sentence.
        k: Number of leading rows of ``train_df`` to include; -1 means all rows.

    Returns:
        The intro sentence followed by the formatted examples.
    """
    n_shots = train_df.shape[0] if k == -1 else k
    header = "The following are multiple choice questions (with answers) about {}.\n\n".format(format_subject(subject))
    examples = [format_example(train_df, row) for row in range(n_shots)]
    return header + "".join(examples)


# def custom_stopping_criteria(input_ids, score, **kwargs):
#     stop_ids = [29871, 13, 13] # \n\n
#     return input_ids[-len(stop_ids)]


def load(model_type):
    """Load the distributed model and its tokenizer for the benchmark.

    Args:
        model_type: Model family to load. Only 'llama' is supported.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to the GPU
        and switched to eval mode.

    Raises:
        ValueError: If ``model_type`` is not 'llama'.
    """
    if model_type != 'llama':
        # Fail early with a clear message instead of an UnboundLocalError further down
        raise ValueError("Unsupported model_type: %r (only 'llama' is supported)" % (model_type,))

    # Never hardcode access tokens in a notebook: read the token from the environment.
    # NOTE(review): a token was previously committed here in plain text and should be revoked.
    hf_token = os.environ.get("HF_TOKEN")
    # Without a token, omit the argument so the library's default authentication applies
    auth_kwargs = {"use_auth_token": hf_token} if hf_token else {}

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, add_bos_token=False, **auth_kwargs)

    model = AutoDistributedModelForCausalLM.from_pretrained(model_name, **auth_kwargs)
    model = model.cuda()

    # Use id 0 for padding when the tokenizer defines no pad token; force the BOS id to 1
    tokenizer.pad_token_id = 0 if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    tokenizer.bos_token_id = 1

    model.eval()

    return model, tokenizer



def serial_infer(model, tokenizer, prompts):
    """Generate one new token per prompt, one prompt at a time.

    Args:
        model: Model object exposing ``generate``.
        tokenizer: Tokenizer used to encode each prompt and decode the output.
        prompts: Iterable of prompt strings.

    Returns:
        A list holding, for each prompt, the last character of the decoded
        output (prompt plus the single generated token).
    """
    def _predict(text):
        # Encode the prompt, generate a single token, keep the final decoded character
        token_ids = tokenizer(text, return_tensors="pt")["input_ids"].to('cuda')
        generated = model.generate(token_ids, max_new_tokens=1, pad_token_id=tokenizer.pad_token_id, attention_mask=None)
        decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
        return decoded[-1]

    return [_predict(text) for text in tqdm(prompts)]


Main loop that runs the benchmark test


In [None]:
# Benchmark configuration
data_dir= 'Cloud_Computing_Project/data'
param_size='70B'
model_type='llama'
run_results = {}
output_filename = 'run_results_%s_%sb.json' % (model_type, param_size)
model, tokenizer = load(model_type)
start_time = time.time()

k = 5  # number of solved dev examples placed in front of every test question

for task in TASKS:
    print('Testing %s ...' % task)
    records = []
    dev_df = pd.read_csv(os.path.join(data_dir, "dev", task + "_dev.csv"), header=None)[:k]
    test_df = pd.read_csv(os.path.join(data_dir, "test", task + "_test.csv"), header=None)

    # The few-shot header is identical for every question of a task, so build it once
    train_prompt = gen_prompt(dev_df, task, k)

    for i in range(test_df.shape[0]):
        # get prompt and make sure it fits
        prompt_end = format_example(test_df, i, include_answer=False)
        prompt = train_prompt + prompt_end
        while len(tokenizer.tokenize(prompt)) + 1 > 2048:  # bos token
            # Too long: drop the first "\n\n"-separated chunk after the intro sentence and retry
            prompt_split = prompt.split("\n\n")
            prompt_split.pop(1)
            prompt = '\n\n'.join(prompt_split)

        label = test_df.iloc[i, test_df.shape[1] - 1]
        records.append({'prompt': prompt, 'answer': label})

    # Run inference once per task, after all prompts are built. Doing this inside
    # the loop above would re-generate every earlier prompt for each new question.
    pred_answers = serial_infer(model, tokenizer, [record['prompt'] for record in records])
    gold_answers = [record['answer'] for record in records]
    run_results[task] = {'pred_answers': pred_answers, 'gold_answers': gold_answers}

## Results can be seen here

In [None]:
# Persist the collected answers as JSON, then score them from the saved file
with open(output_filename, 'w') as f:
    json.dump(run_results, f, indent=2, ensure_ascii=False)

compute_metric(output_filename)

# Report the wall-clock time elapsed since start_time was recorded
end_time = time.time()
elapsed_seconds = end_time - start_time
print("total run time {:.2f}".format(elapsed_seconds))