In [1]:
import deepspeed
import torch
import pandas as pd
import numpy as np
import os
import sys
import re
import transformers
transformers.logging.set_verbosity_error()
import warnings
warnings.filterwarnings('ignore')

from deepspeed.ops.transformer.inference import DeepSpeedTransformerInference
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from time import perf_counter

In [2]:
# check deepspeed installation
report = !python3 -m deepspeed.env_report
r = re.compile('.*ninja.*OKAY.*')
assert any(r.match(line) for line in report) == True, "DeepSpeed Inference not correct installed"

# check cuda and torch version
torch_version, cuda_version = torch.__version__.split("+")
torch_version = ".".join(torch_version.split(".")[:2])
cuda_version = f"{cuda_version[2:4]}.{cuda_version[4:]}"
r = re.compile(f'.*torch.*{torch_version}.*')
assert any(r.match(line) for line in report) == True, "Wrong Torch version"
r = re.compile(f'.*cuda.*{cuda_version}.*')
assert any(r.match(line) for line in report) == True, "Wrong Cuda version"

In [3]:
# Directory of the fine-tuned checkpoint; both the weights and the tokenizer
# are loaded from this same location.
checkpoint_dir = '/data/kiho/autocode/GPTJ/Finetune_GPTNEO_GPTJ6B/finetuning_repo/finetuned/checkpoint-6/'

model = AutoModelForCausalLM.from_pretrained(checkpoint_dir)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

# The explicit move to CUDA is left disabled in this cell:
# device = torch.device('cuda')
# model.to(device)
loaded_device_type = model.device.type
print(f'model is loaded on device {loaded_device_type}')

model is loaded on device cpu


In [4]:
def measure_latency(model, tokenizer, payload, generation_args=None, device=None,
                    warmup_runs=2, timed_runs=10):
    """Time repeated ``model.generate`` calls on one prompt and summarise the latency.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Callable such that
            ``tokenizer(payload, return_tensors="pt").input_ids`` has a
            ``.to(device)`` method.
        payload: Prompt text; it is tokenized once and reused for every call.
        generation_args: Optional mapping of keyword arguments forwarded to
            ``generate``. ``None`` (default) forwards no extra arguments.
        device: Device the input ids are moved to. ``None`` (default) resolves
            it at call time from ``model.device``, or from
            ``model.module.device`` when ``model`` has no ``device`` attribute
            of its own.
        warmup_runs: Number of untimed ``generate`` calls made first (default 2).
        timed_runs: Number of timed ``generate`` calls (default 10); must be >= 1.

    Returns:
        A ``(summary, p95_ms)`` tuple: ``summary`` is a human-readable string
        with the 95th-percentile latency and the mean +\\- standard deviation,
        all in milliseconds; ``p95_ms`` is the 95th-percentile latency alone.

    Raises:
        ValueError: If ``timed_runs`` is smaller than 1.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")
    # Avoid a shared mutable default; None simply means "no extra arguments".
    if generation_args is None:
        generation_args = {}
    # Resolve the device when the function is called, not when it is defined,
    # so the default follows the model being measured rather than whatever
    # global `model` existed at definition time.
    if device is None:
        device_owner = model if hasattr(model, "device") else getattr(model, "module", model)
        device = device_owner.device

    input_ids = tokenizer(payload, return_tensors="pt").input_ids.to(device)

    # warm up (results discarded, not timed)
    for _ in range(warmup_runs):
        model.generate(input_ids, **generation_args)

    # Timed run
    latencies = []
    for _ in range(timed_runs):
        start_time = perf_counter()
        model.generate(input_ids, **generation_args)
        latencies.append(perf_counter() - start_time)

    # Compute run statistics (perf_counter measures seconds; report milliseconds)
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    time_p95_ms = 1000 * np.percentile(latencies, 95)
    # "\\-" emits the same backslash-dash as before without relying on an
    # invalid escape sequence.
    return f"latency (ms) - {time_p95_ms}; Average latency (ms) - {time_avg_ms:.2f} +\\- {time_std_ms:.2f};", time_p95_ms

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/data/kiho/autocode/GPTJ/Finetune_GPTNEO_GPTJ6B/finetuning_repo/finetuned/checkpoint-6/')
# tokenizer = AutoTokenizer.from_pretrained('/data/kiho/autocode/GPTJ/Finetune_GPTNEO_GPTJ6B/finetuning_repo/finetuned/checkpoint-6/')
# # device = torch.device('cuda:3')
# # model.to(device)
# # print(f'model is loaded on device {model.device.type}')

##### Normal text inference

In [None]:
# payload = "Hello my name is Philipp. I am getting in touch with you because i didn't get a response from you. What do I need to do to get my new card which I have requested 2 weeks ago? Please help me and answer this email in the next 7 days. Best regards and have a nice weekend but it"

# input_ids = tokenizer(payload,return_tensors="pt").input_ids.to(model.device)
# print(f"input payload: \n \n{payload}")
# logits = model.generate(input_ids, do_sample=True, num_beams=1, min_length=128, max_new_tokens=128)

# print(f"prediction: \n \n {tokenizer.decode(logits[0].tolist()[len(input_ids[0]):])}")

In [None]:
# payload="Hello my name is Philipp. I am getting in touch with you because i didn't get a response from you. What do I need to do to get my new card which I have requested 2 weeks ago? Please help me and answer this email in the next 7 days. Best regards and have a nice weekend but it"*2
# print(f'Payload sequence length is: {len(tokenizer(payload)["input_ids"])}')

# # generation arguments
# generation_args = dict(
#   do_sample=False,
#   num_beams=1,
#   min_length=128,
#   max_new_tokens=128
# )
# vanilla_results = measure_latency(model,tokenizer,payload,generation_args)

# print(f"Vanilla model: {vanilla_results[0]}")

##### DeepSpeed

In [5]:
# Build the DeepSpeed inference engine around the already-loaded model.
ds_inference_options = {
    "mp_size": 1,                        # Number of GPU
    "dtype": torch.float16,              # dtype of the weights (fp16)
    "replace_method": "auto",            # lets DeepSpeed automatically identify the layers to replace
    "replace_with_kernel_inject": True,  # replace the model with the kernel injector
}
ds_model = deepspeed.init_inference(model=model, **ds_inference_options)
# ds_model.to(model.device)
engine_device = ds_model.module.device
print(f"model is loaded on device {engine_device}")

[2023-02-14 11:22:12,847] [INFO] [logging.py:68:log_dist] [Rank -1] DeepSpeed info: version=0.7.0, git-hash=unknown, git-branch=unknown
[2023-02-14 11:22:12,848] [INFO] [logging.py:68:log_dist] [Rank -1] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
Installed CUDA version 11.0 does not match the version torch was compiled with 11.3 but since the APIs are compatible, accepting this combination
Using /home/kiho/.cache/torch_extensions/py39_cu113 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/kiho/.cache/torch_extensions/py39_cu113/transformer_inference/build.ninja...
Building extension module transformer_inference...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module transformer_inference...
Time to load transformer_inference op: 2.8810627460479736 seconds
[2023-02-14 11:22:20,959] [INFO] [logging.py:68:log_

In [6]:
# Guard: the first transformer block of the wrapped model must be a
# DeepSpeedTransformerInference instance before any generation is attempted.
first_block = ds_model.module.transformer.h[0]
assert isinstance(first_block, DeepSpeedTransformerInference), "Module not sucessfully initialized"

In [9]:
# Sample a continuation of an HTTP request line with nucleus sampling.
example = "GET /twitter/login HTTP/1.1"
encoded_example = tokenizer(example, return_tensors="pt")
input_ids = encoded_example.input_ids.to(ds_model.module.device)
sampling_options = {"do_sample": True, "top_p": 0.95, "max_length": 100}
# NOTE: despite its name, `logits` is handed straight to tokenizer.decode
# below, i.e. it is treated as generated token ids.
logits = ds_model.generate(input_ids, **sampling_options)
generated_ids = logits[0].tolist()
print(tokenizer.decode(generated_ids))

GET /twitter/login HTTP/1.1
Host: api.twitter.com
User-Agent: Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm+ Bing/2.0)
Accept: */*
Proxy-Connection: Keep-Alive
X-UA-Compatible: IE=Edge,chrome=1
Cookie: a=s%3D%253


In [8]:
# Sample a continuation of an authorization URL with nucleus sampling.
example = "https://auth-server.com/auth?"
encoded_example = tokenizer(example, return_tensors="pt")
input_ids = encoded_example.input_ids.to(ds_model.module.device)
sampling_options = {"do_sample": True, "top_p": 0.95, "max_length": 100}
# NOTE: despite its name, `logits` is handed straight to tokenizer.decode
# below, i.e. it is treated as generated token ids.
logits = ds_model.generate(input_ids, **sampling_options)
generated_ids = logits[0].tolist()
print(tokenizer.decode(generated_ids))

https://auth-server.com/auth?state=9&code=123-456-789-0123-4567-89012-3456789-0123456789&grant_type=authorization_code&redirect_uri=http://redirect.redirect-uri.com.

With this setup and all in order we should be able to log in with:

http://www.example.com/auth/authorize


In [15]:
# Prompt used for the latency benchmark.
payload = "# Import re and define a regular expression that matches an email address"
payload_token_count = len(tokenizer(payload)["input_ids"])
print(f'Payload sequence length is: {payload_token_count}')

# generation arguments: greedy decoding (no sampling, single beam)
generation_args = {
    "do_sample": False,
    "num_beams": 1,
    "min_length": 30,
    "max_new_tokens": 128,
}
# The device is passed explicitly, taken from the module wrapped by the engine.
ds_results = measure_latency(ds_model, tokenizer, payload, generation_args, ds_model.module.device)

latency_summary = ds_results[0]
print(f"DeepSpeed model: {latency_summary}")
# Payload sequence length is: 128
# DeepSpeed model: P95 latency (ms) - 6577.044982599967; Average latency (ms) - 6569.11 +\- 6.57;


Payload sequence length is: 13
DeepSpeed model: latency (ms) - 3316.9458935037255; Average latency (ms) - 3308.97 +\- 7.42;
