In [8]:
import datetime
import matplotlib.pyplot as plt
import deepspeed
import nltk
import numpy as np
import os
import pandas as pd
import random
try:
    import google_re2 as re
except:
    import re
import seaborn as sns
import time
import torch
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import CodeGenPreTrainedModel, CodeGenTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
from deepspeed.ops.transformer.inference import DeepSpeedTransformerInference


### CODEGEN^B MULTI

In [3]:
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-6B-multi", cache_dir="/data/kiho/autocode/codegen/codegen6B_multi_cache/")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-6B-multi", cache_dir="/data/kiho/autocode/codegen/codegen6B_multi_cache/")

Downloading:   0%|          | 0.00/240 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/14.3G [00:00<?, ?B/s]

In [12]:
# init deepspeed inference engine
world_size = int(os.getenv('WORLD_SIZE', 2))
ds_model = deepspeed.init_inference(
    model=model,      # Transformers models
    mp_size=world_size,        # Number of GPU
    dtype=torch.float32, # dtype of the weights
    replace_method="auto", # Lets DS autmatically identify the layer to replace
    replace_with_kernel_inject=True, # replace the model with the kernel injector
)
ds_model.to(model.device)
model = ds_model.module
print(f"model is loaded on device {ds_model.module.device}")

[2023-02-09 17:35:43,610] [INFO] [logging.py:69:log_dist] [Rank 0] DeepSpeed info: version=0.5.8+9aa288d7, git-hash=9aa288d7, git-branch=HEAD
[2023-02-09 17:35:43,612] [INFO] [engine.py:127:_init_quantization_setting] quantize_bits = 8 mlp_extra_grouping = False, quantize_groups = 1
[2023-02-09 17:35:44,059] [INFO] [engine.py:91:__init__] Place model to device: 0
model is loaded on device cuda:0


In [25]:
example = "requests.get"
input_ids = tokenizer(example,return_tensors="pt").input_ids.to(ds_model.module.device)
logits = ds_model.generate(input_ids, do_sample=True, top_p=0.95, max_length=30)
print(tokenizer.decode(logits[0].tolist()))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


requests.get("http://192.168.1.101:8000/api/v1/task/10/"), 404)



In [8]:
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.to(device)

CodeGenForCausalLM(
  (transformer): CodeGenModel(
    (wte): Embedding(51200, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): CodeGenBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (qkv_proj): Linear(in_features=4096, out_features=12288, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): CodeGenMLP(
          (fc_in): Linear(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear(in_features=16384, out_features=4096, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): CodeGenBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): CodeGenAttention(
          (attn_dropout): Dr

In [13]:
inputs = tokenizer("Import re and define a regular expression that matches an email address", return_tensors="pt").to(device)
sample = model.generate(**inputs, max_length=128)
print(tokenizer.decode(sample[0], truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Import re and define a regular expression that matches an email address.
#
# The regular expression must have two capturing groups: the local part and the
# domain name.  Use one of the two groups to extract the local part and the domain
# name.
#
# The domain name is a string that may contain any characters except the period
# (.).  Use the string to replace the period in the email address.
#
# The local part is a string that may contain any characters except the period
# (.), the at symbol (@), and the dot symbol (.).  Use the string to
