In [1]:
from tqdm import tqdm

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "Salesforce/codegen-2B-multi"
device = "mps"
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto", offload_folder="offload")



In [3]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
prompt = """
Below is a program with High Level Synthesis pragma inserted:\n

#pragma ACCEL kernel

void kernel_symm(double alpha,double beta,double C[60][80],double A[60][60],double B[60][80])
{
  int i;
  int j;
  int k;
//BLAS PARAMS
//SIDE = 'L'
//UPLO = 'L'
// =>  Form  C := alpha*A*B + beta*C
// A is MxM
// B is MxN
// C is MxN
//note that due to Fortran array layout, the code below more closely resembles upper triangular case in BLAS
  
#pragma ACCEL PIPELINE auto{__PIPE__L0}
  
#pragma ACCEL TILE FACTOR=auto{__TILE__L0}
  
#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0}
  for (i = 0; i < 60; i++) {
    
#pragma ACCEL PIPELINE auto{__PIPE__L1}
    
#pragma ACCEL TILE FACTOR=auto{__TILE__L1}
    
#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L1}
    for (j = 0; j < 80; j++) {
      double temp2 = ((double )0);
      
#pragma ACCEL PARALLEL reduction=temp2 FACTOR=auto{__PARA__L2}
      for (k = 0; k < 60; k++) {
        if (k < i) {
          C[k][j] += alpha * B[i][j] * A[i][k];
          temp2 += B[k][j] * A[i][k];
        }
      }
      C[i][j] = beta * C[i][j] + alpha * B[i][j] * A[i][i] + alpha * temp2;
    }
  }
}
Based on the example above, act as an expert in High Level Synthesis and apply High Level Synthesis(HLS) pragmas to the following program. Reason for your choice of HLS pragmas:\n
"""
programs = [
    './data/processed-2mm_kernel.c',
    './data/processed-bicg_kernel.c',
    './data/processed-covariance_kernel.c',
    './data/processed-gemm-blocked_kernel.c',
    './data/processed-gemm-p_kernel.c',
]
context = {}
for program in programs:
    with open(program, 'r') as f:
        lines = f.readlines()
        context[program] = prompt+'\n'.join(lines)


# Zero shot

In [5]:
lens = range(400,600,100)
for len in lens:
    for program in tqdm(context):
        text = context[program]
        inputs = tokenizer(text, return_tensors="pt")
        input_ids=inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        completion = model.generate(input_ids=input_ids, max_length=len, attention_mask=attention_mask)
        output = tokenizer.decode(completion[0])
        with open('./outputs/' + program[program.find('processed'):program.find('.c')]+str(len)+'.out','w') as f:
            f.write(output)
    

python(5107) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 1/10 [00:20<03:06, 20.77s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 2/10 [00:38<02:32, 19.01s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 3/10 [00:51<01:55, 16.44s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 4/10 [01:04<01:29, 14.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 5/10 [01:21<01:17, 15.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 6/10 [02:29<02:13, 33.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 70%|███████   | 7/10 [02:46<01:24, 28.18s/it]Setting `pad_token_id` to `eos_token_id

# One shot

In [14]:
lens = range(900,1000,100)
for len in lens:
    for program in tqdm(context):
        text = context[program]
        inputs = tokenizer(text, return_tensors="pt")
        input_ids=inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        completion = model.generate(input_ids=input_ids, max_length=len, attention_mask=attention_mask)
        output = tokenizer.decode(completion[0])
        with open('./outputs/oneshot1/' + program[program.find('processed'):program.find('.c')]+str(len)+'.out','w') as f:
            f.write(output)

  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 1/5 [2:08:41<8:34:47, 7721.97s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 2/5 [5:43:05<8:57:19, 10746.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 3/5 [5:45:05<3:16:27, 5893.98s/it] Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 3/5 [8:44:52<5:49:54, 10497.39s/it]


KeyboardInterrupt: 

In [32]:
!PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.7

python(5007) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
