In [None]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
checkpoint = "Salesforce/codegen-2B-multi"
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True, max_length=2048)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
device = 'cuda'

In [None]:
!unzip data.zip

Archive:  data.zip
   creating: content/data/
  inflating: content/data/trmm-krnl.cpp  
   creating: content/data/outputs/
   creating: content/data/outputs/fewshots/
   creating: content/data/outputs/.ipynb_checkpoints/
  inflating: content/data/madd-krnl.cpp  
  inflating: content/data/processed-vadd-krnl.cpp  
  inflating: content/data/processed-syrk-krnl.cpp  
  inflating: content/data/ewmm-krnl.cpp  
  inflating: content/data/processed-madd-krnl.cpp  
  inflating: content/data/processed-jacobi_1d-krnl.cpp  
  inflating: content/data/jacobi_1d-krnl.cpp  
   creating: content/data/.ipynb_checkpoints/
  inflating: content/data/syrk-krnl.cpp  
  inflating: content/data/processed-ewmm-krnl.cpp  
  inflating: content/data/processed-dotprod-krnl.cpp  
  inflating: content/data/trmm-opt-krnl.cpp  
  inflating: content/data/vadd-krnl.cpp  
  inflating: content/data/processed-trmm-opt-krnl.cpp  
  inflating: content/data/mm-krnl.cpp  
   creating: content/data/fewshots_examples/
  inflating

In [None]:
%cd /content/data

/content/data


In [None]:
!python ../process_data.py

# Few shot prompting with persona

In [None]:
fewshots_header = """
Consider the following input output pairs where input is a program and output is the program with High Level Synthesis pragmas inserted\n

Input:

void kernel_fdtd_2d(int tmax,int nx,int ny,double ex[60][80],double ey[60][80],double hz[60][80],double _fict_[40])
{
  int t;
  int i;
  int j;
  for (t = 0; t < 40; t++) {
    for (j = 0; j < 80; j++) {
      ey[0][j] = _fict_[t];
    }
    for (i = 1; i < 60; i++) {
      for (j = 0; j < 80; j++) {
        ey[i][j] = ey[i][j] - 0.5 * (hz[i][j] - hz[i - 1][j]);
      }
    }

    for (i = 0; i < 60; i++) {

      for (j = 1; j < 80; j++) {
        ex[i][j] = ex[i][j] - 0.5 * (hz[i][j] - hz[i][j - 1]);
      }
    }
    for (i = 0; i < 59; i++) {
      for (j = 0; j < 79; j++) {
        hz[i][j] = hz[i][j] - 0.7 * (ex[i][j + 1] - ex[i][j] + ey[i + 1][j] - ey[i][j]);
      }
    }
  }
}
Output:
#pragma ACCEL kernel

void kernel_fdtd_2d(int tmax,int nx,int ny,double ex[60][80],double ey[60][80],double hz[60][80],double _fict_[40])
{
  int t;
  int i;
  int j;
//#pragma scop

#pragma ACCEL PIPELINE auto{__PIPE__L0}

#pragma ACCEL TILE FACTOR=auto{__TILE__L0}

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0}
  for (t = 0; t < 40; t++) {

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_0}
    for (j = 0; j < 80; j++) {
      ey[0][j] = _fict_[t];
    }

#pragma ACCEL PIPELINE auto{__PIPE__L0_1}

#pragma ACCEL TILE FACTOR=auto{__TILE__L0_1}

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_1}
    for (i = 1; i < 60; i++) {

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_1_0}
      for (j = 0; j < 80; j++) {
        ey[i][j] = ey[i][j] - 0.5 * (hz[i][j] - hz[i - 1][j]);
      }
    }

#pragma ACCEL PIPELINE auto{__PIPE__L0_2}

#pragma ACCEL TILE FACTOR=auto{__TILE__L0_2}

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_2}
    for (i = 0; i < 60; i++) {

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_2_0}
      for (j = 1; j < 80; j++) {
        ex[i][j] = ex[i][j] - 0.5 * (hz[i][j] - hz[i][j - 1]);
      }
    }

#pragma ACCEL PIPELINE auto{__PIPE__L0_3}

#pragma ACCEL TILE FACTOR=auto{__TILE__L0_3}

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_3}
    for (i = 0; i < 59; i++) {

#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0_3_0}
      for (j = 0; j < 79; j++) {
        hz[i][j] = hz[i][j] - 0.7 * (ex[i][j + 1] - ex[i][j] + ey[i + 1][j] - ey[i][j]);
      }
    }
  }
//#pragma endscop
}


"""

In [None]:
prompt = fewshots_header + """
Act as an expert in High Level Synthesis, insert High Level Synthesis pragma to the folloing program. Reason your choice of High Level Synthesis pragma in comment.
"""

In [None]:
print(prompt)


Consider the following input output pairs where input is a program and output is the program with High Level Synthesis pragmas inserted


Input:

void kernel_fdtd_2d(int tmax,int nx,int ny,double ex[60][80],double ey[60][80],double hz[60][80],double _fict_[40])
{
  int t;
  int i;
  int j;
  for (t = 0; t < 40; t++) {
    for (j = 0; j < 80; j++) {
      ey[0][j] = _fict_[t];
    }
    for (i = 1; i < 60; i++) {
      for (j = 0; j < 80; j++) {
        ey[i][j] = ey[i][j] - 0.5 * (hz[i][j] - hz[i - 1][j]);
      }
    }
    
    for (i = 0; i < 60; i++) {
      
      for (j = 1; j < 80; j++) {
        ex[i][j] = ex[i][j] - 0.5 * (hz[i][j] - hz[i][j - 1]);
      }
    }
    for (i = 0; i < 59; i++) {
      for (j = 0; j < 79; j++) {
        hz[i][j] = hz[i][j] - 0.7 * (ex[i][j + 1] - ex[i][j] + ey[i + 1][j] - ey[i][j]);
      }
    }
  }
}
Output:
#pragma ACCEL kernel

void kernel_fdtd_2d(int tmax,int nx,int ny,double ex[60][80],double ey[60][80],double hz[60][80],double _fict_[40])
{

In [None]:
programs = [
        'processed-mm-krnl.cpp',
        'processed-dotprod-krnl2.cpp',
        'processed-dotprod-krnl.cpp',
        'processed-ewmm-krnl.cpp',
        'processed-jacobi_1d-krnl.cpp',
        'processed-madd-krnl.cpp',
        'processed-syrk-krnl.cpp',
        'processed-trmm-krnl.cpp',
        'processed-trmm-opt-krnl.cpp',
        'processed-vadd-krnl.cpp'
]
context = {}
for program in programs:
  with open(program, 'r') as f:
    lines = f.readlines()
    context[program] = prompt+'Input:'+'\n'.join(lines) + "\nOutput:\n"

In [None]:
from tqdm import tqdm

In [None]:
model = model.to(device)

In [None]:
lens = [2000]
for len in lens:
    for program in tqdm(context):
        text = context[program]
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        attention_mask = inputs["attention_mask"].to(device)
        input_ids=inputs["input_ids"].to(device)
        completion = model.generate(input_ids=input_ids, max_length=len, attention_mask=attention_mask)
        output = tokenizer.decode(completion[0])
        output_idx = output.rfind("Output:\n")
        with open('./outputs/fewshots/' + program[program.find('processed'):program.find('.cpp')]+str(len)+'.out','w') as f:
            f.write(output[output_idx:])


  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 1/10 [00:10<01:35, 10.63s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 2/10 [00:20<01:22, 10.36s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 3/10 [00:25<00:55,  7.88s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 4/10 [00:31<00:42,  7.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 5/10 [00:39<00:37,  7.42s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 6/10 [00:45<00:27,  6.85s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 70%|███████   | 7/10 [00:59<00:27,  9.19s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|████████  | 8/10 [01:07<00:17,  8.99s/it]Setting `pad_token

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!zip -r /content/data2.zip /content/data/


  adding: content/data/ (stored 0%)
  adding: content/data/processed-mm-krnl.cpp (deflated 44%)
  adding: content/data/trmm-krnl.cpp (deflated 73%)
  adding: content/data/processed-atax-medium_kernel.c (deflated 54%)
  adding: content/data/outputs/ (stored 0%)
  adding: content/data/outputs/fewshots/ (stored 0%)
  adding: content/data/outputs/fewshots/processed-vadd-krnl2000.out (deflated 23%)
  adding: content/data/outputs/fewshots/processed-trmm-krnl2000.out (deflated 43%)
  adding: content/data/outputs/fewshots/processed-syrk-krnl2000.out (deflated 52%)
  adding: content/data/outputs/fewshots/processed-dotprod-krnl2000.out (deflated 26%)
  adding: content/data/outputs/fewshots/processed-ewmm-krnl2000.out (deflated 34%)
  adding: content/data/outputs/fewshots/processed-trmm-opt-krnl2000.out (deflated 51%)
  adding: content/data/outputs/fewshots/processed-mm-krnl2000.out (deflated 42%)
  adding: content/data/outputs/fewshots/processed-madd-krnl2000.out (deflated 34%)
  adding: content

# Soft Prompting (APE)

In [None]:
input_programs = ['bicg-large_kernel.c', 'atax_kernel.c','atax-medium_kernel.c', '3mm_kernel.c', 'adi_kernel.c', 'doitgen_kernel.c', 'gemver_kernel.c',
                  'bicg-medium_kernel.c','correlation_kernel.c', 'fdtd-2d-large_kernel.c', 'fdtd-2d_kernel.c']

In [None]:
input = []
output = []
for i in range(len(input_programs)):
  program_name = input_programs[i]
  with open(f'fewshots_examples/{program_name}', 'r') as f:
    lines = f.readlines()
  input.append(''.join(lines))
  lines = [line for line in lines if '#pragma' not in line]
  output.append(''.join(lines))

In [None]:
input[0]

'#pragma ACCEL kernel\n\nvoid kernel_3mm(int ni,int nj,int nk,int nl,int nm,double E[40][50],double A[40][60],double B[60][50],double F[50][70],double C[50][80],double D[80][70],double G[40][70])\n{\n  int i;\n  int j;\n  int k;\n//#pragma scop\n/* E := A*B */\n  \n#pragma ACCEL PIPELINE auto{__PIPE__L0}\n  \n#pragma ACCEL TILE FACTOR=auto{__TILE__L0}\n  \n#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L0}\n  for (i = 0; i < 40; i++) {\n    \n#pragma ACCEL PIPELINE auto{__PIPE__L3}\n    \n#pragma ACCEL TILE FACTOR=auto{__TILE__L3}\n    \n#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L3}\n    for (j = 0; j < 50; j++) {\n      E[i][j] = 0.0;\n      \n#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L6}\n      for (k = 0; k < 60; ++k) {\n        E[i][j] += A[i][k] * B[k][j];\n      }\n    }\n  }\n/* F := C*D */\n  \n#pragma ACCEL PIPELINE auto{__PIPE__L1}\n  \n#pragma ACCEL TILE FACTOR=auto{__TILE__L1}\n  \n#pragma ACCEL PARALLEL FACTOR=auto{__PARA__L1}\n  for (i = 0; i < 50; i++) {\n    \n#pragma ACC

In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.14.1
    Uninstalling openai-1.14.1:
      Successfully uninstalled openai-1.14.1
Successfully installed openai-0.28.0


In [None]:
import openai
openai.api_key = ''

In [None]:
eval_template = \
"""Instruction: [PROMPT]
Input: [INPUT]
Output: [OUTPUT]"""

In [None]:
prompt_gen_template = "I gave an instruction to a High Level Synthesis expert. Based on the instruction they produced the following input-output pairs:\n\n[full_DEMO]\n\nThe instruction was to [APE]"

In [None]:
from automatic_prompt_engineer import ape

result, demo_fn = ape.simple_ape(
    dataset=(input, output),
    eval_template=eval_template,
    eval_model='davinci-002',
    prompt_gen_model='davinci-002',
    num_prompts=15,
    eval_batch_size=500,
    eval_rounds=20,
    prompt_gen_template = prompt_gen_template
)

Generating prompts...
[GPT_forward] Generating 10 completions, split into 1 batches of size 2000


100%|██████████| 1/1 [00:01<00:00,  1.16s/it]


Model returned 10 prompts. Deduplicating...
Deduplicated to 10 prompts.
Evaluating prompts...


Evaluating prompts:  90%|█████████ | 18/20 [00:30<00:03,  1.72s/it]

Rate limit reached for organization org-q5MbHIrUtti7qAqtgnkbZCH5 on tokens per min (TPM): Limit 250000, Used 114467, Requested 153360. Please try again in 4.278s. Visit https://platform.openai.com/account/rate-limits to learn more.
Retrying...


Evaluating prompts: 100%|██████████| 20/20 [00:39<00:00,  1.96s/it]

Finished evaluating.





In [None]:
print(result.prompts[0])

 produce accelerated kernels. The accelerator instruction causes the kernel to be run using special acceleration techniques and to produce the parallel reduction information. The information is stored in a reduction list. The input-output pairs are generated by the compiler's automatic parallelisation and scheduling.




In [None]:
prompt = fewshots_header + "/n" + result.prompts[0]

In [None]:
programs = [
        'processed-mm-krnl.cpp',
        'processed-dotprod-krnl2.cpp',
        'processed-dotprod-krnl.cpp',
        'processed-ewmm-krnl.cpp',
        'processed-jacobi_1d-krnl.cpp',
        'processed-madd-krnl.cpp',
        'processed-syrk-krnl.cpp',
        'processed-trmm-krnl.cpp',
        'processed-trmm-opt-krnl.cpp',
        'processed-vadd-krnl.cpp'
]
context = {}
for program in programs:
  with open(program, 'r') as f:
    lines = f.readlines()
    context[program] = prompt+'Input:'+'\n'.join(lines) + "\nOutput:\n"

In [None]:
lens = [2000]
for len in lens:
    for program in tqdm(context):
        text = context[program]
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        attention_mask = inputs["attention_mask"].to(device)
        input_ids=inputs["input_ids"].to(device)
        completion = model.generate(input_ids=input_ids, max_length=len, attention_mask=attention_mask)
        output = tokenizer.decode(completion[0])
        output_idx = output.rfind("Output:\n")
        with open('./outputs/ape/' + program[program.find('processed'):program.find('.cpp')]+str(len)+'.out','w') as f:
            f.write(output[output_idx:])

  0%|          | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 10%|█         | 1/10 [00:09<01:27,  9.68s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 20%|██        | 2/10 [00:20<01:21, 10.13s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 30%|███       | 3/10 [00:25<00:54,  7.81s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 40%|████      | 4/10 [00:31<00:42,  7.14s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 50%|█████     | 5/10 [00:40<00:38,  7.75s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 60%|██████    | 6/10 [00:46<00:28,  7.17s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 70%|███████   | 7/10 [01:08<00:36, 12.22s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 80%|████████  | 8/10 [01:17<00:22, 11.11s/it]Setting `pad_token

# Finetuning

In [1]:
!pip install boto3


Collecting boto3
  Downloading boto3-1.34.64-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.64 (from boto3)
  Downloading botocore-1.34.64-py3-none-any.whl (12.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.1-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.2/82.2 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.64 botocore-1.34.64 jmespath-1.0.1 s3transfer-0.10.1


In [2]:
import boto3

BUCKET_NAME = 'cs259project'
s3 = boto3.resource('s3', aws_access_key_id = '',
                          aws_secret_access_key= '')

In [3]:
s3.Bucket(BUCKET_NAME).download_file('rkirby-nemo-aligner.hls-rm-001.tar.gz', 'rkirby-nemo-aligner.hls-rm-001.tar.gz')

In [4]:
!pip install udocker
!udocker --allow-root install

Collecting udocker
  Downloading udocker-1.3.13-py2.py3-none-any.whl (118 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/118.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/118.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.6/118.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: udocker
Successfully installed udocker-1.3.13
Info: creating repo: /root/.udocker
Info: udocker command line interface 1.3.13
Info: searching for udockertools >= 1.2.11
Info: installing udockertools 1.2.11
Info: installation of udockertools successful


In [None]:
!udocker --allow-root run hello-world



Info: keyboard interrupt


In [5]:
!udocker --allow-root load -i rkirby-nemo-aligner.hls-rm-001.tar.gz

Info: adding layer: d7f57ad09d27fe4b60916f041ad1a3771b71a09168ad930aaf9f0432dcdfa546
Info: adding layer: aab3aa7fa37bf9f7434d7c491b34ec18920b6ea294d9ce9640f40a3162daa21d
Info: adding layer: 522d9e0aa06cfdc0f8a246159be46d0cee3174a33a145c9696084dd3b8c28cec
Info: adding layer: 82b7e323093fdf3ec4a1395cbac330e94fe39386a1d86d642f7e42e95fb31121
Info: adding layer: c7b4a070076adcd37051dd6255c5191f14c1b53c136db71c7b7e833949198aa1
Info: adding layer: d2b6758f8cae5523342b1bcc2452f7b60f69931641fede58f62e077edbd8fee9
Info: adding layer: 73696fdda370e396ce3def1af7b5966a2e13cd02486dc7b7b4ecdbbba7643486
Info: adding layer: 8348759e8f23ca47dcdc065b5e535778c9a3a8230e0d8ce9499d8b5109ca3c8c
Info: adding layer: 5f130bb0a04adb1bee458e3b52bde8de2512280044e9b473436e8a090691ca20
Info: adding layer: 9aecbd3dece1ae0428ea99796166df51fcb27a2d979c78f473441398360c653b
Info: adding layer: 557572fd8f5ec2b6dcf18e726ba410bbfe67b4f1728846f58abc73a3148fcc28
Info: adding layer: 9960c45766b6d978f94f410cc770a9421f5bb9c2aa0a9

In [6]:
!udocker --allow-root images

REPOSITORY
nvcr.io/nvidian/rkirby-nemo-aligner:hls-rm    .


In [7]:
!udocker --allow-root run nvcr.io/nvidian/rkirby-nemo-aligner:hls-rm

Error: create container: getting layers or json
Error: manifest not found or not authorized
Error: no files downloaded
Error: create container: getting layers or json
Error: image or container not available


In [None]:
!udocker --allow-root setup  --nvidia nvcr.io/nvidian/rkirby-nemo-aligner

Error: invalid container id
