# Install CLgen 0.4.1

**We recommend creating a new virtualenv in which to install CLgen.**

## step 1. install clang
```
sudo apt update
sudo apt install clang
```

## step 2. build clgen
```
# if you have an NVIDIA GPU with CUDA 8.0 and cuDNN:
(clgen)$ curl -s https://raw.githubusercontent.com/ChrisCummins/clgen/0.4.1/install-cuda.sh | bash
# CPU only:
(clgen)$ curl -s https://raw.githubusercontent.com/ChrisCummins/clgen/0.4.1/install-cpu.sh | bash
```

## step 3. install required libraries and install clgen
```
# install required libraries
pip install -r requirement.txt
# install clgen
python setup.py install
```

# Source Rewriter

In [1]:
# Sample OpenCL kernel used as the running example by the cells that follow.
# Its first line is a commented-out `#define`, and the kernel, parameter and
# local names are still the author's original identifiers.
code = """//#define Elements
__kernel void memset_kernel(__global char * mem_d, short val, int number_bytes){
    const int thread_id = get_global_id(0);
    mem_d[thread_id] = val;
}"""
# print() renders the source with real line breaks instead of the escaped repr.
print(code)

//#define Elements
__kernel void memset_kernel(__global char * mem_d, short val, int number_bytes){
    const int thread_id = get_global_id(0);
    mem_d[thread_id] = val;
}


In [2]:
from clgen._preprocess import preprocess

# Pass the raw kernel text through CLgen's preprocess(). In the output recorded
# for this cell, the commented-out `#define` line is gone, the kernel is named
# `A`, its parameters and local variable are named `a`..`d`, and the code
# layout has been normalized.
rewritten = preprocess(code)
print(rewritten)

__kernel void A(__global char* a, short b, int c) {
  const int d = get_global_id(0);
  a[d] = b;
}


# Source Encoder

In [3]:
from clgen._atomizer import GreedyAtomizer
from clgen._langs import Language

# Derive a vocabulary from the text of the rewritten kernel alone. Printing the
# atomizer reports the vocabulary size (27 tokens in the recorded output).
lang = Language.from_str("opencl")
atomizer = GreedyAtomizer.from_text(lang, text=rewritten)
print(atomizer)

GreedyAtomizer[27 tokens]


In [4]:
import pandas as pd

# Show the vocabulary as a one-column table in sorted order. Every token is
# wrapped in single quotes, which keeps whitespace tokens such as ' ' and '  '
# distinguishable in the rendered table.
pd.DataFrame(
    data=sorted(f"'{k}'" for k in atomizer.vocab),
    columns=["token"],
)

Unnamed: 0,token
0,'\n'
1,' '
2,' '
3,'('
4,')'
5,'*'
6,"','"
7,'0'
8,';'
9,'='


In [5]:
# Encode the rewritten kernel as a sequence of vocabulary indices, one integer
# per token (53 indices in the recorded output).
encoded = atomizer.atomize(rewritten)
print(encoded)

[14  1 24  1 10  3 13  1 18  5  1 15  6  1 23  1 16  6  1 22  1 17  4  1
 25  0  2 19  1 22  1 20  1  9  1 21  3  7  4  8  0  2 15 11 20 12  1  9
  1 16  8  0 26]


In [6]:
# Decode the sequence one index at a time and print every token wrapped in
# <...> on a single line, so that the token boundaries are visible.
for i in encoded:
    t = atomizer.deatomize([i])
    # A newline token is shown as the two characters "\n" so it does not break the line.
    t = '\\n' if t == '\n' else t
    print(f"<{t}>", end="")

<__kernel>< ><void>< ><A><(><__global>< ><char><*>< ><a><,>< ><short>< ><b><,>< ><int>< ><c><)>< ><{><\n><  ><const>< ><int>< ><d>< ><=>< ><get_global_id><(><0><)><;><\n><  ><a><[><d><]>< ><=>< ><b><;><\n><}>

# Padding

In [7]:
from keras.preprocessing.sequence import pad_sequences

# Use the vocabulary size as the padding value. In the recorded run this is 27,
# while the encoded indices above only range over 0..26, so the padding value
# does not appear among them.
pad_val = atomizer.vocab_size
# `encoded` is wrapped in a list to form a one-sequence batch, and [0] takes
# that single padded row back out. The recorded output shows the 22 extra
# positions filled with the padding value at the front of the sequence.
# NOTE(review): the 22 looks like an arbitrary demo width — confirm or name it.
pad_sequences([encoded], maxlen=len(encoded) + 22, value=pad_val)[0]

Using TensorFlow backend.


array([27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
       27, 27, 27, 27, 27, 14,  1, 24,  1, 10,  3, 13,  1, 18,  5,  1, 15,
        6,  1, 23,  1, 16,  6,  1, 22,  1, 17,  4,  1, 25,  0,  2, 19,  1,
       22,  1, 20,  1,  9,  1, 21,  3,  7,  4,  8,  0,  2, 15, 11, 20, 12,
        1,  9,  1, 16,  8,  0, 26], dtype=int32)

# Build Vocabulary

In [8]:
import pandas as pd

# Concatenate every entry of the `src` column of amd.csv into one
# newline-separated string that serves as the corpus text.
# NOTE(review): the CSV location is an absolute, machine-specific path.
srcs = '\n'.join(
    pd.read_csv("/data/ncc_data/opencl/raw/amd.csv")['src'].values
)
# A string containing k newline characters consists of k + 1 lines.
print("lines of code:", srcs.count('\n') + 1)

lines of code: 45497


In [9]:
# Build a second atomizer, this time from the whole corpus string `srcs`
# instead of a single kernel; the recorded run reports 128 tokens.
# `Language` and `GreedyAtomizer` are the names imported in the earlier
# encoder cell.
lang = Language.from_str("opencl")
derived_atomizer = GreedyAtomizer.from_text(lang, text=srcs)
print("derived vocabulary:", derived_atomizer)

derived vocabulary: GreedyAtomizer[128 tokens]


In [10]:
# Cap rendered tables at 10 rows so the 128-token vocabulary is displayed in
# abbreviated form (leading and trailing rows with an ellipsis in between).
pd.options.display.max_rows = 10
pd.DataFrame(
    data=sorted(f"'{k}'" for k in derived_atomizer.vocab),
    columns=["token"],
)

Unnamed: 0,token
0,'\n'
1,' '
2,' '
3,'!'
4,'%'
...,...
123,'y'
124,'z'
125,'{'
126,'|'
