# BPE Tutorial

> This notebook contains the code for training a Byte-Pair Encoding (BPE) tokenizer.

In [None]:
import pandas as pd

from icodegen.data.core import *
from pathlib import Path
from sklearn.model_selection import train_test_split

In [None]:
# Base data directory (relative path); later cells reuse `path` for the
# tokenizer output as well.
path = Path('../dvc-icodegen')

# The dataset is '~'-separated and its first column is used as the row index.
df = pd.read_csv(
    path/'searchnet/[codesearchnet-java-1597073966.81902].csv',
    sep='~',
    index_col=0,
)
df.head()

In [None]:
# Total number of rows in the loaded dataset.
df.shape[0]

In [None]:
# Slice the dataset by its `partition` column. Each slice is copied so that
# later modifications do not write back into `df`.
df_trn, df_val, df_tst = (
    df[df['partition'] == split_name].copy()
    for split_name in ('train', 'valid', 'test')
)
print(len(df_trn))
df_trn.head()

In [None]:
# Sanity check: stacking the test, train and valid slices (in that order) must
# reproduce `df` cell for cell, i.e. the three partitions cover the whole frame.
# NOTE(review): element-wise `==` requires both frames to carry identical row
# labels in the same order, and NaN never compares equal to NaN -- confirm the
# CSV satisfies both, otherwise this check raises or fails spuriously.
assert (pd.concat([df_tst, df_trn, df_val]) == df).all(axis=None)

In [None]:
# Hold out 10% of the training partition; these rows are used further down to
# fit the BPE tokenizer. The 10% figure was chosen to match the paper
# "Maybe Deep Neural Networks are the Best Choice for Modeling Source Code".
# A fixed seed makes the split -- and therefore the CSV and tokenizer written
# below -- reproducible across notebook runs.
df_new_trn, df_bpe = train_test_split(df_trn, test_size=0.1, random_state=42)

In [None]:
# Row counts of the reduced training set and of the BPE hold-out.
df_new_trn.shape[0], df_bpe.shape[0]

In [None]:
# Relabel every held-out row so it can be told apart from the remaining
# 'train' rows once the frames are concatenated again.
df_bpe['partition'] = 'bpe'

In [None]:
# Preview the first five relabelled rows.
df_bpe.head(n=5)

In [None]:
# Reassemble the dataset from the four partitions (train, bpe, valid, test)
# and renumber the rows from 0 so the index is unique and contiguous.
df_new = pd.concat(
    [df_new_trn, df_bpe, df_val, df_tst],
    ignore_index=True,
)
df_new.head()

In [33]:
# Number of rows per partition, in the order (train, bpe, valid, test).
tuple(
    len(df_new[df_new.partition == split_name])
    for split_name in ('train', 'bpe', 'valid', 'test')
)

(409005, 45446, 15328, 26909)

In [32]:
# Clean the dataset before tokenizer training: first pass it through
# `remove_non_ascii`, then through `replace_special_tokens` with the Java
# special-token table. Both helpers are defined outside this notebook
# (presumably in icodegen.data.core -- see its documentation for details).
df_non_ascii = remove_non_ascii(df_new)
df_replaced = replace_special_tokens(df_non_ascii, java_special_tokens)

# Rows per partition after cleaning, in the order (train, bpe, valid, test).
# Every mask is built from `df_replaced` itself: a mask taken from `df_new`
# belongs to a different frame and only works when pandas can re-align it by
# index with the cleaned frame.
tuple(
    len(df_replaced[df_replaced.partition == split_name])
    for split_name in ('train', 'bpe', 'valid', 'test')
)

(405507, 45039, 15021, 26716)

In [35]:
# Persist the cleaned, re-partitioned dataset as a '~'-separated CSV.
# NOTE(review): this writes to the same file the notebook loads at the top,
# but without an index column, whereas that file was read with index_col=0.
# Re-running the notebook from the start would then consume the first data
# column as the index -- confirm that overwriting in place is intended.
df_replaced.to_csv(
    path/'searchnet/[codesearchnet-java-1597073966.81902].csv',
    sep='~',
    index=False,
)

In [36]:
# Fit the tokenizer on the held-out 'bpe' rows only, save it under
# bpe/tokenizer-java.json, and display the resulting vocabulary size.
bpe_mask = df_replaced['partition'] == 'bpe'
tokenizer = train_tokenizer(df_replaced[bpe_mask])
tokenizer.save(str(path/'bpe/tokenizer-java.json'))
tokenizer.get_vocab_size()

10000

In [39]:
from tokenizers import Tokenizer

# Reload the tokenizer from the file written above to confirm it round-trips.
new_tokenizer = Tokenizer.from_file(str(path/'bpe/tokenizer-java.json'))

# Encode a short string made of special-token markers and show how it is split.
sample_encoding = new_tokenizer.encode(' <&&> <{><')
sample_encoding.tokens

['Ġ', '<&&>', 'Ġ', '<{>', 'Ġ<']

In [38]:
# Dump the full token -> id mapping of the reloaded tokenizer, added tokens
# included (the default behaviour, spelled out explicitly here).
new_tokenizer.get_vocab(with_added_tokens=True)

{'ged': 3155,
 'decess': 8629,
 'UTE': 6167,
 'external': 8455,
 'ingleton': 6260,
 'Ġmaven': 9464,
 '8': 112,
 'lock': 779,
 'Ġlimit': 3246,
 'xff': 3949,
 'vileged': 5194,
 'Ġbs': 6677,
 'Mon': 1876,
 'raph': 1615,
 'render': 5678,
 'ium': 6407,
 'Closure': 5252,
 'ssert': 1774,
 'IfNot': 8177,
 'riter': 879,
 'UN': 1230,
 'Ġchildren': 3980,
 'ĉĉĉ': 376,
 'TOKEN': 3881,
 'Ġgraph': 4846,
 'UUID': 2674,
 'getPosition': 9600,
 'Orientation': 7732,
 '<{><': 1203,
 'STOM': 9092,
 'Ġlogging': 9411,
 'Trans': 1051,
 'Ġe': 314,
 'CHAN': 4835,
 'bucket': 4818,
 'Epoch': 9319,
 'iv': 952,
 'AU': 4224,
 'GnuP': 9889,
 'RL': 5599,
 'logging': 9859,
 'ĠLong': 1996,
 'Ġ"_': 5341,
 'Ġdeserialize': 8581,
 'Ġeps': 7750,
 ">'\\": 1936,
 'getRight': 8545,
 'Ġpacket': 5826,
 'Ġfail': 4025,
 'Day': 2121,
 'ITI': 7580,
 "Ġ'<(>'": 6877,
 'ArrayIndex': 7890,
 'Focus': 8742,
 'Future': 1517,
 'ĠSimpleDateFormat': 7697,
 'Ġdelegate': 5248,
 'Ġrelationship': 9041,
 'release': 5844,
 "<(>'</>": 7036,
 'types': 

In [None]:
# idx = 0
# df_beaut = beautify_code(df_trn, n = 10)
# df_replaced = replace_special_tokens(df_beaut, java_special_tokens)

# tokenizer = train_tokenizer(df_trn)
# encoded = tokenizer.encode(df_replaced.code.values[idx])
# print(df_replaced.code.values[idx])
# print('=' * 100)
# print(encoded.tokens)