In [None]:
#https://pypi.org/project/sentencepiece/
#http://ethen8181.github.io/machine-learning/deep_learning/subword/bpe.html

In [1]:
from csv import reader
from tqdm import tqdm #inline progress bar (quality of life)
import sentencepiece as spm

In [2]:
#move data to location for analizing
def make_data(file):
    # open file in read mode
    number_of_lines = 0
    num_chars = 0 #TODO
    with open(file, 'r') as read_obj:
        print('loading data from ',file,'...')
        # pass the file object to reader() to get the reader object
        csv_reader = reader(read_obj)
        # Iterate over each row in the csv using reader object

        data = []
        for row in tqdm(csv_reader):
            try:
                # row variable is a list that represents a row in csv
                #row[0] - code
                #row[1] - comment

                number_of_lines+=row[0].count('\n')

                #way to handle white spaces:
                #space #do first, 
                new_code = row[0].replace(' ',' SPACE')
                #newline
                new_code = new_code.replace('\n',' NEWLINE')
                #tab
                new_code = new_code.replace('\t',' TAB')

                #TODO
                #FILTER FOR UNICODE THING


                #save new data
                
                #both code and comment
                row = [new_code,row[1]]
                
                data.append(row)

            #https://stackoverflow.com/questions/4166070/python-csv-error-line-contains-null-byte
            #https://intellipaat.com/community/18827/how-to-delete-only-one-row-in-csv-with-python
            except:
                csv_reader.remove(row)


        read_obj.close()

    print('num samples: ', len(data))
    print('num lines: ', number_of_lines)
    
    return data,number_of_lines

def save_data(data):
    with open('data.txt', 'w') as filehandle:
        for i in tqdm(range(len(data))):
            filehandle.write('%s\n' % data[i][0])
            
            
class Tokenizer:

    #def __init__(self, filepath='python_tokenizer.model'):
    def __init__(self, filepath):
        self.sp = spm.SentencePieceProcessor(model_file=filepath)

    def encode(self, text, t=int):
        return self.sp.encode(text, out_type=t)

    def decode(self, pieces):
        return self.sp.decode(pieces)

    @staticmethod
    def train(input_file='data/raw_sents.txt', model_prefix='sp_model', vocab_size=30522,number_of_lines=10000):
        spm.SentencePieceTrainer.train(input=input_file, model_prefix=model_prefix, vocab_size=vocab_size,
                                       #input_sentence_size=2 ** 16, shuffle_input_sentence=True)
                                       input_sentence_size=number_of_lines, shuffle_input_sentence=True)
        
        
#TRAIN A TOKENIZER
def train_tokenizer(name):
    #get data
    file = 'code-comment-'+name+'.csv'
    data, number_of_lines = make_data(file)
    save_data(data)

    #train
    Tokenizer.train(input_file='data.txt', model_prefix=name+'_tokenizer', vocab_size=10000, number_of_lines=number_of_lines) #model_prefix is model storage name


In [13]:
#lang='py'
#lang='cpp'
lang='java'
#lang='all'

In [5]:
for i in ['py','cpp','java','all']:
    for j in ['short','medium','long']:
        print('trianing...',i,j)
        train_tokenizer(j+'_'+i)
        

21608it [00:00, 216078.15it/s]

trianing... py short
loading data from  code-comment-short_py.csv ...


384274it [00:01, 222010.64it/s]
 35%|███▍      | 133479/384274 [00:00<00:00, 1334778.54it/s]

num samples:  384274
num lines:  1379956


100%|██████████| 384274/384274 [00:00<00:00, 1333802.81it/s]
30040it [00:00, 147329.49it/s]

trianing... py medium
loading data from  code-comment-medium_py.csv ...


1090472it [00:07, 152365.00it/s]
 16%|█▋        | 177371/1090472 [00:00<00:01, 885453.86it/s]

num samples:  1090472
num lines:  6496023


100%|██████████| 1090472/1090472 [00:01<00:00, 841240.25it/s]
6976it [00:00, 69757.74it/s]

trianing... py long
loading data from  code-comment-long_py.csv ...


355397it [00:04, 77217.66it/s]
 15%|█▌        | 53366/355397 [00:00<00:00, 533654.15it/s]

num samples:  355397
num lines:  4717632


100%|██████████| 355397/355397 [00:00<00:00, 494003.61it/s]
45681it [00:00, 217830.88it/s]

trianing... cpp short
loading data from  code-comment-short_cpp.csv ...


1579746it [00:06, 243431.72it/s]
 16%|█▌        | 255899/1579746 [00:00<00:01, 1272512.08it/s]

num samples:  1579746
num lines:  2593695


100%|██████████| 1579746/1579746 [00:01<00:00, 1340261.45it/s]
12873it [00:00, 128722.45it/s]

trianing... cpp medium
loading data from  code-comment-medium_cpp.csv ...


648940it [00:04, 135410.09it/s]
 13%|█▎        | 81244/648940 [00:00<00:00, 812438.84it/s]

num samples:  648940
num lines:  3623966


100%|██████████| 648940/648940 [00:00<00:00, 784037.38it/s]
15037it [00:00, 74968.28it/s]

trianing... cpp long
loading data from  code-comment-long_cpp.csv ...


161779it [00:02, 78422.70it/s]
 32%|███▏      | 51949/161779 [00:00<00:00, 519463.25it/s]

num samples:  161779
num lines:  2203111


100%|██████████| 161779/161779 [00:00<00:00, 529823.54it/s]
51457it [00:00, 249726.59it/s]

trianing... java short
loading data from  code-comment-short_java.csv ...


2234085it [00:08, 256416.36it/s]
  7%|▋         | 148340/2234085 [00:00<00:01, 1483383.73it/s]

num samples:  2234085
num lines:  5200971


100%|██████████| 2234085/2234085 [00:01<00:00, 1511671.06it/s]
28001it [00:00, 140260.38it/s]

trianing... java medium
loading data from  code-comment-medium_java.csv ...


1030612it [00:07, 138948.89it/s]
  8%|▊         | 79099/1030612 [00:00<00:01, 790958.70it/s]

num samples:  1030612
num lines:  5912465


100%|██████████| 1030612/1030612 [00:01<00:00, 831215.26it/s]
13408it [00:00, 66679.23it/s]

trianing... java long
loading data from  code-comment-long_java.csv ...


281307it [00:03, 71926.38it/s]
 18%|█▊        | 51207/281307 [00:00<00:00, 512064.38it/s]

num samples:  281307
num lines:  3757412


100%|██████████| 281307/281307 [00:00<00:00, 476286.95it/s]
47671it [00:00, 240130.91it/s]

trianing... all short
loading data from  code-comment-short_all.csv ...


4198103it [00:16, 248348.42it/s]
  3%|▎         | 137191/4198103 [00:00<00:02, 1371898.22it/s]

num samples:  4198103
num lines:  9174622


100%|██████████| 4198103/4198103 [00:03<00:00, 1383857.86it/s]
15952it [00:00, 159509.12it/s]

trianing... all medium
loading data from  code-comment-medium_all.csv ...


2770022it [00:17, 160578.02it/s]
  3%|▎         | 93199/2770022 [00:00<00:02, 931982.00it/s]

num samples:  2770022
num lines:  16032454


100%|██████████| 2770022/2770022 [00:03<00:00, 909118.78it/s]
15095it [00:00, 73538.17it/s]

trianing... all long
loading data from  code-comment-long_all.csv ...


798481it [00:10, 78555.88it/s]
  4%|▍         | 33511/798481 [00:00<00:02, 335087.15it/s]

num samples:  798481
num lines:  10678155


100%|██████████| 798481/798481 [00:01<00:00, 480101.66it/s]


In [10]:
#ues examples

#detokenize
def decode_tokenized_code_snippet(tokens):
    decoded = tokenizer.decode(tokens)
    #decode but still has the added strings
    #print(decoded)
    token_string = ''.join(decoded)
    token_string = token_string.replace('SPACE',' ')
    token_string = token_string.replace('NEWLINE','\n')
    token_string = token_string.replace('TAB','\t')
    #print(token_string)
    return token_string

#instantiate tokenizer model
tokenizer = Tokenizer(lang+'_tokenizer.model')

example_code = data[1][0]
#tokenize code (ie encode)
#with words
tokens= tokenizer.encode(data[1][0],t=str)
#with numbers
tokens_nums = tokenizer.encode(example_code)
print(decode_tokenized_code_snippet(tokens))

for i in zip(tokens,tokens_nums):
    print(i)


int  FREAK_Impl::descriptorSize()  const 
{ 
        return  FREAK_NB_PAIRS  /  8;  //  descriptor  length  in  bytes 
} 

('▁int', 1209)
('▁SPACE', 3)
('FR', 2518)
('E', 27)
('AK', 2228)
('_', 5)
('Impl', 724)
('::', 18)
('descriptor', 1948)
('Size', 156)
('()', 20)
('▁SPACEconst', 19)
('▁NEWLINE', 4)
('{', 13)
('▁NEWLINE', 4)
('▁SPACE', 3)
('▁SPACE', 3)
('▁SPACE', 3)
('▁SPACEreturn', 24)
('▁SPACE', 3)
('FR', 2518)
('E', 27)
('AK', 2228)
('_', 5)
('NB', 1924)
('_', 5)
('PA', 2238)
('IR', 1152)
('S', 58)
('▁SPACE', 3)
('/', 92)
('▁SPACE', 3)
('8;', 923)
('▁SPACE', 3)
('//', 117)
('▁SPACEdescriptor', 1689)
('▁SPACElength', 434)
('▁SPACEin', 80)
('▁SPACEbytes', 449)
('▁NEWLINE', 4)
('}', 14)
('▁NEWLINE', 4)


In [None]:
a=1
a = 1