# Imports

In [7]:
import random
from random import sample
random.seed(10)

from csv import reader
from tqdm import tqdm #inline progress bar (quality of life)
import sentencepiece as spm

#SAVE DATA FOR BiLSTM

import json 

import ast
import tokenize
import io

In [53]:

#move data to location for analyzing
def make_data(file):
    """Load (code, comment) rows from a CSV file and mark whitespace in the code.

    For every usable row, column 0 is treated as the code and column 1 as the
    comment.  Whitespace in the code is rewritten into marker words, each
    preceded by a space: every space becomes ' SPACE', every newline
    ' NEWLINE' and every tab ' TAB'.

    Rows with fewer than two columns (for example blank lines) are skipped
    and contribute nothing to the returned values.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A tuple ``(data, number_of_lines)`` where ``data`` is a list of
        ``[marked_code, comment]`` pairs in file order and
        ``number_of_lines`` is the total number of newline characters found
        in the unmodified code of the rows that were kept.
    """
    number_of_lines = 0
    data = []

    # The ``with`` block closes the file; no explicit close() is needed.
    with open(file, 'r') as read_obj:
        print('loading data from ',file,'...')

        for row in tqdm(reader(read_obj)):
            # A csv reader cannot drop rows, so malformed rows are simply
            # skipped here instead of being "removed".
            if len(row) < 2:
                continue

            code, comment = row[0], row[1]
            number_of_lines += code.count('\n')

            # Spaces must be marked first: the NEWLINE/TAB markers are
            # themselves inserted with a leading space, and marking spaces
            # afterwards would tag those inserted spaces as well.
            new_code = code.replace(' ', ' SPACE')
            new_code = new_code.replace('\n', ' NEWLINE')
            new_code = new_code.replace('\t', ' TAB')

            # TODO: filter rows containing problematic unicode characters.

            data.append([new_code, comment])

    print('num samples: ', len(data))
    print('num lines: ', number_of_lines)

    return data,number_of_lines


class Tokenizer:
    """Small convenience wrapper around a SentencePiece processor."""

    def __init__(self, filepath):
        """Load the SentencePiece model file located at ``filepath``."""
        processor = spm.SentencePieceProcessor(model_file=filepath)
        self.sp = processor

    def encode(self, text, t=int):
        """Encode ``text``; ``t`` is forwarded to SentencePiece as ``out_type``.

        Elsewhere in this file ``t=int`` (the default) is used to get numbers
        and ``t=str`` to get word pieces.
        """
        encoded = self.sp.encode(text, out_type=t)
        return encoded

    def decode(self, pieces):
        """Return whatever the underlying processor decodes ``pieces`` to."""
        decoded = self.sp.decode(pieces)
        return decoded

    @staticmethod
    def train(input_file='data/raw_sents.txt', model_prefix='sp_model', vocab_size=30522,number_of_lines=10000):
        """Train a SentencePiece model from ``input_file``.

        ``number_of_lines`` is passed on as ``input_sentence_size`` and input
        sentence shuffling is always enabled.  Nothing is returned.
        """
        trainer_options = {
            'input': input_file,
            'model_prefix': model_prefix,
            'vocab_size': vocab_size,
            'input_sentence_size': number_of_lines,
            'shuffle_input_sentence': True,
        }
        spm.SentencePieceTrainer.train(**trainer_options)

#detokenize
def decode_tokenized_code_snippet(tokens):
    """Decode ``tokens`` and restore the whitespace marker words.

    The module-level ``tokenizer`` decodes ``tokens``; the result is joined
    into one string and the words SPACE, NEWLINE and TAB are then replaced,
    in that order, by a space, a newline and a tab.

    NOTE(review): the replacement is a plain substring replace, so any other
    occurrence of these words in the decoded text is rewritten as well.
    """
    text = ''.join(tokenizer.decode(tokens))
    for marker, whitespace in (('SPACE', ' '), ('NEWLINE', '\n'), ('TAB', '\t')):
        text = text.replace(marker, whitespace)
    return text

#print(decode_tokenized_code_snippet(tokens))



def generate_code_from_data_no_edge(data,num_samples):
    """Concatenate randomly sampled code snippets into one token stream.

    ``num_samples`` distinct rows are drawn (without replacement) from
    ``data[1:]``; index 0 is never sampled (presumably a header row --
    confirm against the CSV).  The code of each row (column 0) is tokenized
    with the module-level ``tokenizer`` and appended to the output.

    Args:
        data: Sequence of rows whose first element is the marked-up code.
        num_samples: Number of snippets to concatenate.

    Returns:
        A tuple ``(generated_code, break_locations, generated_code_ids)``:
        the concatenated string pieces, the index of the last token of every
        snippet except the final one, and the concatenated integer ids of
        the same snippets.

    Raises:
        ValueError: If ``num_samples`` exceeds ``len(data) - 1`` (raised by
            ``random.sample``).
    """
    # Sampling from the range directly avoids materialising every index.
    sampled_indexes = sample(range(1, len(data)), num_samples)

    generated_code = []
    generated_code_ids = []
    break_locations = []
    for i in sampled_indexes:
        source = data[i][0]  # just the code, thus the 0

        generated_code += tokenizer.encode(source, t=str)
        # Keep the ids in step with the pieces so the third return value
        # actually describes the generated code.
        generated_code_ids += tokenizer.encode(source)

        # No break is recorded after the final snippet.
        if len(break_locations) < num_samples - 1:
            break_locations.append(len(generated_code) - 1)

    return generated_code, break_locations, generated_code_ids


def generate_valid_code_from_data_no_edge(data,num_samples):
    """Concatenate random snippets, keeping only those that stay tokenizable.

    Snippets are drawn one at a time, with replacement, from ``data[1:]``
    (index 0 is never drawn).  A candidate is accepted only if the code
    accumulated so far plus the candidate passes ``check_code_validity``;
    rejected candidates are discarded and another one is drawn.

    Args:
        data: Sequence of rows whose first element is the marked-up code.
        num_samples: Number of snippets the result must contain.

    Returns:
        A tuple ``(generated_code, break_locations, generated_code_ids)``:
        the concatenated string pieces, the index of the last token of every
        accepted snippet except the final one, and the concatenated integer
        ids of the accepted snippets.

    Raises:
        ValueError: If ``num_samples`` is positive and ``data`` has fewer
            than two rows (raised by ``random.randrange``).
    """
    # Number of consecutive rejections tolerated before the partial result
    # is thrown away and generation starts over.
    # NOTE(review): restarting is an interpretation of the "reset" intent;
    # termination still relies on the data containing snippets that pass
    # validation.
    max_trys = 100
    trys = 0

    generated_code = []
    generated_code_ids = []
    break_locations = []

    while len(break_locations) < num_samples:

        # Draw one row index in [1, len(data)) without building an index list.
        new_index = random.randrange(1, len(data))
        source = data[new_index][0]
        sampled_code_snippet = tokenizer.encode(source, t=str)

        # Rebuild the text of "everything so far + candidate" for validation.
        token_string = ''.join(generated_code + sampled_code_snippet)
        token_string = token_string.replace('SPACE',' ')
        token_string = token_string.replace('NEWLINE','\n')
        token_string = token_string.replace('TAB','\t')

        if check_code_validity(token_string):
            generated_code += sampled_code_snippet
            generated_code_ids += tokenizer.encode(source)
            break_locations.append(len(generated_code) - 1)
            trys = 0
        else:
            trys += 1
            if trys >= max_trys:
                # The current prefix keeps rejecting candidates: start over.
                generated_code = []
                generated_code_ids = []
                break_locations = []
                trys = 0

    # The break after the final snippet is the end of the stream, not a
    # boundary between snippets, so it is dropped.
    return generated_code, break_locations[:-1], generated_code_ids


def centered_sliding_window(token_list, window_diamiter,encode=False,PAD='unk'):
    """Build one window per element of ``token_list``, centred on that element.

    Each window holds ``window_diamiter`` elements before the centre, the
    centre itself and ``window_diamiter`` elements after it.  Positions that
    fall outside ``token_list`` are filled with ``PAD``.

    Args:
        token_list: List of tokens (or labels) to slide over.
        window_diamiter: Number of neighbours kept on each side of the centre.
        encode: When true, every entry of the window (padding included) is
            passed through the module-level ``tokenizer`` and replaced by a
            single id.
        PAD: Filler used for positions outside ``token_list``.

    Returns:
        A list with one window (itself a list) per element of ``token_list``.

    NOTE(review): with ``encode=True`` an entry whose encoding is empty is
    dropped, so such a window is shorter than ``2 * window_diamiter + 1``.
    """
    total = len(token_list)
    windows = []

    for center in range(total):
        # How many filler entries are needed on each side (never negative).
        left_pad = max(0, window_diamiter - center)
        right_pad = max(0, center + 1 + window_diamiter - total)

        left = [PAD] * left_pad + token_list[max(0, center - window_diamiter):center]
        right = token_list[center + 1:center + 1 + window_diamiter] + [PAD] * right_pad

        window = left + [token_list[center]] + right

        if encode:
            encoded_window = []
            for piece in window:
                ids = tokenizer.encode(piece)
                if len(ids) > 1:
                    # More than one id came back: keep the second one.
                    second = ids[1]
                    encoded_window.append(second[0] if type(second) == list else second)
                elif len(ids) == 1:
                    encoded_window.append(ids[0] if type(ids) == list else ids)
                # An empty encoding contributes nothing to the window.
            window = encoded_window

        windows.append(window)

    return windows

#put comment in at break points
#https://www.tutorialspoint.com/python/list_insert.htm
#list.insert(index, obj)
def insert_comments(code, break_spots, comment='\n'+'*'*8+'\n',at_begining=True):
    """Insert ``comment`` into ``code`` at every index in ``break_spots``.

    ``code`` is modified in place and also returned.  The indices are
    processed from the last entry of ``break_spots`` to the first, so that
    (for ascending indices) an insertion does not shift the positions still
    to be handled.  ``at_begining`` is accepted but currently unused.

    Args:
        code: List of tokens to insert into (mutated).
        break_spots: Indices at which ``comment`` is inserted.
        comment: Text to insert; by default a line of eight asterisks
            surrounded by newlines.
        at_begining: Unused.

    Returns:
        The same ``code`` list, now containing the inserted comments.
    """
    for position in reversed(break_spots):
        code.insert(position, comment)
    return code


#code=['a','b','c','d']
#c = insert_comments(code,[2])
#print(c)

def make_data_points(num_segments_per_MSC = 5, window_diameter=20):
    """Generate one multi-snippet sample and turn it into training windows.

    A token stream made of ``num_segments_per_MSC`` snippets is generated
    from the module-level ``data``.  The stream and its per-token labels
    (1 at a break location, 0 elsewhere) are each cut into centred windows
    with ``window_diameter`` neighbours on either side.

    Returns:
        A tuple ``(X_windows, Y_windows)``: the encoded token windows and the
        label windows (padded with 0).
    """
    # generate_code_from_data_no_edge returns the same triple and can be
    # swapped in here if validity checking is not wanted.
    tokens, break_indices, _ = generate_valid_code_from_data_no_edge(data, num_segments_per_MSC)

    # Ground truth: mark the last token of every snippet boundary.
    labels = [0] * len(tokens)
    for position in break_indices:
        labels[position] = 1

    X_windows = centered_sliding_window(tokens, window_diameter, encode=True)
    Y_windows = centered_sliding_window(labels, window_diameter, encode=False, PAD=0)

    return X_windows, Y_windows

def make_generated_code_dataset(num_snippets, name):
    """Generate ``num_snippets`` samples and write them to a JSON file.

    Every sample comes from ``make_data_points()`` (default arguments) and
    is stored under its running index, as a string key, in the form
    ``{'x': ..., 'y': ...}``.  The whole mapping is written to
    ``name + "_LSTM_DATA.json"`` in the current working directory.
    """
    dataset = {}
    for index in tqdm(range(num_snippets)):
        windows_x, windows_y = make_data_points()
        dataset[str(index)] = {'x': windows_x, 'y': windows_y}

    output_path = name+"_LSTM_DATA.json"
    with open(output_path, "w") as outfile:
        json.dump(dataset, outfile)
        
def check_code_validity(code_snippet):
    """Return True if ``code_snippet`` can be fully tokenized as Python.

    The snippet is first passed through ``decode_tokenized_code_snippet`` and
    the result is fed to the standard ``tokenize`` module.  Any exception
    raised along the way makes the snippet count as invalid.
    """
    try:
        source = decode_tokenized_code_snippet(code_snippet)
        readline = io.BytesIO(source.encode('utf-8')).readline
        # The tokenizer is lazy: it has to be exhausted for errors to surface.
        for _ in tokenize.tokenize(readline):
            pass
    except Exception:
        return False
    return True


In [47]:
#get data
# `data` and `tokenizer` are module-level globals: the generation and
# decoding functions defined above read them by name.
data, number_of_lines = make_data('code-comment-short_py.csv')
#instantiate tokenizer model
tokenizer = Tokenizer('short_py_tokenizer.model')

47256it [00:00, 227909.30it/s]

loading data from  code-comment-short_py.csv ...


384274it [00:01, 223973.76it/s]

num samples:  384274
num lines:  1379956





In [57]:
#EXAMPLES and TESTS


#tokenize code (ie encode)
#with words
#tokens = tokenizer.encode(data[1][0],t=str)
#with numbers
tokens = tokenizer.encode(data[1][0])
print(tokens)

'''
num_segments_per_MSC = 10
code, breaks, _ = generate_code_from_data_mark_begiging(data,num_segments_per_MSC)
#do it with the code tokens
wd=20 #window diameter
X_windows = centered_sliding_window(code,wd,encode=True)
#print(X_windows)
#print(len(X_windows))
'''

#example
'''
wd=3#window diameter
l = ['a','b','c','d','e']#,'f','g','h','i','j','k','l']
windows = centered_sliding_window(l,wd,encode=False)
for w in windows:
    #print(len(w))
    print('center:',w[int(len(w)/2)],'- window: ',w)
'''

#simplest tokenizer example
#tokenizer.decode(201)

####
####ALIGNMENT ISSUES, UNCOMMENT THE 'good' AND 'bad' PRINT,
####SEE SOMETIMES WHEN UNICODE CHAR IS THERE IT MESSES UP
####

# Generate one multi-snippet sample and show it as plain source text.
num_segments_per_MSC = 5
code, breaks, _ = generate_valid_code_from_data_no_edge(data,num_segments_per_MSC)
#code, breaks, _ = generate_code_from_data_no_edge(data,num_segments_per_MSC)
#code, breaks, _ = generate_code_from_data_mark_begiging(data,num_segments_per_MSC)
#code, breaks, _ = generate_code_from_data(data,3)
#code, breaks = generate_aligned_code_from_data_2(data,3)
#code, breaks = generate_verified_code_from_data(data,3)
print('--------------------------------')
print('code:')
print('--------------------------------')
# Same steps as decode_tokenized_code_snippet: decode, then restore the
# whitespace that make_data replaced with marker words.
decoded = tokenizer.decode(code)
token_string = ''.join(decoded)
token_string = token_string.replace('SPACE',' ')
token_string = token_string.replace('NEWLINE','\n')
token_string = token_string.replace('TAB','\t')
print(token_string)
#print(code)
'''
for i in range(len(code)):
    if i in breaks:
        print(code[i],'------')
    else:
        print(code[i])
'''
print('--------------------------------')
print('break spots: ',breaks)
print('--------------------------------')
print('code with break spots indicated:')
print('--------------------------------')
#insert_comments
# NOTE: insert_comments mutates `code` in place and returns it, so `code`
# and `comments_added` are the same list after this call.
comments_added = insert_comments(code,breaks)
comments_added_decoded = tokenizer.decode(comments_added)
comments_added_token_string = ''.join(comments_added_decoded)
comments_added_token_string = comments_added_token_string.replace('SPACE',' ')
comments_added_token_string = comments_added_token_string.replace('NEWLINE','\n')
comments_added_token_string = comments_added_token_string.replace('TAB','\t')
print(comments_added_token_string)
print('--------------------------------')


[3, 11, 3, 5, 16, 5, 3909, 7, 6, 10, 3, 50, 9, 4, 3, 3, 3, 3, 4, 3, 3, 3, 12, 3, 6, 15, 34, 8, 16, 7, 50, 10, 3, 27, 13, 4, 4]
--------------------------------
code:
--------------------------------
TEMPLATE_FILE  =  "testcircuits/index_template.html" 
        def  accept_comment(self,  comment_id): 
                 
                raise  NotImplementedError() 
        def  connected(self): 
                 
                return  self._connected 
 
 class  FlowBuilderTransportError(XDDError): 
         
 
class  FlowBuilderTransportForwardingServer(SocketServer.TCPServer): 
        def  pack(self,  obj): 
                 
                return  pickle.dumps(self._box(obj),  protocol  =  PICKLE_PROTOCOL) 
 

--------------------------------
break spots:  [18, 56, 89, 123]
--------------------------------
code with break spots indicated:
--------------------------------
TEMPLATE_FILE  =  "testcircuits/index_template.html"
********
 
        def  accept_comment(self,  comment_id): 

In [58]:
#lang='py'
#lang='cpp'
#lang='java'
#lang='all'

# Build one LSTM dataset per (language, snippet length) combination.
# `data` and `tokenizer` are rebound as module-level globals on every
# iteration because the functions called by make_generated_code_dataset
# read them by name.
for i in ['py']:#,'cpp','java']:
    for j in ['short','medium','long']:
        print('trianing...',i,j)
        # e.g. 'short_py': used for the CSV name, the tokenizer model name
        # and the prefix of the output JSON file.
        name = j+'_'+i
        file = 'code-comment-'+name+'.csv'
        #get data
        data, number_of_lines = make_data(file)
        #instantiate tokenizer model
        tokenizer = Tokenizer(name+'_tokenizer.model')
        #generate the dataset for the lstm
        make_generated_code_dataset(10000, name)




11101it [00:00, 68105.54it/s]

trianing... py short
loading data from  code-comment-short_py.csv ...


384274it [00:01, 230968.34it/s]
  0%|          | 3/10000 [00:00<06:41, 24.90it/s]

num samples:  384274
num lines:  1379956


100%|██████████| 10000/10000 [06:32<00:00, 25.45it/s]
17808it [00:00, 178079.75it/s]

trianing... py medium
loading data from  code-comment-medium_py.csv ...


1090472it [00:06, 175684.14it/s]
  0%|          | 2/10000 [00:00<12:06, 13.77it/s]

num samples:  1090472
num lines:  6496023


100%|██████████| 10000/10000 [12:48<00:00, 13.02it/s]
18955it [00:00, 93047.39it/s]

trianing... py long
loading data from  code-comment-long_py.csv ...


355397it [00:03, 95311.11it/s]
  0%|          | 1/10000 [00:00<24:09,  6.90it/s]

num samples:  355397
num lines:  4717632


100%|██████████| 10000/10000 [25:50<00:00,  6.45it/s] 
