In [1]:
#unified_with_date

In [2]:
from nltk.tokenize import MWETokenizer, TweetTokenizer, WordPunctTokenizer 
from nltk.tokenize import word_tokenize
import re

VALUES = ['>>>=', '>>=', '<<=',  '%=', '^=', '|=', '&=', '/=',
                  '*=', '-=', '+=', '<<', '--', '++', '||', '&&', '!=',
                  '>=', '<=', '==', '%', '^', '|', '&', '/', '*', '-',
                  '+', ':', '?', '~', '!', '<', '>', '=', '...', '->', '::', '\\', '\\\\', '\*', '*\\', '\\\\\\']
INFIX = ['||', '&&', '|', '^', '&', '==', '!=', '<', '>', '<=', '>=', 
             '<<', '>>', '>>>', '+', '-', '*', '/', '%']
PREFIX = ['++', '--', '!', '~', '+', '-']
POSTFIX = ['++', '--']
ASSIGNMENT = ['=', '+=', '-=', '*=', '/=', '&=', '|=', '^=', '%=', '<<=', '>>=', '>>>=']
LAMBDA = ['->']
COMMENT = ['//', '/*', '*/']
METHOD_REFERENCE = ['::']

WHITESPACE_DICT = {"                ":'<|16-s|>', 
                   "            ":'<|12-s|>', 
                   "        ":'<|8-s|>', 
                   "    ":'<|4-s|>', 
                   "  ":'<|2-s|>', " ":'<|s|>',
                   "\t\t\t\t":'<|4-t|>',"\t\t\t":'<|3-t|>',"\t\t":'<|2-t|>',"\t":'<|t|>',"\n":'<|nl|>'}

WHITESPACES = [WHITESPACE_DICT[x] for x in WHITESPACE_DICT]

CUSTOM_TOKEN = ["<|startcode|>", "<|endcode|>", "<|startfocus|>", "<|endfocus|>", "<|startcomment|>", "<|endcomment|>", 
                   "<|stringliteral|>", "<|singlelinecomment|>", "<|multilinecomment|>", "<|del|>"]

values = list(set(INFIX+PREFIX+POSTFIX+ASSIGNMENT+LAMBDA+COMMENT+METHOD_REFERENCE))
token_phrases = []
word_punct_tokenizer = WordPunctTokenizer()
for tok in CUSTOM_TOKEN+WHITESPACES:
    temp = tuple(word_punct_tokenizer.tokenize(tok))
    token_phrases.append(temp)
for w in values:
    temp = tuple(MWETokenizer().tokenize(w))
    if len(temp) > 1:
        token_phrases.append(temp)

word_punct_tokenizer = WordPunctTokenizer()
tweet_tokenizer = TweetTokenizer()
mwe_tokenizer = MWETokenizer(token_phrases, separator="")

def state(c):
    n = ord(c)
    if n>=97 and n<=122: # lower case
        return 1
    elif n>=65 and n<=90: # upper case
        return 2
    elif n>=48 and n<=57: # numbers
        return 3
    elif c.isspace(): # whitespaces
        return 4
    elif c in ['_', '$']: 
        return 5
    elif n < 128:
        return 6
    else:
        return 7

def space_up(s):
    if s is None or s == "":
        return ""
    new_s = s[0]
    for i in range(1,len(s)):
        prev_state = state(s[i-1])
        curr_state = state(s[i])
        if prev_state in [1,2] and curr_state in [3]:
            new_s += " "
        elif prev_state in [1] and curr_state in [2]:
            new_s += " "
        elif prev_state in [3] and curr_state in [1,2]:
            new_s += " "
        elif prev_state in [1,2,3] and curr_state in [5]:
            new_s += " "
        elif prev_state in [5] and curr_state in [1,2,3]:
            new_s += " "
        new_s+=s[i]
    return new_s

def white_space_tokenize(s):
    for x in WHITESPACE_DICT:
        s = s.replace(x, WHITESPACE_DICT[x])
    for key in WHITESPACE_DICT:
        val = WHITESPACE_DICT[x]
        s = s.replace(val, " "+val+" ")
    return s

def extreme_tokenization(comment):
    comment = white_space_tokenize(comment)
    comment = space_up(comment)
    tokenized = tweet_tokenizer.tokenize(comment)
    tokenized = word_punct_tokenizer.tokenize(' '.join(tokenized))
    tokenized = mwe_tokenizer.tokenize(tokenized)
    tokenized_comment = ' '.join(tokenized)
    tokenized_comment = re.sub(r'[^\x00-\x7f]',r'', tokenized_comment)
    tokenized = tokenized_comment.split()
    return tokenized

In [None]:
from os import path
import json
import re
import subprocess
import nltk
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
nltk.download('punkt')

def getJsonData(JsonFile):
    with open(JsonFile, encoding="utf8") as f:
        data = json.load(f)
    return data
def read(path):
    return open(path, 'r', encoding = 'utf-8').read()


BASE="E:/codes/"
unique_data = getJsonData('E:\\unified_with_date.json')
data = unique_data[1020]
code_file_name = data['file_name']
base_patch_number = data['base_patch_number']
main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
main_code = read(main_code_path)


def get_source_target(file1, file2, line_number, change_window_size=5):
    source_target = subprocess.check_output(['java', '-jar', 'ChangedLineF.jar', \
                                file1, file2, str(line_number), str(change_window_size)]).decode("utf-8")
    (source, target) = source_target.split("<|source-target-sep|>")
    return source, target

def end_scope(start_index, lines):
    #print(lines)
    counter = 0
    end_index = -1
    found = False
    for index in range(start_index, len(lines), 1):
        #print(lines[index])
        #print(counter)
        for char in lines[index]:
            if(char=='{'):
                counter+=1
                found = True
            elif(char == "}"):
                counter-=1
                found = True
            if(counter==0 and found):
                end_index = index
                break
        if end_index!=-1:
            break
    return end_index

unique_data_processed = []
MAX_NUM_TOKEN = 800
UP_TOKEN = MAX_NUM_TOKEN//2
c = 0
ec = 0
errors = []
for idx, data in enumerate(unique_data):#[2428:2429]#[1000:1001]
    c+=1
    code_file_name = data['file_name']
    base_patch_number = data['base_patch_number']
    changed_patch_number = data['changed_patch_number']
    main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
    changed_code_path = BASE+code_file_name+'/'+str(changed_patch_number)+'.java'
    line_no = data['line_number']
    main_code = read(main_code_path)
    changed_code = read(changed_code_path)
    sample = main_code
    message = data['message']
    comment_id = data['comment_id']
    
    try:
        source, target = get_source_target(main_code_path, changed_code_path,line_no-1)
    except:
        error = {}
        error['main_code'] = main_code
        error['changed_code'] = changed_code
        error['line_no'] = line_no
        ec+=1
        if(main_code!=''):
            errors.append(error)
        print("falied to store unique_data= ", c, "error count", ec)    
    '''
    if len(errors)>100:
        with open('errors.json', 'w', encoding="utf8") as f:  # writing JSON object
            json.dump(errors, f)
        break
    '''
    #print(source)
    #print(len(errors))
    #print(ec)
    #continue
    sample = source
    #print(source)
    #for i in range(10):
    #    sample = re.sub(r"""\/\*([\S\s]+?)\*\/""", "", sample)
    
    data_dic = {}
    
    #data_dic['source'] = source
    data_dic['message'] = message
    
    data_dic['comment_id'] = comment_id
    data_dic['target'] = target
    data_dic['source'] = source
    data_dic['idx'] = c
    func_list = []
    data_dic['code_snippet'] = ""
    data_dic['prime_var_dic'] = {}
    #print(sample)
    classes = re.findall(r"(?:(public\s))?(class)\s([^\n\s]*)", sample)
    lines = sample.split('\n')
    class_list = []
    sp_start_func= -1
    sp_end_func= len(sample.split('\n'))   
    for cls_tpl in classes:
        s = "".join(cls_tpl[-1])
        s = s.strip()
        class_list.append(s) ## class list
        cls = s
        #print(s)
        start_index = -1
        for i in range(len(lines)):
            if cls in lines[i]:
                start_index = i
                break
        #print(start_index)
        end_index = end_scope(start_index, lines)
        #print(end_index)
        class_scope_dic = {}
        for i in range(start_index, end_index+1):
            class_scope_dic[i] = True
        funcs = re.findall(r"(public|protected|private|static|\s) +([\w\<\>\[\]]+\s+(\w+)) *\([^\)]*\) *(\{?|[^;])", "\n".join(lines[start_index:end_index+1]))
        func_scopes = []
        #print(funcs)
        #continue
        visited_func = set()
        for func in funcs:
            #print(func)
            if(func[-1]!= "{"):
                continue
            fc = list(func)
            fc = fc[-3]
            s = "".join(fc)
            s = s.strip()
            #func_list.append(s) ## function list
            #print(func_list)
            
            for index in range(start_index, end_index+1, 1):
                if s in lines[index]:
                    start_func = index
                    end_func = end_scope(index, lines)
                    
                    func_scopes.append((start_func,end_func))
                    if s not in visited_func:
                        visited_func.add(s)
                        func_list.append(lines[index].replace("{", "").strip())
                    
                    if(start_func<=line_no<=end_func):
                        special_func = "\n".join(lines[start_func:end_func+1])
                        #data_dic['code_snippet'] = special_func
                        #nonlocal sp_start_func
                        sp_start_func= start_func
                        #nonlocal sp_end_func
                        sp_end_func= end_func
        
        for func_sc in func_scopes:
            for i in range(func_sc[0], func_sc[1]+1):
                class_scope_dic[i] = False
        prime_var_list = []
        dic_vars = {}
        for i in range(start_index, end_index+1):
            if class_scope_dic[i]==True:
                #prime_vars = re.findall(r""""[^"]*"|((?=_[a-z_0-9]|[a-z])[a-z_0-9]+((?=\s*=)))""",lines[i])
                if '(' not in lines[i] and "return" not in lines[i] and "extends" not in lines[i]:
                    prime_vars = re.findall(r"""(\w+\s+)([a-zA-Z_][a-zA-Z0-9_]*)""", lines[i])
                    if len(prime_vars)==2:
                        if(len(prime_vars[1])==2 and prime_vars[1][1] in dic_vars.keys()):
                            dic_vars[prime_vars[1][1]]+=1
                        elif len(prime_vars[1])==2:
                            dic_vars[prime_vars[1][1]] =1
        #if(len(list(dic_vars.keys()))>0):
            #print(dic_vars)
        data_dic['prime_var_dic'] = dic_vars
    data_dic['class_list'] = class_list
    data_dic['func_list'] = func_list
    
    #print("specials   ###", sp_start_func, sp_end_func)
    
    if not (0<len(extreme_tokenization(data_dic['code_snippet']))<MAX_NUM_TOKEN): 
        up_count = 0
        up_done = False
        down_count = 0
        down_done = False
        splitted = sample.split("\n")
        while(1):
            #print(max(0, line_no-up_count),line_no+1)
            if len(extreme_tokenization("\n".join(splitted[max(0,sp_start_func, line_no-up_count):line_no+1])))<UP_TOKEN:
                if(up_count==line_no):
                    break
                up_count+=1
            else:
                break
                #print("something is wrong")
                      
        while(1):
            if len(extreme_tokenization("\n".join(splitted[(line_no-up_count):min(line_no+down_count+1,sp_end_func, len(splitted))])))<MAX_NUM_TOKEN:
                if(down_count==len(splitted)):
                    break
                down_count+=1
            else:
                break
    if sp_start_func != -1:
        #data_dic['code_snippet'] = "\n".join(splitted[sp_start_func:sp_end_func+1])
        
        if("startfocus" in splitted[sp_start_func-1]):
            sp_start_func-=1
        if sp_end_func <=len(splitted)-2 and "endfocus" in splitted[sp_end_func+1]:
            sp_end_func+=2
        if sp_end_func <=len(splitted)-1 and "endfocus" in splitted[sp_end_func]:
            sp_end_func+=1
        #print(" ".join(splitted[sp_start_func:sp_end_func+1]))
        #print(sp_start_func)
        #print(sp_end_func)
        code_snippet = "\n".join(splitted[sp_start_func:sp_end_func+1])
        print("in function")
        
    else:
        #data_dic['code_snippet'] = "\n".join(splitted[(line_no-up_count):(line_no+down_count+1)])
        print("not in function")
        if("startfocus" in splitted[(line_no-up_count-1)]):
            up_count+=1
        if line_no+down_count <=len(splitted)-1 and "endfocus" in splitted[line_no+down_count]:
            down_count+=1
        code_snippet = "\n".join(splitted[(line_no-up_count):(line_no+down_count+1)])
        #print(line_no-up_count)
        #print(line_no+down_count+1)
        #print(code_snippet)
    #print(code_snippet)
    #print(data_dic['func_list'])
    if "<|startfocus|>" in code_snippet and "<|endfocus|>" not in code_snippet:
        code_snippet =code_snippet+  "\n<|endfocus|>"
    if "<|startfocus|>" not in code_snippet and "<|endfocus|>" in code_snippet:
        code_snippet = "<|startfocus|>\n"+code_snippet
        
    data_dic['tokenized_code_snippet'] = extreme_tokenization(code_snippet)#data_dic['code_snippet'])
    data_dic['tokenized_target'] = extreme_tokenization(data_dic['target'])
    data_dic['tokenized_comment'] = extreme_tokenization(data_dic['message'])
    #print("specials   ###", sp_start_func, sp_end_func)
   
    
    #print(code_snippet)
    #print("#################")
    #print(data_dic)
    #print(source)
    #data_dic['actual_line_number'] = data['line_number']
    data_dic['base_code_line_number'] = data['line_number']
    data_dic['base_patch_number'] = data['base_patch_number']
    data_dic['changed_patch_number'] = data['changed_patch_number']
    data_dic['code_file_name'] = data['file_name']
    data_dic['comment_id'] = data['comment_id']
    data_dic['message'] = data['message']
    data_dic['line_change'] = data['line_change']
    data_dic['written_on'] = data['written_on']
    data_dic['project_name'] = data['project_name']
    if "<|startfocus|>" in code_snippet and "<|endfocus|>" in code_snippet:
        unique_data_processed.append(data_dic)
    print(idx)
    if(idx%1000==0):
        print(idx)
        print(c)
        pprint(data_dic)

'''
{'actual_line_number': 71, 'base_code_line_number': 73, 'base_patch_number': 3, 'changed_code': '    /*', 'changed_patch_number': 7, 'code_file_name': 'android_3478', 'comment_id': '3745e284_1e49cdaa', 'line_change': 2, 'message': '{@link android.icu.impl.OlsonTimeZone} ?', 'previous_code': '     /*'}
'''

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


in function
0
0
1
{'base_code_line_number': 64,
 'base_patch_number': 3,
 'changed_patch_number': 4,
 'class_list': ['SwaggerConfiguration'],
 'code_file_name': 'acumos_197',
 'code_snippet': '',
 'comment_id': 'a068aa5b_b404cd60',
 'func_list': ['public Docket api()', 'private ApiInfo apiInfo()'],
 'idx': 1,
 'line_change': 0,
 'message': 'One last occurrence - please change to acumos.org/...',
 'prime_var_dic': {},
 'project_name': 'acumos_',
 'source': 'E:\\pantho_Tufano_processing\r\n'
           '\n'
           ' * '
           ' * Acumos\n'
           ' * '
           ' * Copyright (C) 2017 AT&T Intellectual Property & Tech Mahindra. '
           'All rights reserved.\n'
           ' * '
           ' * This Acumos software file is distributed by AT&T and Tech '
           'Mahindra\n'
           ' * under the Apache License, Version 2.0 (the "License");\n'
           ' * you may not use this file except in compliance with the '
           'License.\n'
           ' * You may obtai

                            '<|s|>',
                            '//',
                            '<|nl|>',
                            '<|4-t|>',
                            '<|2-t|>',
                            '+',
                            '<|s|>',
                            '"',
                            '<|s|>',
                            'for',
                            '<|s|>',
                            'data',
                            '<|s|>',
                            'about',
                            '<|s|>',
                            'solutions',
                            ',',
                            '<|s|>',
                            'artifacts',
                            '<|s|>',
                            'and',
                            '<|s|>',
                            'users',
                            '.',
                            '"',
                            '//',
                            '<|nl|>',
                  

                      'net',
                      '/',
                      'wiki',
                      '/',
                      'spaces',
                      '/',
                      'CW',
                      '/',
                      'pages',
                      '/',
                      '233610',
                      '/',
                      'Data',
                      '+',
                      'Model',
                      '+',
                      'for',
                      '+',
                      'Machine',
                      '+',
                      'Learning',
                      '+',
                      'Platform',
                      '"',
                      ',',
                      '<|nl|>',
                      '<|4-t|>',
                      '<|2-t|>',
                      '"',
                      'cds',
                      '.',
                      'someday',
                      '@',
                      'acumos',
   

not in function
333
not in function
334
not in function
335
in function
336
in function
337
not in function
338
not in function
339
not in function
340
not in function
341
not in function
342
not in function
343
not in function
344
not in function
345
not in function
346
not in function
347
not in function
348
not in function
349
not in function
350
not in function
351
not in function
352
not in function
353
in function
354
not in function
355
in function
356
in function
357
not in function
358
in function
359
not in function
360
not in function
361
not in function
362
not in function
363
in function
364
falied to store unique_data=  366 error count 13
not in function
365
falied to store unique_data=  367 error count 14
not in function
366
falied to store unique_data=  368 error count 15
not in function
367
falied to store unique_data=  369 error count 16
not in function
368
falied to store unique_data=  370 error count 17
not in function
369
not in function
370
not in function
371
in 

falied to store unique_data=  706 error count 35
not in function
705
falied to store unique_data=  707 error count 36
not in function
706
falied to store unique_data=  708 error count 37
not in function
707
falied to store unique_data=  709 error count 38
not in function
708
not in function
709
not in function
710
falied to store unique_data=  712 error count 39
not in function
711
falied to store unique_data=  713 error count 40
not in function
712
falied to store unique_data=  714 error count 41
not in function
713
falied to store unique_data=  715 error count 42
not in function
714
falied to store unique_data=  716 error count 43
not in function
715
falied to store unique_data=  717 error count 44
not in function
716
falied to store unique_data=  718 error count 45
not in function
717
falied to store unique_data=  719 error count 46
not in function
718
not in function
719
not in function
720
not in function
721
in function
722
not in function
723
in function
724
in function
725
not 

           'import org.acumos.portal.be.util.EELFLoggerDelegate;\n'
           'import org.springframework.beans.factory.annotation.Autowired;\n'
           'import org.springframework.stereotype.Controller;\n'
           'import org.springframework.web.bind.annotation.PathVariable;\n'
           'import org.springframework.web.bind.annotation.RequestBody;\n'
           'import org.springframework.web.bind.annotation.RequestMapping;\n'
           'import org.springframework.web.bind.annotation.RequestMethod;\n'
           'import org.springframework.web.bind.annotation.ResponseBody;\n'
           '\n'
           'import io.swagger.annotations.ApiOperation;\n'
           '\n'
           '@Controller\n'
           '@RequestMapping(APINames.SITE_PATH)\n'
           'public class SiteContentServiceController extends '
           'AbstractController {\n'
           '\t\n'
           '\t@Autowired\n'
           '\tSiteContentService siteContentService;\n'
           '\t\n'
           '\tpriv

           '            \tdata.setResponseBody(content);\n'
           '                '
           'data.setErrorCode(JSONTags.TAG_ERROR_CODE_SUCCESS);\n'
           '                data.setResponseDetail("Carousel picture fetched '
           'successfully");\n'
           '            }\n'
           '        } catch (Exception e) {\n'
           '        \tdata.setErrorCode(JSONTags.TAG_ERROR_CODE);\n'
           '        \tdata.setResponseDetail("Exception Occurred Fetching '
           'Carousel Picture");\n'
           '            log.error(EELFLoggerDelegate.errorLogger, "Exception '
           'Occurred Fetching Carousel Picture", e);\n'
           '        }\n'
           '        return data;\n'
           '    }\n'
           '\n'
           '    @ApiOperation(value = "Updates carousel picture ", response = '
           'JsonResponse.class)\n'
           '    @RequestMapping(value = { APINames.UPDATE_CAROUSEL_PICTURE }, '
           'method = RequestMethod.POST, produces

                            'acumos',
                            '.',
                            'portal',
                            '.',
                            'be',
                            '.',
                            'common',
                            '.',
                            'Json',
                            'Response',
                            ';',
                            '<|nl|>',
                            'import',
                            '<|s|>',
                            'org',
                            '.',
                            'acumos',
                            '.',
                            'portal',
                            '.',
                            'be',
                            '.',
                            'service',
                            '.',
                            'Site',
                            'Content',
                            'Service',
                            ';',
  

                            'log',
                            '<|s|>',
                            '=',
                            '<|s|>',
                            'EELFLogger',
                            'Delegate',
                            '.',
                            'get',
                            'Logger',
                            '(',
                            'Site',
                            'Content',
                            'Service',
                            'Controller',
                            '.',
                            'class',
                            ');',
                            '<|nl|>',
                            '<|nl|>',
                            '<|startfocus|>',
                            '<|nl|>',
                            '<|4-s|>',
                            'public',
                            '<|s|>',
                            'Site',
                            'Content',
                            

                            'content',
                            ');',
                            '<|nl|>',
                            '<|16-s|>',
                            'data',
                            '.',
                            'set',
                            'Error',
                            'Code',
                            '(',
                            'JSONTags',
                            '.',
                            'TAG',
                            '_',
                            'ERROR',
                            '_',
                            'CODE',
                            '_',
                            'SUCCESS',
                            ');',
                            '<|nl|>',
                            '<|16-s|>',
                            'data',
                            '.',
                            'set',
                            'Response',
                            'Detail',
                          

                            ')',
                            '<|s|>',
                            '{',
                            '<|nl|>',
                            '<|8-s|>',
                            'log',
                            '.',
                            'debug',
                            '(',
                            'EELFLogger',
                            'Delegate',
                            '.',
                            'debug',
                            'Logger',
                            ',',
                            '<|s|>',
                            '"',
                            'get',
                            'Contact',
                            'Info',
                            '"',
                            ');',
                            '<|nl|>',
                            '<|8-s|>',
                            'MLPSite',
                            'Content',
                            '<|s|>',
                    

                      '.',
                      'set',
                      'Response',
                      'Body',
                      '(',
                      'content',
                      ');',
                      '<|nl|>',
                      '<|4-t|>',
                      'data',
                      '.',
                      'set',
                      'Error',
                      'Code',
                      '(',
                      'JSONTags',
                      '.',
                      'TAG',
                      '_',
                      'ERROR',
                      '_',
                      'CODE',
                      '_',
                      'SUCCESS',
                      ');',
                      '<|nl|>',
                      '<|4-t|>',
                      'data',
                      '.',
                      'set',
                      'Response',
                      'Detail',
                      '(',
                

In [None]:
with open('unique_data_processed_with_Date.json', 'w', encoding="utf8") as f:  # writing JSON object
    json.dump(unique_data_processed, f)

In [9]:
print(unique_data[0])

{'base_patch_number': 3, 'changed_patch_number': 4, 'comment_id': 'a068aa5b_b404cd60', 'file_name': 'acumos_197', 'line_change': 0, 'line_number': 64, 'message': 'One last occurrence - please change to acumos.org/...', 'project_name': 'acumos_', 'written_on': '2017-11-21 20:29:13'}
