# unified_with_date

In [1]:
from nltk.tokenize import MWETokenizer, TweetTokenizer, WordPunctTokenizer 
from nltk.tokenize import word_tokenize
import re

# Operator and punctuation vocabularies (the processed sources are .java files).
# '\\*' is written with an escaped backslash: the previous spelling '\*' is an
# invalid escape sequence that newer Python versions warn about. The runtime
# value (backslash followed by '*') is unchanged.
VALUES = [
    '>>>=', '>>=', '<<=', '%=', '^=', '|=', '&=', '/=',
    '*=', '-=', '+=', '<<', '--', '++', '||', '&&', '!=',
    '>=', '<=', '==', '%', '^', '|', '&', '/', '*', '-',
    '+', ':', '?', '~', '!', '<', '>', '=', '...', '->', '::',
    '\\', '\\\\', '\\*', '*\\', '\\\\\\',
]
INFIX = [
    '||', '&&', '|', '^', '&', '==', '!=', '<', '>', '<=', '>=',
    '<<', '>>', '>>>', '+', '-', '*', '/', '%',
]
PREFIX = ['++', '--', '!', '~', '+', '-']
POSTFIX = ['++', '--']
ASSIGNMENT = ['=', '+=', '-=', '*=', '/=', '&=', '|=', '^=', '%=', '<<=', '>>=', '>>>=']
LAMBDA = ['->']
COMMENT = ['//', '/*', '*/']
METHOD_REFERENCE = ['::']

# Literal whitespace run -> placeholder token. Insertion order matters:
# white_space_tokenize applies the replacements in this order, so longer runs
# must come before the shorter runs they contain.
WHITESPACE_DICT = dict([
    (" " * 16, '<|16-s|>'),
    (" " * 12, '<|12-s|>'),
    (" " * 8, '<|8-s|>'),
    (" " * 4, '<|4-s|>'),
    (" " * 2, '<|2-s|>'),
    (" ", '<|s|>'),
    ("\t" * 4, '<|4-t|>'),
    ("\t" * 3, '<|3-t|>'),
    ("\t" * 2, '<|2-t|>'),
    ("\t", '<|t|>'),
    ("\n", '<|nl|>'),
])

# Placeholder tokens only, in the same order as above.
WHITESPACES = list(WHITESPACE_DICT.values())

# Placeholder token -> literal whitespace run (used when detokenizing).
REVERSE_WHITESPACE_DICT = {
    placeholder: literal for literal, placeholder in WHITESPACE_DICT.items()
}

# Marker tokens that must survive tokenization as single units.
CUSTOM_TOKEN = [
    "<|startcode|>", "<|endcode|>",
    "<|startfocus|>", "<|endfocus|>",
    "<|startcomment|>", "<|endcomment|>",
    "<|stringliteral|>", "<|singlelinecomment|>", "<|multilinecomment|>",
    "<|del|>",
]

# Distinct operator strings across all operator families.
values = list({*INFIX, *PREFIX, *POSTFIX, *ASSIGNMENT, *LAMBDA, *COMMENT, *METHOD_REFERENCE})

word_punct_tokenizer = WordPunctTokenizer()
tweet_tokenizer = TweetTokenizer()

# Every marker / whitespace placeholder is registered as a phrase made of the
# pieces WordPunctTokenizer splits it into, so it can be glued back together.
token_phrases = [
    tuple(word_punct_tokenizer.tokenize(tok)) for tok in CUSTOM_TOKEN + WHITESPACES
]

# Operators: an MWETokenizer without phrases is run over the operator string
# (presumably yielding its individual characters - confirm against nltk);
# only results with more than one piece need a phrase entry.
for w in values:
    pieces = tuple(MWETokenizer().tokenize(w))
    if len(pieces) > 1:
        token_phrases.append(pieces)

# separator="" makes the merged phrase read exactly like the original token.
mwe_tokenizer = MWETokenizer(token_phrases, separator="")

def state(c):
    """Classify a single character into a coarse category code.

    Returns:
        1 for ASCII lower-case letters, 2 for ASCII upper-case letters,
        3 for ASCII digits, 4 for whitespace, 5 for ``_`` or ``$``,
        6 for any other ASCII character and 7 for everything else.
    """
    code = ord(c)
    if 97 <= code <= 122:  # a-z
        return 1
    if 65 <= code <= 90:  # A-Z
        return 2
    if 48 <= code <= 57:  # 0-9
        return 3
    if c.isspace():
        return 4
    if c == '_' or c == '$':
        return 5
    return 6 if code < 128 else 7

def space_up(s):
    """Insert a space at identifier-style boundaries inside ``s``.

    A space is added between two adjacent characters when, in terms of the
    categories returned by ``state``, the pair is one of:
    letter -> digit, lower -> upper, digit -> letter,
    letter/digit -> ``_``/``$`` or ``_``/``$`` -> letter/digit.
    ``None`` and the empty string both yield ``""``.
    """
    if s is None or s == "":
        return ""

    # (previous category, current category) pairs that trigger a split.
    split_pairs = {
        (1, 3), (2, 3),          # letter followed by digit
        (1, 2),                  # lower followed by upper
        (3, 1), (3, 2),          # digit followed by letter
        (1, 5), (2, 5), (3, 5),  # letter/digit followed by '_' or '$'
        (5, 1), (5, 2), (5, 3),  # '_' or '$' followed by letter/digit
    }

    pieces = [s[0]]
    previous = state(s[0])
    for ch in s[1:]:
        current = state(ch)
        if (previous, current) in split_pairs:
            pieces.append(" ")
        pieces.append(ch)
        previous = current
    return "".join(pieces)

def white_space_tokenize(s):
    """Replace whitespace runs in ``s`` with space-padded placeholder tokens.

    First every literal whitespace run listed in ``WHITESPACE_DICT`` is swapped
    for its placeholder (in dictionary order), then each placeholder is padded
    with a single space on both sides so later tokenizers see it as a word.
    """
    for literal, placeholder in WHITESPACE_DICT.items():
        s = s.replace(literal, placeholder)
    for placeholder in REVERSE_WHITESPACE_DICT:
        s = s.replace(placeholder, f" {placeholder} ")
    return s

def extreme_tokenization(comment):
    """Tokenize ``comment`` (code or prose) into a flat list of string tokens.

    Pipeline: whitespace placeholders -> identifier splitting (``space_up``)
    -> TweetTokenizer -> WordPunctTokenizer -> MWETokenizer (re-joins the
    registered phrases) -> drop every non-ASCII character -> split on
    whitespace.
    """
    prepared = space_up(white_space_tokenize(comment))
    coarse_tokens = tweet_tokenizer.tokenize(prepared)
    fine_tokens = word_punct_tokenizer.tokenize(' '.join(coarse_tokens))
    merged_text = ' '.join(mwe_tokenizer.tokenize(fine_tokens))
    ascii_text = re.sub(r'[^\x00-\x7f]', r'', merged_text)
    return ascii_text.split()

def extreme_detokenization(tokens):
    """Concatenate ``tokens`` into one string, expanding whitespace placeholders.

    Tokens found in ``REVERSE_WHITESPACE_DICT`` are replaced by the literal
    whitespace they stand for; all other tokens are appended unchanged, with
    no separator in between.
    """
    return "".join(REVERSE_WHITESPACE_DICT.get(token, token) for token in tokens)

In [2]:
from os import path
import json
import re
import subprocess
import nltk
from pprint import pprint
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm
nltk.download('punkt')

def getJsonData(JsonFile):
    """Return the parsed content of the JSON file at ``JsonFile`` (opened with utf8 encoding)."""
    with open(JsonFile, encoding="utf8") as handle:
        return json.load(handle)
def read(path):
    """Return the whole text of the file at ``path``, decoded as UTF-8.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read, instead of being left open until garbage
    collection.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


# Root folder of the source snapshots. Paths are built below as
# BASE + file_name + '/' + patch_number + '.java', so the trailing slash matters.
BASE="C:/research_stuff/codes/"
# Raw records to process (presumably a list of dicts - confirm against the JSON file).
unique_data = getJsonData('E:/APR/DATA/unique_data_with_date_and_index.json')
# Load the base-patch source of one arbitrary record; this looks like an ad-hoc
# smoke test, process() recomputes all of these values for each record.
data = unique_data[1020]
code_file_name = data['file_name']
base_patch_number = data['base_patch_number']
main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
main_code = read(main_code_path)


def get_source_target(file1, file2, line_number, change_window_size=5):
    """Run ``ChangedLine_status.jar`` on two files and split its output.

    The jar is invoked as
    ``java -jar ChangedLine_status.jar file1 file2 line_number change_window_size``
    and its UTF-8 decoded standard output is split on ``<|sep|>``.

    Returns:
        A ``(status, source, target)`` tuple of strings.

    Raises:
        subprocess.CalledProcessError: if the java process exits non-zero.
        ValueError: if the output does not consist of exactly three parts.
    """
    command = [
        'java', '-jar', 'ChangedLine_status.jar',
        file1, file2, str(line_number), str(change_window_size),
    ]
    raw_output = subprocess.check_output(command).decode("utf-8")
    status, source, target = raw_output.split("<|sep|>")
    return status, source, target

def end_scope(start_index, lines):
    """Locate the line on which a brace-delimited scope closes.

    Braces are counted character by character from ``lines[start_index]``
    onwards; the scope is considered closed on the first line where at least
    one brace has been seen and the running ``{``/``}`` balance is zero.
    Braces inside strings or comments are counted like any other.

    Args:
        start_index: index in ``lines`` at which scanning starts.
        lines: sequence of source lines.

    Returns:
        The index of the closing line, or -1 when the balance never returns
        to zero.
    """
    depth = 0
    seen_brace = False
    for line_idx in range(start_index, len(lines)):
        closed_here = False
        for ch in lines[line_idx]:
            if ch == '{':
                depth += 1
                seen_brace = True
            elif ch == '}':
                depth -= 1
                seen_brace = True
            if seen_brace and depth == 0:
                closed_here = True
                break
        # -1 doubles as the "not found" value, so a hit on index -1 (only
        # reachable when start_index is -1) does not stop the scan.
        if closed_here and line_idx != -1:
            return line_idx
    return -1

# Accumulator: process() appends every finished sample to this list.
unique_data_processed = []
# Upper bound process() uses for the tokenized length of a whole snippet window.
MAX_NUM_TOKEN = 400
# Bound process() uses for the part of the window from its top down to the
# commented line (half of the total budget).
UP_TOKEN = MAX_NUM_TOKEN//2
# NOTE(review): c, ec and errors are not referenced by any live code in the
# visible part of the notebook - presumably leftovers, confirm before removing.
c = 0
ec = 0
errors = []
#for idx, data in enumerate(unique_data):#[2428:2429]#[1000:1001]

def process(data):
    """Turn one raw review-comment record into a processed sample.

    Steps, in order:

    1. Read the base and changed ``.java`` files named by the record and ask
       ``get_source_target`` for the change status, the annotated source and
       the target text. If that call fails the record is skipped silently.
    2. Scan the annotated source with regular expressions (best effort) for
       class names, method headers and class-level variable declarations.
    3. Cut a window of lines around ``line_number``: it first grows upwards
       while the tokenized text stays below ``UP_TOKEN`` tokens, then
       downwards while the whole window stays below ``MAX_NUM_TOKEN`` tokens.
       When the line lies inside a detected method, the window is clipped to
       that method's line range.
    4. Make sure the focus markers are paired, tokenize the snippet, the
       target and the review message, and copy the record's metadata.
    5. Append the sample to the module-level ``unique_data_processed`` list
       if the snippet contains both focus markers.

    Args:
        data: dict with the keys ``file_name``, ``base_patch_number``,
            ``changed_patch_number``, ``line_number``, ``message``,
            ``comment_id``, ``global_index``, ``line_change``, ``written_on``
            and ``project_name``.

    Returns:
        None. The result is only visible through ``unique_data_processed``
        and the progress messages printed along the way.
    """
    code_file_name = data['file_name']
    base_patch_number = data['base_patch_number']
    changed_patch_number = data['changed_patch_number']
    main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
    changed_code_path = BASE+code_file_name+'/'+str(changed_patch_number)+'.java'
    line_no = data['line_number']
    main_code = read(main_code_path)
    # The content is not used, but the read is kept so that a missing or
    # unreadable changed file still raises here exactly as before.
    read(changed_code_path)
    message = data['message']
    comment_id = data['comment_id']
    try:
        status, source, target = get_source_target(main_code_path, changed_code_path, line_no)
    except Exception:
        # Best effort: records the jar cannot handle are skipped. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return

    # From here on everything works on the annotated source returned by the jar.
    sample = source
    data_dic = {}
    data_dic['status'] = status
    data_dic['message'] = message
    data_dic['comment_id'] = comment_id
    # Drops the last two characters of the target (presumably a trailing
    # separator emitted by the jar - TODO confirm).
    data_dic['target'] = target[:-2]
    func_list = []
    # Placeholder that fixes the key order; replaced by the window further down.
    data_dic['code_snippet'] = main_code
    data_dic['prime_var_dic'] = {}

    classes = re.findall(r"(?:(public\s))?(class)\s([^\n\s]*)", sample)
    lines = sample.split('\n')
    class_list = []
    # Line range of the method containing line_no; defaults mean "no method found".
    sp_start_func = -1
    sp_end_func = len(lines)
    for cls_tpl in classes:
        cls = cls_tpl[-1].strip()
        class_list.append(cls)

        # First line mentioning the class name is taken as the start of the class.
        start_index = -1
        for i in range(len(lines)):
            if cls in lines[i]:
                start_index = i
                break
        end_index = end_scope(start_index, lines)

        # True = line belongs to the class body but not to any method.
        class_scope_dic = {}
        for i in range(start_index, end_index+1):
            class_scope_dic[i] = True

        funcs = re.findall(r"(public|protected|private|static|\s) +([\w\<\>\[\]]+\s+(\w+)) *\([^\)]*\) *(\{?|[^;])", "\n".join(lines[start_index:end_index+1]))
        func_scopes = []
        visited_func = set()
        for func in funcs:
            # Only headers directly followed by '{' are treated as definitions.
            if func[-1] != "{":
                continue
            # Second capture group: "<return type> <name>".
            s = func[-3].strip()

            # Every line containing that text is treated as a method start.
            for index in range(start_index, end_index+1, 1):
                if s in lines[index]:
                    start_func = index
                    end_func = end_scope(index, lines)
                    func_scopes.append((start_func, end_func))
                    if s not in visited_func:
                        visited_func.add(s)
                        func_list.append(lines[index].replace("{", "").strip())
                    if start_func <= line_no <= end_func:
                        sp_start_func = start_func
                        sp_end_func = end_func

        for func_sc in func_scopes:
            for i in range(func_sc[0], func_sc[1]+1):
                class_scope_dic[i] = False

        # Count names declared at class level: lines outside methods without
        # '(', "return" or "extends" that yield exactly two "<word> <name>"
        # matches; the name of the second match is counted.
        dic_vars = {}
        for i in range(start_index, end_index+1):
            if class_scope_dic[i]:
                if '(' not in lines[i] and "return" not in lines[i] and "extends" not in lines[i]:
                    prime_vars = re.findall(r"""(\w+\s+)([a-zA-Z_][a-zA-Z0-9_]*)""", lines[i])
                    if len(prime_vars) == 2:
                        var_name = prime_vars[1][1]
                        dic_vars[var_name] = dic_vars.get(var_name, 0) + 1
        # Overwritten per class: the last class found wins.
        data_dic['prime_var_dic'] = dic_vars
    data_dic['class_list'] = class_list
    data_dic['func_list'] = func_list

    splitted = lines

    # Grow the window upwards, one line at a time, while the part ending at
    # line_no stays below UP_TOKEN tokens.
    up_count = 0
    while True:
        window_start = max(0, sp_start_func, line_no-up_count)
        if len(extreme_tokenization("\n".join(splitted[window_start:line_no+1]))) < UP_TOKEN:
            if up_count == line_no:
                break
            up_count += 1
        else:
            # The last step overshot the budget: undo it.
            up_count -= 1
            break
    window_start = max(0, sp_start_func, line_no-up_count)

    # Grow the window downwards while the whole window stays below MAX_NUM_TOKEN.
    # NOTE(review): sp_end_func is used as an exclusive slice bound, so the
    # line at index sp_end_func itself is not part of the window - confirm
    # that this is intended.
    down_count = 0
    while True:
        window_stop = min(line_no+down_count+1, sp_end_func, len(splitted))
        if len(extreme_tokenization("\n".join(splitted[window_start:window_stop]))) < MAX_NUM_TOKEN:
            if down_count == len(splitted):
                break
            down_count += 1
        else:
            down_count -= 1
            break
    window_stop = min(line_no+down_count+1, sp_end_func, len(splitted))
    code_snippet = "\n".join(splitted[window_start:window_stop])

    # Re-pair a focus marker that was cut off by the window.
    if "<|startfocus|>" in code_snippet and "<|endfocus|>" not in code_snippet:
        code_snippet = code_snippet + "\n<|endfocus|>"
    if "<|startfocus|>" not in code_snippet and "<|endfocus|>" in code_snippet:
        code_snippet = "<|startfocus|>\n" + code_snippet

    data_dic['tokenized_code_snippet'] = extreme_tokenization(code_snippet)
    data_dic['tokenized_target'] = extreme_tokenization(data_dic['target'])
    data_dic['tokenized_comment'] = extreme_tokenization(data_dic['message'])
    data_dic['code_snippet'] = code_snippet

    # Metadata copied verbatim from the raw record.
    data_dic['global_index'] = data['global_index']
    data_dic['base_code_line_number'] = data['line_number']
    data_dic['base_patch_number'] = data['base_patch_number']
    data_dic['changed_patch_number'] = data['changed_patch_number']
    data_dic['code_file_name'] = data['file_name']
    data_dic['line_change'] = data['line_change']
    data_dic['written_on'] = data['written_on']
    data_dic['project_name'] = data['project_name']

    # Only samples whose snippet carries both focus markers are kept.
    if "<|startfocus|>" in code_snippet and "<|endfocus|>" in code_snippet:
        unique_data_processed.append(data_dic)
        if len(data_dic['tokenized_code_snippet']) > 401:
            print("length greater than 401 found global idx", data['global_index'])
        if len(unique_data_processed) % 100 == 0:
            print("data size = ", len(unique_data_processed))
'''
{'actual_line_number': 71, 'base_code_line_number': 73, 'base_patch_number': 3, 'changed_code': '    /*', 'changed_patch_number': 7, 'code_file_name': 'android_3478', 'comment_id': '3745e284_1e49cdaa', 'line_change': 2, 'message': '{@link android.icu.impl.OlsonTimeZone} ?', 'previous_code': '     /*'}
'''

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


"\n{'actual_line_number': 71, 'base_code_line_number': 73, 'base_patch_number': 3, 'changed_code': '    /*', 'changed_patch_number': 7, 'code_file_name': 'android_3478', 'comment_id': '3745e284_1e49cdaa', 'line_change': 2, 'message': '{@link android.icu.impl.OlsonTimeZone} ?', 'previous_code': '     /*'}\n"

In [None]:
dat = process(unique_data[25019])

In [None]:
# Print the snippet length of every record whose tokenized snippet has 400 or
# more tokens.
# NOTE(review): this cell looks stale. It expects 'tokenized_code_snippet' on
# the items of unique_data, refers to a name `unique` that is not defined in
# the visible notebook, and indexes the result of process(), which has no
# return statement that yields a dict. Confirm the intent before running it.
for idx , dt in enumerate(unique_data):
    if(len(dt['tokenized_code_snippet']))>=400:
        #print(unique_data['global_index'])
        print(len(process(unique[unique_data[idx]['global_index']])['code_snippet']))
#process(unique_data[545])['code_snippet']

In [3]:
import sys,json
import csv
import _thread
import threading
import time
from time import sleep
import pandas as pd

import json
def getJsonData(JsonFile):
    """Return the parsed content of the JSON file at ``JsonFile`` (opened with utf8 encoding)."""
    with open(JsonFile, encoding="utf8") as handle:
        return json.load(handle)
def read(path):
    """Return the whole text of the file at ``path``, decoded as UTF-8.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read, instead of being left open until garbage
    collection.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


#BASE="E:/codes/"

file_size = len(unique_data)
batch_size = 1000

def translate(file_no):
    """Run ``process`` on batch number ``file_no`` of ``unique_data``.

    A batch covers the indices ``file_no*batch_size`` up to (but excluding)
    ``(file_no+1)*batch_size``, clipped to ``file_size``.
    """
    first = file_no * batch_size
    stop = min(file_size, first + batch_size)
    for index in range(first, stop):
        process(unique_data[index])

### Multithread Run

In [4]:
# Start one worker thread per batch of `batch_size` records, then wait for all
# of them. Each thread runs translate(i), which calls process() per record.
thrds = []
for i in range(0,(file_size//batch_size)+1):
    print("processing file = ", str(i))
    td = threading.Thread(target=translate, args=(i,))
    try:
        td.start()
    except RuntimeError:
        # Thread.start() raises RuntimeError when the thread cannot be
        # started; only that case is reported, anything else (including
        # KeyboardInterrupt) propagates instead of being swallowed.
        print("Error: unable to start thread")
    else:
        thrds.append(td)

for td in thrds:
    td.join()

processing file =  0
processing file =  1
processing file =  2
processing file =  3
processing file =  4
processing file =  5
processing file =  6
processing file =  7
processing file =  8
processing file =  9
processing file =  10
processing file =  11
processing file =  12
processing file =  13
processing file =  14
processing file =  15
processing file =  16
processing file =  17
processing file =  18
processing file =  19
processing file =  20
processing file =  21
processing file =  22
processing file =  23
processing file =  24
processing file =  25
processing file =  26
processing file =  27
processing file =  28
processing file =  29
processing file =  30
data size =  processing file = 100 31

processing file =  32
processing file =  33
processing file =  34
processing file =  35
processing file =  data size =  36
200
processing file =  37
processing file =  38
processing file =  39
processing file =  40
processing file =  41
processing file =  42
processing file =  43data size

data size =  75200
data size =  75300
data size =  75400
data size =  75500
data size =  75600
data size =  75700
data size =  75800
data size =  75900
data size =  76000
data size =  76100
data size =  76200
data size =  76300
data size =  76400
data size =  76500
data size =  76600
data size =  76700
data size =  76800
data size =  76900
data size =  77000
data size =  77100
data size =  77200
data size =  77300
data size =  77400
data size =  77500
data size =  77600
data size =  77700
data size =  77800
data size =  77900
data size =  78000
data size =  78100
data size =  78200
data size =  78300
data size =  78400
data size =  78500
data size =  78600
data size =  78700
data size =  78800
data size =  78900
data size =  79000
data size =  79100
data size =  79200
data size =  79300
data size =  79400
data size =  79500
data size =  79600
data size =  79700
data size =  79800
data size =  79900
data size =  80000
data size =  80100
data size =  80200
data size =  80300
data size = 

In [5]:
with open('unique_data_processed_with_Date_multithread_idx_400_feb_06_v2.json', 'w', encoding="utf8") as f:  # writing JSON object
    json.dump(unique_data_processed, f)
#12:20pm

In [6]:
len(unique_data_processed)
#unique_data_processed[-1]

95352

In [None]:
unique_data_processed[0].keys()

## Single Thread Run

In [None]:
unique_data_processed = []

In [None]:
for i in range(100):
    process(unique_data[i])

In [None]:
len(unique_data_processed)

## Testing Change Trigger Calculation

In [None]:
import json
def getJsonData(JsonFile):
    """Return the parsed content of the JSON file at ``JsonFile`` (opened with utf8 encoding)."""
    with open(JsonFile, encoding="utf8") as handle:
        return json.load(handle)
def read(path):
    """Return the whole text of the file at ``path``, decoded as UTF-8.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read, instead of being left open until garbage
    collection.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


# The trailing slash is required: the cells below build file paths as
# BASE + code_file_name + '/' + patch_number + '.java', so without it the
# folder name and the file name would run together.
BASE="C:/research_stuff/codes/"
unique_data = getJsonData('E:/APR/DATA/unique_data_processed_with_Date_multithread_idx_400_feb_03.json')
print(unique_data[0])

In [None]:
data = getJsonData('unique_data_processed_with_Date_multithread.json')

In [None]:
print(data[0]['source'])
print("===================")
print(data[0]['target'])


In [None]:
unique_data[0]

In [None]:
i = 0

In [None]:
# Re-run the change extraction by hand for record `i`, mirroring the first
# part of process().
# NOTE(review): the keys used here ('file_name', 'line_number') are those of
# the raw records; samples written by process() store them as
# 'code_file_name' and 'base_code_line_number'. Confirm which file
# `unique_data` was loaded from before running this cell.
code_file_name = unique_data[i]['file_name']
base_patch_number = unique_data[i]['base_patch_number']
changed_patch_number = unique_data[i]['changed_patch_number']
main_code_path = BASE+code_file_name+'/'+str(base_patch_number)+'.java'
changed_code_path = BASE+code_file_name+'/'+str(changed_patch_number)+'.java'
line_no = unique_data[i]['line_number']
main_code = read(main_code_path)
changed_code = read(changed_code_path)
sample = main_code
message = unique_data[i]['message']
comment_id = unique_data[i]['comment_id']
source = ""
target=''
# get_source_target returns (status, source, target); unpacking it into two
# names raised "too many values to unpack".
# NOTE(review): process() passes line_no unchanged, here it is line_no-1 -
# presumably an experiment, confirm which one is intended.
status, source, target = get_source_target(main_code_path, changed_code_path,line_no-1)

In [None]:
print(message)
print("######################")
print(target)
print("######################")
print(source)

In [None]:
i = 85

In [None]:
print(unique_data_processed[i]['message'])
print("##################################")
print(unique_data_processed[i]['target'])
print("##################################")
print(unique_data_processed[i]['source'])

In [None]:
code = unique_data_processed[30]
print(code)

In [None]:
main_code_path

In [None]:
line_no

In [None]:
unique_data_processed[0]['source']

In [None]:
unique_data_processed[0]['target'][:-2]

## Random Sample

In [None]:
import random

# randrange excludes its upper bound, so `i` is always a valid index.
# random.randint(0, len(unique_data)) includes the upper bound and could
# return len(unique_data), which is one past the last record.
i = random.randrange(len(unique_data))

unique_data_processed = []
process(unique_data[i])

print("Data Number: ", i)
if unique_data_processed:
    print("##################################")
    print(unique_data_processed[0]['status'])
    print("##################################")
    print(unique_data_processed[0]['message'])
    print("##################################")
    print(unique_data_processed[0]['target'])
    print("##################################")
    print(unique_data_processed[0]['code_snippet'])
else:
    # process() appends nothing when the extraction fails or the snippet
    # lacks the focus markers; report that instead of failing on [0].
    print("process() produced no sample for this record")

In [None]:
i

In [None]:
print(unique_data_processed[0]['func_list'])

In [None]:
unique_data_processed[0]

In [None]:
unique_data2 = unique_data.copy()

In [None]:
# Stamp every record with its position in the list.
i = 0
for i, record in enumerate(unique_data2):
    record['global_index'] = i

In [None]:
unique_data2[100000]

In [None]:
with open('unique_data_with_date_and_index.json', 'w', encoding="utf8") as f:  # writing JSON object
    json.dump(unique_data2, f)

## Create Training Data, Removing Test Data

In [None]:
tufano_test_50 = [13257, 10443, 21152, 8344, 6123, 6122, 21086, 9006, 14137, 2523, 20560, 19915, 11901, 21693, 11107, 2862, 17879, 12346, 14420, 3304, 20658, 13924, 21678, 1945, 19047, 4809, 16525, 21206, 14443, 21481, 5892, 13829, 13432, 5099, 4386, 11997, 19914, 12053, 2593, 10330, 15305, 11388, 3721, 20969, 5081, 1698, 10571, 18025, 15679, 17847, 11928, 11603, 4889, 2275, 2486, 19685, 5751, 8624, 7797, 17666, 7386, 3637, 5322, 15897, 19141, 18152, 9304, 10875, 11265, 10798, 4037, 20348, 1523, 11467, 8681, 3425, 9065, 3218, 10253, 13353, 2653, 14120, 16091, 2254, 7186, 6130, 8591, 9508, 5079, 2852, 13805, 11996, 20144, 13898, 8802, 14704, 18353, 5013, 10893, 21737, 2146, 14135, 8558, 12660, 11119, 14087, 3666, 10010, 21986, 11362, 13777, 14591, 9078, 16886, 11084, 8497, 21973, 3012, 6120, 10343, 11429, 12975, 16540, 6691, 15168, 17916, 12147, 4926, 11437, 13825, 19745, 21110, 2824, 11167, 3949, 10816, 10579, 15102, 13452, 9525, 3361, 5752, 11120, 8512, 17598, 13134, 3389, 17098, 12098, 20887, 12875, 11401, 10811, 19452, 11042, 20528, 18278, 8801, 7395, 7634, 13026, 14750, 12598, 2134, 15328, 19022, 17539, 18903, 20678, 17284, 2698, 19401, 8613, 21682, 13847, 15378, 8760, 19787, 12348, 6266, 5039, 9425, 12811, 16796, 12824, 4066, 8307, 2391, 4808, 17597, 20049, 14315, 13013, 15698, 2851, 2778, 17500, 13803, 6042, 20000, 3333, 18905, 2113, 17724, 8840, 20665, 21543, 10030, 14929, 10877, 14961, 3141, 17846, 13344, 10153, 5940, 12649, 14890, 14759, 21421, 10709, 3897, 3007, 20834, 21107, 7277, 11720, 11016, 3132, 10841, 17746, 20477, 4076, 16242, 21175, 10139, 16452, 10316, 15003, 9490, 20271, 20888, 7093, 19349, 9007, 6101, 11181, 14823, 12253, 1996, 18145, 14751, 9972, 9596, 12088, 19747, 11083, 9567, 12603, 11750, 5546, 4447, 4172, 10231, 1650, 2514, 6627, 18772, 16202, 5078, 13562, 19487, 14082, 11099, 20145, 8017, 2859, 20371, 5300, 4534, 10822, 10089, 7806, 17574, 17865, 7658, 20993, 12764, 18782, 19060, 7917, 17068, 19092, 16548, 14880, 19254, 2046, 5105, 
20178, 6343, 12439, 18995, 21093, 7637, 18155, 11978, 17798, 12613, 20820, 21919, 8003, 21297, 6125, 14429, 20179, 15169, 6131, 12874, 18760, 1402, 5474, 20450, 2408, 10968, 4168, 13885, 11620, 10380, 1518, 14043, 2093, 12839, 17671, 1764, 4088, 18904, 2433, 17884, 9189, 8978, 8953, 18781, 1467, 14927, 13354, 16361, 11101, 10873, 18126, 15104, 16655, 9019, 20718, 12065, 14791, 20213, 15955, 17754, 18763, 21618, 21132, 7185, 5684, 4455, 14855, 5415, 7214, 14058, 4021, 18969, 4820, 15936, 8065, 3724, 11624, 16604, 20508, 7767, 15567, 14923, 16698, 6243, 9487, 12865, 2864, 11176, 13093, 2860, 5379, 2879, 12283, 19458, 17392, 7718, 4585, 21436, 17247, 5705, 4735, 2386]
tufano_test_100 = [5465, 15764, 18264, 11017, 17135, 7848, 21592, 15676, 4869, 20105, 15796, 12010, 6956, 13149, 3673, 5991, 21519, 13763, 7374, 16092, 5043, 12551, 14357, 17860, 17187, 13954, 7030, 6202, 3467, 13696, 10996, 10205, 16827, 17144, 7037, 13859, 8467, 14869, 10480, 4893, 8712, 19648, 11063, 10037, 10857, 3957, 14268, 11575, 8316, 3794, 10844, 20940, 11465, 15763, 13030, 6917, 12043, 21798, 17757, 18194, 19258, 4191, 16231, 18081, 9406, 18148, 18118, 15061, 2886, 21260, 11596, 15238, 6756, 12052, 18150, 4086, 7969, 3802, 15128, 4891, 6204, 6094, 21280, 13208, 3800, 12828, 15638, 6190, 5044, 7581, 11108, 7463, 21699, 12292, 12012, 20156, 21525, 16585, 13428, 11331, 11973, 18106, 21605, 4316, 4193, 9350, 8132, 16085, 4081, 17251, 10370, 5124, 12350, 16364, 9090, 8274, 12494, 5494, 4687, 19477, 8572, 16612, 9294, 8711, 4196, 18593, 2124, 7180, 1831, 4438, 19840, 18159, 15787, 16490, 10023, 20247, 18416, 5083, 4697, 18808, 14070, 13494, 8973, 10344, 13591, 19593, 19282, 21673, 12521, 3485, 15253, 5872, 19947, 3465, 10281, 1813, 7368, 13593, 8174, 7208, 6921, 15096, 21011, 9628, 13234, 14074, 14183, 15081, 18103, 8879, 10803, 15959, 13000, 8700, 19198, 5565, 7218, 16129, 11793, 20778, 21560, 5336, 7042, 11242, 18499, 12548, 13612, 5020, 10331, 7885, 2544, 8972, 14452, 3015, 3114, 1748, 9349, 14412, 2565, 4170, 3026, 20135, 21397, 6314, 17076, 20741, 17688, 6694, 16411, 17143, 20033, 10314, 16232, 20161, 3335, 2277, 9411, 2123, 16420, 18312, 4154, 20672, 5019, 8628, 17760, 12749, 20394, 19806, 3058, 12474, 18299, 1820, 15313, 4827, 3370, 15144, 18121, 8573, 4085, 11187, 19948, 17911, 2590, 14870, 11705, 6870, 11654, 16197, 10479, 14051, 12321, 1552, 7239, 2104, 18160, 14986, 10442, 16412, 9450, 10686, 9053, 4821, 6395, 1545, 5694, 8936, 12944, 3977, 7842, 1757, 2644, 6869, 6765, 14451, 9992, 17012, 18331, 6779, 13637, 12173, 10576, 14858, 10516, 7426, 5115, 15713, 2681, 6075, 15603, 21981, 7259, 8503, 7850, 5965, 15727, 8719, 10587, 3630, 20031, 13826, 18817, 
9471, 11749, 5048, 11974, 3985, 6758, 9347, 3881, 13521, 5181, 20011, 13934, 13809, 15623, 6211, 17495, 6778, 6191, 15502, 16858, 11014, 7260, 14354, 11856, 2586, 19851, 5356, 20594, 16856, 13276, 7308, 16449, 13305, 15417, 12493, 2888, 9054, 4437, 4892, 19975, 12880, 3911, 14444, 9263, 12051, 7571, 11824, 12135, 21252, 16170, 21654, 15948, 13267, 5495, 6926, 3412, 19478, 5164, 12659, 16047, 2106, 9728, 21580, 12217, 19857, 7608, 15222, 15249, 7613, 19476, 11062, 12684, 2887, 3431, 13733, 12947, 4023, 6955, 11226, 11259, 5726, 11879, 16857, 8814, 16203, 17026, 15139, 12464, 13476, 1926, 14057, 14316, 12627, 4080, 5126, 10382, 1873, 12296, 20347]

In [None]:
from pprint import pprint
import traceback

def merge_code_comment(x):
    """Build the de-duplication key of a processed sample.

    The key is ``message``, the space-joined ``tokenized_code_snippet`` and
    the space-joined ``target``, glued together with ``<>``. ``target`` is
    joined element by element, so a plain string ends up with a space
    between every character.
    """
    parts = (
        x['message'],
        ' '.join(x['tokenized_code_snippet']),
        ' '.join(x['target']),
    )
    return '<>'.join(parts)

# Select the positions (in unique_data) of samples to keep: the first
# occurrence of each merge_code_comment() key whose tokenized target and focus
# span are at most `target_size` tokens long.
existing_ids = set()   # keys of accepted samples
duplicate_ids = set()  # keys of rejected samples (repeats and over-long ones)
unique_dat = []        # accepted positions in unique_data
i = 0                  # 1-based running counter; i-1 is the current position
target_size = 400
error_count = 0
error_list = []

for x in unique_data:
    #print(dt['idx'])
    #x = unique_data[dt['idx']]
    i+= 1
    try:
        # Number of tokens from the start marker to the end marker; .index()
        # raises ValueError when a marker is missing, handled below.
        focuslen = x['tokenized_code_snippet'].index('<|endfocus|>') - x['tokenized_code_snippet'].index('<|startfocus|>')
        key = merge_code_comment(x)
        if key not in existing_ids and len(x['tokenized_target']) <= target_size and focuslen <= target_size:
            existing_ids.add(key)
            unique_dat.append(i-1)
        elif key not in duplicate_ids:
            duplicate_ids.add(key)
    except Exception as e:
        # The failing record is kept for inspection. Note that the exception
        # object itself is stored on the record under 'error'.
        print(e)
        x['error'] = e
        error_list.append(x)
        error_count += 1
        #print(x)
        #break
print(len(unique_dat))


# Position in unique_data -> global_index, and the inverse mapping.
local2global = {pos: rec['global_index'] for pos, rec in enumerate(unique_data)}
global2loval = {global_idx: pos for pos, global_idx in local2global.items()}


# Keep every selected sample whose global_index is not part of either held-out
# test list. The two lists are merged into one set up front so each membership
# test is a hash lookup instead of a scan over both lists.
held_out_indices = set(tufano_test_50) | set(tufano_test_100)

training_data = [
    unique_data[i]
    for i in unique_dat
    if unique_data[i]['global_index'] not in held_out_indices
]
print(len(training_data))

In [None]:
with open('training_data.json', 'w', encoding="utf8") as f:  # writing JSON object
    json.dump(training_data, f)

## Split Train Test

In [19]:
data = getJsonData('unique_data_processed_with_multithread_v2.json')

In [20]:
data[0].keys()

dict_keys(['status', 'message', 'comment_id', 'target', 'code_snippet', 'prime_var_dic', 'class_list', 'func_list', 'tokenized_code_snippet', 'tokenized_target', 'tokenized_comment', 'global_index', 'base_code_line_number', 'base_patch_number', 'changed_patch_number', 'code_file_name', 'line_change', 'written_on', 'project_name'])

In [21]:
# Drop the samples whose status says nothing was changed.
statuses = ['none', 'unchanged']
changed_training_data = [x for x in data if x['status'] not in statuses]
print(len(changed_training_data))

68291


In [22]:
changed_training_data[0]

{'status': 'update',
 'message': 'Not using standard indentation style.',
 'comment_id': 'AAAA8H%2F%2F9%2Bw%3D',
 'target': '                CalendarPreferenceActivity.SHARED_PREFS_NAME));\n',
 'code_snippet': '    public void onCreate () {\n        addHelper(SHARED_KEY, new SharedPreferencesBackupHelper(this,\n<|startfocus|>\n                                                                CalendarPreferenceActivity.SHARED_PREFS_NAME));\n<|endfocus|>',
 'prime_var_dic': {'SHARED_KEY': 1},
 'class_list': ['CalendarBackupAgent'],
 'func_list': ['public void onCreate ()'],
 'tokenized_code_snippet': ['<|4-s|>',
  'public',
  '<|s|>',
  'void',
  '<|s|>',
  'on',
  'Create',
  '<|s|>',
  '(',
  ')',
  '<|s|>',
  '{',
  '<|nl|>',
  '<|8-s|>',
  'add',
  'Helper',
  '(',
  'SHARED',
  '_',
  'KEY',
  ',',
  '<|s|>',
  'new',
  '<|s|>',
  'Shared',
  'Preferences',
  'Backup',
  'Helper',
  '(',
  'this',
  ',',
  '<|nl|>',
  '<|startfocus|>',
  '<|nl|>',
  '<|16-s|>',
  '<|16-s|>',
  '<|16-s

In [35]:
# Distinct project names in order of first appearance.
unique_project_names = list(dict.fromkeys(x['project_name'] for x in unique_data))
print(unique_project_names)

['android_', 'acumos_', 'asterix_', 'cloudera_', 'couchbase_', 'eclipse_', 'fd_', 'gerrithub_', 'googlereview_', 'iotivity_', 'carbonrom_', 'omnirom_', 'dirtyunicorn_', 'opencord_', 'polarsys_']


In [31]:
from datetime import datetime
# Date-wise calculation.
# NOTE(review): despite its name, `oct19` holds 2018-01-01 00:00:00. In the
# visible cells it is referenced only by a commented-out date filter.
oct19 = datetime.strptime('2018-01-01 00:00:00', '%Y-%m-%d %H:%M:%S')

def to_integer(str_date_time):
    """Encode a 'YYYY-MM-DD HH:MM:SS' timestamp string as the integer YYYYMMDD.

    The time-of-day part must be present for parsing to succeed, but it does
    not contribute to the returned value.
    """
    parsed = datetime.strptime(str_date_time, '%Y-%m-%d %H:%M:%S')
    return (parsed.year * 100 + parsed.month) * 100 + parsed.day

#    date_string = x['written_on']
#    date = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')
#    if date > oct19:

In [32]:
# Store the integer form of each record's 'written_on' timestamp under the
# 'int_date' key (records are modified in place).
for record in changed_training_data:
    record['int_date'] = to_integer(record['written_on'])

In [33]:
changed_training_data[0]['int_date']

20101029

In [36]:
# Give every known project name its own, initially empty, list of records.
project_wise_test = {name: [] for name in unique_project_names}

In [37]:
# Append each changed record to the list kept for its 'project_name'.
for record in changed_training_data:
    project_wise_test[record['project_name']].append(record)

In [34]:
import json
# Serialise the whole record list as one JSON document.
with open('all_data_with_date_integer.json', mode='w', encoding="utf8") as out_file:
    json.dump(changed_training_data, out_file)

### Hold out the most recent 5% of each project's records as testing data

In [43]:
from random import shuffle  # kept for the notebook namespace; the split below is chronological, not random

# Chronological hold-out per project: records are ordered by 'int_date' and the
# last 1/20 of them (integer division) become test data, the rest training data.
train_by_project = []
test_by_project = []
for project_name in project_wise_test:
    print(project_name, len(project_wise_test[project_name]))
    # sorted() is stable, so records sharing a date keep their relative order.
    records = sorted(project_wise_test[project_name], key=lambda k: k['int_date'])
    project_wise_test[project_name] = records
    num_test = len(records) // 20
    # Slice from an explicit split index. With fewer than 20 records num_test
    # is 0, and a `[-num_test:]` slice would be `[-0:]` -- the whole list --
    # which would place every record in both the train and the test split.
    split_at = len(records) - num_test
    # extend() avoids re-copying the accumulated lists on every iteration.
    train_by_project.extend(records[:split_at])
    test_by_project.extend(records[split_at:])
print(len(train_by_project), len(test_by_project))

android_ 15693
acumos_ 1097
asterix_ 10296
cloudera_ 4724
couchbase_ 1031
eclipse_ 14235
fd_ 754
gerrithub_ 1613
googlereview_ 17101
iotivity_ 1031
carbonrom_ 26
omnirom_ 292
dirtyunicorn_ 22
opencord_ 71
polarsys_ 305
64884 3407


In [44]:
import json
# Write the train split and the test split to their own JSON files.
for out_path, split in (('train_by_project.json', train_by_project),
                        ('test_by_project.json', test_by_project)):
    with open(out_path, 'w', encoding="utf8") as f:
        json.dump(split, f)

In [47]:
train_by_project[10]

{'status': 'update',
 'message': 'If system type is an int, why not store it as an int instead of as a String, and avoid calling parseInt()?',
 'comment_id': 'z63b29270a22c525cdca47cbbc7e98d7b',
 'target': '        if(ss.getRadioTechnology() == ServiceState.RADIO_TECHNOLOGY_1xRTT \n                || ss.getRadioTechnology() == ServiceState.RADIO_TECHNOLOGY_EVDO_0\n                || ss.getRadioTechnology() == ServiceState.RADIO_TECHNOLOGY_EVDO_A \n                || ss.getRadioTechnology() == ServiceState.RADIO_TECHNOLOGY_IS95A\n                || ss.getRadioTechnology() == ServiceState.RADIO_TECHNOLOGY_IS95B) {\n            switch(ss.getExtendedCdmaRoaming()) {\n',
 'code_snippet': '        else if (asu >= 8)  asu = 3;\n        else if (asu >= 4)  asu = 2;\n        else asu = 1;\n\n        int[] iconList;\n        if (mPhone.isNetworkRoaming()) {\n            iconList = sSignalImages_r;\n        } else {\n            iconList = sSignalImages;\n        }\n        \n<|startfocus|>\n    

Number of total data: 85594
Number of changed data: 68535
none: 652
unchanged: 16407
insert: 5262
delete: 11828
update: 51445

In [38]:
# Scratch cell: two small sample lists.
# NOTE(review): the recorded output [2, 3, 5, 6] suggests a concatenation
# expression (presumably `a + b`) is missing from this cell -- confirm.
a = [2, 3]
b = [5, 6]

[2, 3, 5, 6]


In [None]:
project_wise_test

In [None]:
datetime_object

In [10]:
import json
def getJsonData(JsonFile):
    """Load the UTF-8 encoded JSON file at *JsonFile* and return the decoded object."""
    with open(JsonFile, encoding="utf8") as handle:
        return json.load(handle)
def read(path):
    """Return the entire contents of the UTF-8 text file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the contents have been read, instead of being left for the garbage
    collector to close.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return handle.read()


# NOTE(review): BASE is not referenced by any of the visible cells.
BASE="C:/research_stuff/codes/"
# `unique` is loaded from an absolute, machine-specific path; `unique_data` is
# loaded from a path relative to the current working directory.
unique = getJsonData('E:/APR/DATA/unique_data_with_date_and_index.json')
unique_data = getJsonData('unique_data_processed_with_Date_multithread_idx_400_feb_06_v2.json')

In [None]:
# Diagnostic: report records whose stored snippet has more than 400 tokens and
# whose re-processed snippet (process() applied to the matching entry of
# `unique`, looked up by 'global_index') is still longer than 400 tokens.
for dt in unique_data:
    if len(dt['tokenized_code_snippet']) <= 400:
        continue
    # Run process() once per record and reuse the length for both the check
    # and the report. NOTE(review): assumes process() returns the same result
    # when called twice on the same entry -- confirm.
    reprocessed_len = len(process(unique[dt['global_index']])['tokenized_code_snippet'])
    if reprocessed_len > 400:
        print("index = ", dt['global_index'], reprocessed_len)

In [14]:

print(len(unique_data))
# Count the records whose snippet begins with the start-focus marker or ends
# with a newline followed by the end-focus marker.
# A generator is used instead of `for data in ...` so the notebook-level
# `data` variable (the dataset used by other cells) is not overwritten.
ct = sum(
    1
    for record in unique_data
    if record['code_snippet'].startswith("<|startfocus|>")
    or record['code_snippet'].endswith("\n<|endfocus|>")
)
print(ct)

95352
11879


In [15]:
def merge_code_comment(x):
    """Return the string ``message<>source<>target`` built from record *x*.

    ``source`` is the space-joined 'tokenized_code_snippet' and ``target`` is
    the space-joined 'target' entry.

    NOTE(review): in the sample records 'target' looks like a raw string, so
    the join puts a space between every character; 'tokenized_target' may have
    been intended. Confirm before changing -- it alters the resulting string.
    """
    parts = (
        x['message'],
        ' '.join(x['tokenized_code_snippet']),
        ' '.join(x['target']),
    )
    return '<>'.join(parts)

# State for the de-duplication pass below.
existing_ids = set()    # keys of the records that were accepted
duplicate_ids = set()   # keys of the records that were not accepted
unique_dat = []         # accepted records, in input order
i = 0                   # number of records visited
target_size = 400       # limit for both the tokenized target length and the focus span
error_count = 0
error_list = []         # records for which the processing raised

for i, x in enumerate(unique_data, start=1):
    try:
        tokens = x['tokenized_code_snippet']
        # Token distance between the focus markers; .index() raises when a
        # marker is missing, which sends the record to error_list.
        focuslen = tokens.index('<|endfocus|>') - tokens.index('<|startfocus|>')
        key = merge_code_comment(x)
        within_limits = (
            key not in existing_ids
            and len(x['tokenized_target']) <= target_size
            and focuslen <= target_size
        )
        if within_limits:
            unique_dat.append(x)
            existing_ids.add(key)
        else:
            # Covers repeated keys as well as records over the size limit.
            duplicate_ids.add(key)
    except Exception:
        error_count += 1
        error_list.append(x)
print(len(unique_dat))

85440


In [16]:

print(len(unique_dat))
# Count the de-duplicated records whose snippet begins with the start-focus
# marker or ends with a newline followed by the end-focus marker.
# A generator is used instead of `for data in ...` so the notebook-level
# `data` variable (the dataset used by other cells) is not overwritten.
ct = sum(
    1
    for record in unique_dat
    if record['code_snippet'].startswith("<|startfocus|>")
    or record['code_snippet'].endswith("\n<|endfocus|>")
)
print(ct)

85440
10632


In [49]:
pwd

'E:\\APR\\RepairFromReviewTrainingData'