In [1]:
def collect_conflict(data, conf_dict):
    for conflict in data['conflicting_chunks']:
        if 'label' in conflict:
            conf_dict.append({
                'A': conflict['a_contents'],
                'B': conflict['b_contents'],
                'base': conflict['base_contents'],
                'label': conflict['label'] if conflict['label'] == 'A' or conflict['label'] == 'B' else  'N'
            })


def extract_feature(conflict):
    return {
        **extract_keywords(conflict),
        **extract_edit_type(conflict),
        **extract_exist(conflict)
    }


def extract_keywords(conflict):
    def delete_brace(text):
        while text.find('{') == -1:
            start_pos = text.find('{')
            end_pos = text.find('}', start_pos)
            if end_pos == -1:
                break
            text = text[0: start_pos] + (text[end_pos + 1:] if end_pos != len(text) - 1 else [])
        return text

    def get_keywords(text, nums):
        text = delete_brace(text)
        start_pos = {text.find(keyword) : keyword for keyword in keywords if text.find(keyword) > -1}
        sorted_keywords = [start_pos[pos] for pos in sorted(start_pos.keys())]
        return [labels[key] for key in (sorted_keywords + ['empty']*nums)[:nums]]

    keywords = [
        'import',
        'private',
        'public',
        'protected',
        '.',
        '=',
        'if',
        'else',
        'for',
        'while',
        'return',
    ]
    labels = {
        'empty':0,
        'import': 1,
        'private': 2,
        'public': 2,
        'protected': 2,
        'if': 3,
        'else': 3,
        'for': 4,
        'while': 4,
        '.': 5,
        '=': 6,
        'return': 7,
    }
    num_kw = 3
    a_kw = get_keywords(conflict['A'], num_kw)
    b_kw = get_keywords(conflict['B'], num_kw)
    return {
        'a_keyword_1': a_kw[0],
        'a_keyword_2': a_kw[1],
        'a_keyword_3': a_kw[1],
        'b_keyword_1': b_kw[0],
        'b_keyword_2': b_kw[1],
        'b_keyword_3': a_kw[1],
    }


def extract_edit_type(conflict):
    def get_edit_type_line(cur, base):
        cur = [line for line in str.splitlines(cur) if len(line) > 0]
        base = [line for line in str.splitlines(base) if len(line) >0]
        if len(cur) == 0:
            return 0
        else:
            return len([line for line in cur if line in base])/len(cur)
    
    get_edit_type = get_edit_type_line
    return {
        'a_edit_type': get_edit_type(conflict['A'], conflict['base']),
        'b_edit_type': get_edit_type(conflict['B'], conflict['base']),
        'ab_edit_type':get_edit_type(conflict['A'], conflict['B'])
    }


def extract_exist(conflict):
    return {
        'a_exist': False if conflict['A'] == '' or conflict['A'] == '\n' else True,
        'b_exist': False if conflict['B'] == '' or conflict['B'] == '\n' else True,
        'base_exist': False if conflict['base'] == '' or conflict['base'] == '\n' else True
    }

In [4]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
import json
import random
import pandas as pd

def sample_normal(dataset):
    label_set = set([sample['label'] for sample in dataset])
    label_dict = {}
    for label in label_set:
        label_dict[label] = [sample for sample in dataset if sample['label'] == label]
    min_size = min([len(samples) for (label, samples) in label_dict.items()])
    for key in label_dict.keys():
        random.shuffle(label_dict[key])
    return [sample for samples in label_dict.values() for sample in samples[:min_size]]


dataset = {}
with open('./abm.json', 'r', encoding='utf-8') as jfile:
    dataset = json.load(jfile)
dataset = dataset['conf']
dataset = sample_normal(dataset)
random.shuffle(dataset)

features = [
    'a_keyword_1',
    'b_keyword_1',
    'a_keyword_2',
    'b_keyword_2',
    'a_exist',
    'b_exist',
    'base_exist',
    'a_edit_type',
    'b_edit_type',
    'ab_edit_type',
]
train = [{key:value for (key, value) in data.items() if key in features} for data in dataset]
vec=DictVectorizer(sparse=False)
train = vec.fit_transform(train)
target = [data['label'] for data in dataset]


# train = [
#     [
#         data['a_keyword_1'],
#         data['b_keyword_1'],
#         data['a_keyword_2'],
#         data['b_keyword_2'],
#         data['a_exist'],
#         data['b_exist'],
#         data['base_exist'],
#         data['a_edit_type'],
#         data['b_edit_type'],
#         data['ab_edit_type'],
#     ] for data in dataset]
# target = [data['label'] for data in dataset]

x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=0.2)

clf = tree.DecisionTreeClassifier()
# clf = RandomForestClassifier(max_depth=15, min_samples_split=5, random_state=0)
clf.fit(x_train, y_train)
print(accuracy_score(y_test, clf.predict(x_test)))

0.6070630932439978


In [7]:
conflict = {
    "a_contents": "",
    "b_contents": "\nimport org.n52.sos.util.Constants;\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\n",
    "base_contents": "import org.n52.sos.util.Constants;\nimport org.n52.sos.w3c.xlink.W3CHrefAttribute;\nimport org.slf4j.Logger;\nimport org.slf4j.LoggerFactory;\n",
}
conflict = {
    'A': conflict['a_contents'],
    'B': conflict['b_contents'],
    'base': conflict['base_contents']
}
data = extract_feature(conflict)
sample = [        
    data['a_keyword_1'], 
    data['b_keyword_1'],
    data['a_keyword_2'],
    data['b_keyword_2'],
    data['a_exist'],
    data['b_exist'],
    data['base_exist'],
    data['a_edit_type'],
    data['b_edit_type'],
    data['ab_edit_type'],]
clf.predict([sample])

array(['A'], dtype='<U1')

In [10]:
import graphviz


dot_data = tree.export_graphviz(clf, out_file=None, 
                     feature_names=vec.get_feature_names(),  
                     filled=True, rounded=True,  
                     special_characters=True,
                     max_depth=5)  
graph = graphviz.Source(dot_data)  
graph.render('tree')



'tree.pdf'

In [1]:
import jpype
import os

jvmPath = jpype.getDefaultJVMPath()
jarPath = os.path.join(os.path.abspath('G:/project/java/ASTExtract/out/artifacts/ASTExtract_jar/ASTExtract.jar'))
jpype.startJVM(jvmPath, '-ea', '-Djava.class.path=%s' % (jarPath))


In [6]:
jpype.shutdownJVM()

In [4]:
javaClass = jpype.JClass('nju.merge.ASTExtractor')
javaInstance = javaClass()
[a, b] = javaInstance.call('54_a.java', 94, 105)
print(a)
print(b)

11
6


In [36]:
import re
regex = '[a-z|A-Z|_][a-z|A-Z|_|0-9]*'
s = 'fweaf_fewf_ffe'
if re.fullmatch(regex, s):
    print(1)
else:
    print(2)



1


In [27]:
a = 'boolean byte char double false float int long new short true void instanceof break case catch continue default do else for if reture switch try while finally throw this super abstract fianal native private protected public static synchronized transient volatile class extend implements interface ackage import throws'
b = a.split(' ')
# c = [ "\"" + word + "\"" for word in b]
print(b)

['boolean', 'byte', 'char', 'double', 'false', 'float', 'int', 'long', 'new', 'short', 'true', 'void', 'instanceof', 'break', 'case', 'catch', 'continue', 'default', 'do', 'else', 'for', 'if', 'reture', 'switch', 'try', 'while', 'finally', 'throw', 'this', 'super', 'abstract', 'fianal', 'native', 'private', 'protected', 'public', 'static', 'synchronized', 'transient', 'volatile', 'class', 'extend', 'implements', 'interface', 'ackage', 'import', 'throws']
