In [1]:
import os
import numpy

In [2]:
# Root directory under which the experiment log files are stored.
path_to_log_files = "/mnt/media/Experiments/SLP-Core"
# Root directory of the original corpus; a per-run folder name is appended to it later.
path_to_orig_files = "/mnt/media/Corpora/extern/Github_Java_Corpus/split"
# Name suffixes that are combined with "smaller_" to build folder names.
variants = ["01percent", "10percent", "50percent"]
# Name prefixes of the original-file folders.
modi = ["training", "validation"]

# Marker lines that separate the sections of a log file (consulted by split_log_file).
delimiter = ["<|CONTEXT|>", "<|MEASURE|>", "<|GENERATED_COMPLETIONS|>", "<|ALL_COMPLETIONS|>"]

In [3]:

# Index into `modi` (1 selects "validation").
modus = 1
# Index into `variants` (0 selects "01percent").
variant = 0

# Folder-name fragments used to locate the log directory of this run.
# NOTE(review): get_log_files is later called with current_dataset as its
# `dataset` argument and current_experiment as its `subfolder` argument, so the
# directory read is <logs>/<current_experiment>/<current_dataset> - the two
# names look swapped relative to their content; confirm.
current_dataset = "single_file_chopped_experiment"
current_experiment = "smaller_" + variants[variant]
# Folder with the original files, e.g. ".../split/validation_smaller_01percent".
current_orig_file_path = os.path.join(path_to_orig_files, modi[modus] + "_smaller_" + variants[variant])

In [4]:
"""
Read a file and return its content as a string
[input]: full path to file
[output]: content of the file as a string
"""
def read_file(path):
    with open(path, "r") as f:
        content = f.read()
    return content

"""
split the given log file string into its components as given by the delimiter.
delimiter have to follow the log file standard
[input]: log file content as a string
[output]: dict containing the content for each log file part with its corresponding delimiter as key
"""
def split_log_file(content):
    result = {}
    part = ""
    lines = content.split("\n")
    delim = lines[0].strip()
    for i in range(1, len(lines)):
        if lines[i].strip() in delimiter:
            result[delim] = part
            delim = lines[i].strip()
            part = ""
            continue
        part += lines[i] + "\n"
    result[delim] = part
    return result

"""
process the given completion log file string and put all generated completions in a list as tuple: 
completion-string:probability
[input]: string holding the generated completions in each line
[output]: list holding each completion:probability tuple as elements
"""
def process_generated_completions(generated_completion_text):
    lines = generated_completion_text.split("\n")
    result = []
    lines = [line for line in lines if len(line) > 0]
    for line in lines:
        chunks = line.split(" -- ")
        completion = chunks[0]
        probability = float(chunks[1][1:-1])
        result.append([completion, probability])
    return result

"""
process the given full completion string of intellij proposal mechanism (including the generated completions)
[input]: string holding all completions in each line
[output]: list holding triplet with the position, the completion string, and wheter its source is generated
          or intellij
"""
def process_all_completions(all_completions_text):
    # [position, text]
    generated_text_overhead = ["LookupElementBuilder: string=", "; handler=null"]
    result = []
    lines = all_completions_text.split("\n")
    lines = [line for line in lines if len(line) > 0]
    for i, line in enumerate(lines):
        # check if we have a generated completion
        if any(check in line for check in generated_text_overhead):
            tag = "generated"
            element = line[len(generated_text_overhead[0]):-len(generated_text_overhead[1])]
        else:
            tag = "intellij"
            element = line
        result.append([i, element, tag])
    return result


"""
get all files of the given path inside the subfolder and dataset folder
[input]: path: path to log files
         dataset: which dataset is used (01, 10, 50)
         subfolder: which experiment is used (single file chopped, etc.)
[output]: list holding all log files in the directory
"""
def get_log_files(path, dataset, subfolder):
    new_path = os.path.join(path, subfolder)
    new_path = os.path.join(new_path, dataset)
    files = os.listdir(new_path)
    return [os.path.join(new_path, file) for file in files if file.endswith(".log")]

"""
get all files of the verification set ending with .java
[input]: path to the root of the verification set
[output]: list holding the full paths for all files in the dataset
"""
def get_orig_files(path):
    result = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".java"):
                result.append(os.path.join(root, file))
    return result

"""
map the context of the log file to its original java file in the dataset
[input]: path: the path to the logfile
         orig_file_paths: list of all original files
[output]: list of the files with the same name in the validation dataset
"""
def get_orig_file_path_from_log_file(path, orig_file_paths):
    base_name = os.path.basename(path)
    if base_name.endswith(".log"):
        base_name = base_name[:-4]
    multi = []
    for p in orig_file_paths:
        if base_name in p:
            multi.append(p)
    return multi

"""
get the true original file corresponding to the given context
[input]: chopped_text: the context of the log file
         file_list: the list of the files with the same name in the validation dataset
[output]: the paths to the files with the same name in the validation dataset
"""
def get_orignal_file(chopped_text, file_list):
    target = []
    chopped_text_lines = chopped_text.split("\n")
    for file in file_list:
        with open(file, "r") as f:
            text = f.read()
        if text.startswith("\n".join(chopped_text_lines[:-2])):
            target.append(file)
    return target

def get_next_token_in_orig_file():
    """
    Placeholder that is not implemented yet: takes no arguments and returns None.
    """
    return None

In [8]:
# 1st step: collect the paths of all log files of the selected run
log_file_full_paths = get_log_files(path_to_log_files, current_dataset, current_experiment)

# collect the paths of all original .java files; the ones that belong to the
# log files are picked out of this list below
all_original_file_paths = get_orig_files(current_orig_file_path)

# 2nd step: for every log file, look up the original files matching its name
# (files_with_same_name[i] is the candidate list for log_file_full_paths[i])
files_with_same_name = []
for log_file in log_file_full_paths:
    files_with_same_name.append(get_orig_file_path_from_log_file(log_file, all_original_file_paths))

In [11]:
# Report every log file whose name lookup did not yield exactly one original file.
for i in range(len(log_file_full_paths)):
    if len(files_with_same_name[i]) != 1:
        # TODO: continue here!
        # NOTE(review): the original `if` had no statement in its body, which is
        # a syntax error. Printing the number of candidates is an assumption
        # about what this unfinished cell displayed - confirm.
        print(len(files_with_same_name[i]))

8
19
3


In [5]:
#orig_files = [get_orig_file_path_from_log_file(path, all_original_file_paths) for path in log_file_full_paths]
#orig_files_cleaned = orig_files.copy()
#for i, file in enumerate(orig_files):
#    if len(file) > 1:
#        base_name = os.path.basename(file[0]) + ".log"
#        p = path_to_log_files + "/" + current_experiment + "/" + current_dataset + "/" + base_name
#        p_text = read_file(p)
#        orig_files_cleaned[i] = get_orignal_file(p_text, file)
#    else:
#        orig_files_cleaned[i] = file
#        

#for i, item in enumerate(orig_files_cleaned):
#    if len(item) != 1:
#        print(log_file_full_paths[i])
#        #print(orig_files[i])
#        print(len(orig_files_cleaned[i]))

#orig_files = [file[0] for file in orig_files]
#orig_file_text = "\n".join(orig_files)
#with open("original_file_paths", "w") as f:
#    f.write(orig_file_text)

In [76]:
#log_files = get_log_files(path_to_log_files, current_dataset, current_experiment)
#print(log_files)
# Scratch check: look up the original file(s) for one hard-coded log file.
# NOTE(review): the log path below lies under "smaller_50percent", while the
# originals are collected from current_orig_file_path - confirm this mix is intended.
orig_files = get_orig_files(current_orig_file_path)
hans = get_orig_file_path_from_log_file("/mnt/media/Experiments/SLP-Core/smaller_50percent/single_file_chopped_experiment/JMXTestRunnerTestCase.java.log", orig_files)
print(hans)

['/mnt/media/Corpora/extern/Github_Java_Corpus/split/validation_smaller_01percent/arquillian_deprecated/protocols/jmx/src/test/java/org/jboss/arquillian/jmx/JMXTestRunnerTestCase.java']


In [46]:
# Scratch check of the parsing helpers on one hard-coded log file.
test = "/mnt/media/Experiments/SLP-Core/smaller_50percent/single_file_chopped_experiment/JMXTestRunnerTestCase.java.log"
text = read_file(test)
# dict: section marker -> section text
bla = split_log_file(text)
# delimiter[2] is "<|GENERATED_COMPLETIONS|>": list of [completion, probability]
foo = process_generated_completions(bla[delimiter[2]])
# delimiter[3] is "<|ALL_COMPLETIONS|>": list of [position, completion, tag]
baz = process_all_completions(bla[delimiter[3]])
for item in baz:
    print(item)
    
#print(foo)
#for key, item in bla.items():
#    print(key)
#    print(item)

[0, 'execute', 'generated']
[1, 'getResults', 'generated']
[2, 'getTestCase', 'generated']
[3, 'mask', 'generated']
[4, 'start', 'generated']
[5, '.arg', 'intellij']
[6, '.cast', 'intellij']
[7, '.field', 'intellij']
[8, '.par', 'intellij']
[9, '.try', 'intellij']
[10, '.var', 'intellij']
