In [1]:
# Relative path to the checked-out Java repository whose sources are read below.
path_to_repo = "./sample_java_repo/csc-java-course-spring-2023-key-value-store-EgorShibaev/"
# Fully qualified names (package.ClassName.methodName) of the two methods to compare.
method1_name = "org.csc.java.spring2023.KeyValueStoreImplementation.openValueStream"
method2_name = "org.csc.java.spring2023.KeyValueStoreImplementation.getIndexManager"

In [2]:
import os
import re

def extract_method_code(repository_path, method_name):
    """
    Extracts the code of the method with the given name from the repository.

    The source file is looked up at
    ``<repository_path>/src/main/java/<package as directories>/<ClassName>.java``.

    :param repository_path: path to the repository
    :param method_name: fully qualified name of the method to extract, in the
        form ``package.ClassName.methodName``
    :return: code of the method
    :raises ValueError: if ``method_name`` is not of the form
        ``package.ClassName.methodName``, or the method is not found in the file
    :raises OSError: if the expected source file cannot be opened
    """
    # Split from the right: the last two components are class and method,
    # everything before them is the (possibly dotted) package.
    parts = method_name.rsplit(".", 2)
    if len(parts) != 3:
        raise ValueError(
            "Expected a fully qualified method name of the form "
            f"'package.ClassName.methodName', got {method_name!r}"
        )
    package, class_name, simple_name = parts

    package_path = package.replace(".", os.path.sep)
    file_path = os.path.join(repository_path, "src", "main", "java", package_path, f"{class_name}.java")

    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    return extract_method_code_from_content(content, simple_name)

def extract_method_code_from_content(java_code, method_name):
    """
    Extracts the code of the method with the given name from the Java code
    :param java_code: Java code
    :param method_name: name of the method to extract
    :return: code of the method
    """

    method_pattern = re.compile(rf"\n\s*(\w+\s+)*{method_name}\s*\([^\)]*\)(\s*throws\s+[\w,]+)?\s*\{{")


    match = method_pattern.search(java_code)
    if not match:
        raise ValueError(f"Method {method_name} not found in the Java code")

    open_braces = 0
    method_start = match.start()
    method_end = -1

    for i in range(method_start, len(java_code)):
        if java_code[i] == "{":
            open_braces += 1
        elif java_code[i] == "}":
            open_braces -= 1
            if open_braces == 0:
                method_end = i + 1
                break

    if method_end == -1:
        raise ValueError(f"Method {method_name} code extraction failed")

    return java_code[method_start:method_end]

In [3]:
# Pull the source text of both methods out of the repository checkout.
method1_code = extract_method_code(
    repository_path=path_to_repo,
    method_name=method1_name,
)
method2_code = extract_method_code(
    repository_path=path_to_repo,
    method_name=method2_name,
)

In [10]:
# Show the extracted source of the first method (rendered as the string's repr).
method1_code

'\n  public InputStream openValueStream(byte[] key) throws IOException {\n    Objects.requireNonNull(key);\n    if (!contains(key)) {\n      throw new IOException("No such key in store");\n    }\n    check_closed();\n\n\n    var blocks = indexManager.getFileBlocksLocations(key);\n    var stream = InputStream.nullInputStream();\n\n    for (var block : blocks) {\n      stream = new SequenceInputStream(stream, valueStoreManager.openBlockStream(block));\n    }\n    return stream;\n  }'

In [11]:
# Show the extracted source of the second method (rendered as the string's repr).
method2_code

'\n  public IndexManager getIndexManager() {\n    check_closed();\n\n    return indexManager;\n  }'

In [4]:
from transformers import AutoTokenizer, AutoModel

# Single source of truth for the checkpoint, so the tokenizer and the model
# cannot drift apart when the model is swapped.
MODEL_NAME = "microsoft/CodeGPT-small-java-adaptedGPT2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Tokenize each method on its own into PyTorch tensors. Inputs longer than the
# model's position-embedding table are truncated; otherwise the forward pass
# would index past it and fail on long methods.
# NOTE(review): `n_positions` is the GPT-2 config field; adjust if the checkpoint
# is swapped for a different architecture.
max_tokens = model.config.n_positions
inputs_1 = tokenizer(method1_code, return_tensors="pt", truncation=True, max_length=max_tokens)
inputs_2 = tokenizer(method2_code, return_tensors="pt", truncation=True, max_length=max_tokens)

In [6]:
import torch

# Inference only, so gradient tracking is switched off for both forward passes.
with torch.no_grad():
    # Average the final hidden states over dim=1 to get one vector per method.
    # NOTE(review): dim=1 is presumably the token axis of last_hidden_state
    # (batch, tokens, hidden) — confirm against the model documentation.
    embeddings_1 = torch.mean(model(**inputs_1).last_hidden_state, dim=1)
    embeddings_2 = torch.mean(model(**inputs_2).last_hidden_state, dim=1)

# Cosine similarity between the two pooled vectors (dim=1 is the function's default).
similarity = torch.nn.functional.cosine_similarity(embeddings_1, embeddings_2, dim=1)

print(f"Similarity of two methods is {similarity.item():.3f}")

Similarity of two methods is 0.660


In [7]:
# Export this notebook (task1.ipynb) to HTML with nbconvert.
!jupyter nbconvert --to html task1.ipynb

[NbConvertApp] Converting notebook task1.ipynb to html
[NbConvertApp] Writing 620613 bytes to task1.html
