In [1]:
# Root directory of the sample Java repository (relative path).
path_to_repo = "./sample_java_repo/csc-java-course-spring-2023-key-value-store-EgorShibaev/"
# Fully qualified names (package.Class.method) of the two methods to compare.
method1_name = "org.csc.java.spring2023.KeyValueStoreImplementation.openValueStream"
method2_name = "org.csc.java.spring2023.KeyValueStoreImplementation.getIndexManager"

In [2]:
import os
import re

def extract_method_code(repository_path, method_name):
    """
    Extracts the code of the method with the given name from the repository
    :param repository_path: path to the repository; sources are looked up under src/main/java
    :param method_name: qualified name of the method, "package.Class.method"
                        ("Class.method" is accepted for a class in the default package)
    :return: code of the method
    :raises ValueError: if method_name does not contain a class and a method name,
                        or if the method is not found in the class file
    """

    parts = method_name.rsplit(".", 2)
    if len(parts) == 3:
        package, class_name, simple_name = parts
    elif len(parts) == 2:
        # No package part: the class lives directly in the source root.
        package = ""
        class_name, simple_name = parts
    else:
        raise ValueError(
            f"Expected a qualified method name like 'package.Class.method', got '{method_name}'"
        )

    package_path = package.replace(".", os.path.sep)
    file_path = os.path.join(repository_path, "src", "main", "java", package_path, f"{class_name}.java")

    # Read with an explicit encoding so the result does not depend on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    return extract_method_code_from_content(content, simple_name)

def _skip_java_literal(java_code, start, delimiter):
    """
    Finds the end of a quoted Java literal
    :param java_code: Java code
    :param start: index of the first character of the opening delimiter
    :param delimiter: quote sequence that opens and closes the literal
    :return: index just past the closing delimiter, or -1 if the literal is never closed
    """
    i = start + len(delimiter)
    while i < len(java_code):
        if java_code[i] == "\\":
            # A backslash escapes the next character, so that character cannot close the literal.
            i += 2
        elif java_code.startswith(delimiter, i):
            return i + len(delimiter)
        else:
            i += 1
    return -1


def _find_block_end(java_code, open_brace_index):
    """
    Finds the closing brace that matches the opening brace at the given index
    :param java_code: Java code
    :param open_brace_index: index of the opening brace of the block
    :return: index just past the matching closing brace, or -1 if the block is never closed
    """
    depth = 0
    i = open_brace_index
    length = len(java_code)
    while i < length:
        if java_code.startswith("//", i):
            # Line comment: braces up to the end of the line do not count.
            line_end = java_code.find("\n", i)
            if line_end == -1:
                return -1
            i = line_end + 1
        elif java_code.startswith("/*", i):
            # Block comment: braces up to the closing */ do not count.
            comment_end = java_code.find("*/", i + 2)
            if comment_end == -1:
                return -1
            i = comment_end + 2
        elif java_code[i] in "\"'":
            # String, char or text-block literal: braces inside do not count.
            delimiter = '"""' if java_code.startswith('"""', i) else java_code[i]
            i = _skip_java_literal(java_code, i, delimiter)
            if i == -1:
                return -1
        else:
            if java_code[i] == "{":
                depth += 1
            elif java_code[i] == "}":
                depth -= 1
                if depth == 0:
                    return i + 1
            i += 1
    return -1


def extract_method_code_from_content(java_code, method_name):
    """
    Extracts the code of the method with the given name from the Java code
    :param java_code: Java code
    :param method_name: simple (unqualified) name of the method to extract
    :return: code of the first matching method, from the line break before its
             declaration (when there is one) up to and including its closing brace
    :raises ValueError: if no declaration of the method is found or its body is never closed
    """

    # Declaration = optional modifiers / return type (generics and arrays allowed),
    # the escaped method name, a parameter list, an optional throws clause, and "{".
    method_pattern = re.compile(
        rf"(?:^|\n)\s*(?:[\w.<>\[\],?]+\s+)*{re.escape(method_name)}\s*\([^\)]*\)"
        rf"(\s*throws\s+[\w.]+(?:\s*,\s*[\w.]+)*)?\s*\{{"
    )

    match = method_pattern.search(java_code)
    if not match:
        raise ValueError(f"Method {method_name} not found in the Java code")

    method_start = match.start()
    # The pattern ends with the opening brace of the body, so scanning starts there.
    method_end = _find_block_end(java_code, match.end() - 1)

    if method_end == -1:
        raise ValueError(f"Method {method_name} code extraction failed")

    return java_code[method_start:method_end]

In [3]:
# Pull the source of both configured methods out of the repository.
method1_code, method2_code = [
    extract_method_code(path_to_repo, qualified_name)
    for qualified_name in (method1_name, method2_name)
]

In [4]:
method1_code

'\n  public InputStream openValueStream(byte[] key) throws IOException {\n    Objects.requireNonNull(key);\n    if (!contains(key)) {\n      throw new IOException("No such key in store");\n    }\n    check_closed();\n\n\n    var blocks = indexManager.getFileBlocksLocations(key);\n    var stream = InputStream.nullInputStream();\n\n    for (var block : blocks) {\n      stream = new SequenceInputStream(stream, valueStoreManager.openBlockStream(block));\n    }\n    return stream;\n  }'

In [5]:
method2_code

'\n  public IndexManager getIndexManager() {\n    check_closed();\n\n    return indexManager;\n  }'

In [6]:
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

# Wrapper classes that give the different embedding models a common encode() interface
class SentenceTransformerModel:
    """Wrapper around a SentenceTransformer model exposing a single encode method."""

    def __init__(self, model_name):
        """
        Creates the underlying SentenceTransformer
        :param model_name: model name passed to SentenceTransformer
        """
        self.model = SentenceTransformer(model_name)

    def encode(self, method_code):
        """
        Encodes the code of a method into an embedding
        :param method_code: code of the method
        :return: the embedding tensor reshaped to a single row
        """
        embedding = self.model.encode(method_code, convert_to_tensor=True)
        # One row, so the result can be compared row-wise with other embeddings.
        return embedding.view(1, -1)

class HuggingfaceModel:
    """Wrapper around a Hugging Face tokenizer/model pair exposing a single encode method."""

    def __init__(self, model_name):
        """
        Loads the tokenizer and the model
        :param model_name: name passed to AutoTokenizer.from_pretrained and AutoModel.from_pretrained
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def encode(self, method_code):
        """
        Encodes the code of a method into an embedding
        :param method_code: code of the method
        :return: mean of the model's last hidden state over dim 1
        """
        # truncation=True asks the tokenizer to cut over-long methods down to its
        # maximum length instead of passing them to the model untruncated.
        inputs = self.tokenizer(method_code, return_tensors="pt", truncation=True)
        outputs = self.model(**inputs)
        # NOTE(review): dim 1 is presumably the token axis of a
        # (batch, tokens, hidden) tensor - confirm against the model's output docs.
        return outputs.last_hidden_state.mean(dim=1)

In [7]:
# Embedding models under comparison, keyed by the label used when printing results.
models = dict(
    CodeGPT=HuggingfaceModel("microsoft/CodeGPT-small-java-adaptedGPT2"),
    CodeBERT=HuggingfaceModel("neulab/codebert-java"),
    MiniLM=SentenceTransformerModel("all-MiniLM-L6-v2"),
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of RobertaModel were not initialized from the model checkpoint at neulab/codebert-java and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import torch

def get_similarity(model_name, method1_code, method2_code):
    """
    Computes the cosine similarity between the embeddings of two methods
    :param model_name: key of the model in the models dictionary
    :param method1_code: code of the first method
    :param method2_code: code of the second method
    :return: cosine similarity of the two embeddings as a Python number
    """
    # Both snippets are encoded inside no_grad, so no gradients are tracked.
    with torch.no_grad():
        first_embedding, second_embedding = [
            models[model_name].encode(code) for code in (method1_code, method2_code)
        ]

    score = torch.nn.functional.cosine_similarity(first_embedding, second_embedding)
    return score.item()

In [9]:
# Score the two methods extracted from the repository with every model in `models`.
for model_name in models:
    similarity = get_similarity(model_name, method1_code, method2_code)
    print(f"Similarity by {model_name}: {similarity:.3f}")

Similarity by CodeGPT: 0.660
Similarity by CodeBERT: 0.920
Similarity by MiniLM: 0.378


## Comparing the models on hand-written method pairs

In [10]:
def compare_models(code1, code2):
    """
    Prints the similarity of two code snippets according to every model
    :param code1: code of the first method
    :param code2: code of the second method
    :return: None; one line per model is printed
    """
    for name in models.keys():
        score = get_similarity(name, code1, code2)
        print(f"Similarity by {name}: {score:.3f}")

In [11]:
# Pair 1: a recursive factorial vs. splitting a CSV line on commas.
compare_models("""
public int factorial(int n) {
    if (n == 0) {
        return 1;
    } else {
        return n * factorial(n - 1);
    }
}
""","""
public String[] parseCSVLine(String csvLine) {
    return csvLine.split(",");
}
""")

Similarity by CodeGPT: 0.441
Similarity by CodeBERT: 0.906
Similarity by MiniLM: 0.014


In [12]:
# Pair 2: squaring an int vs. checking whether a string parses as a double.
compare_models("""
public int calculateSquare(int num) {
    return num * num;
}
""","""
public boolean isNumeric(String str) {
    try {
        Double.parseDouble(str);
        return true;
    } catch (NumberFormatException e) {
        return false;
    }
}
""")

Similarity by CodeGPT: 0.672
Similarity by CodeBERT: 0.902
Similarity by MiniLM: 0.385


In [13]:
# Pair 3: finding the maximum of an int array vs. upper-casing a string.
compare_models("""
public int findMax(int[] arr) {
    int max = arr[0];
    for (int i = 1; i < arr.length; i++) {
        if (arr[i] > max) {
            max = arr[i];
        }
    }
    return max;
}
""","""
public String toUpperCase(String input) {
    return input.toUpperCase();
}
""")

Similarity by CodeGPT: 0.453
Similarity by CodeBERT: 0.867
Similarity by MiniLM: 0.036


In [14]:
# Pair 4: two methods with the same loop structure - summing a list vs. multiplying it.
compare_models("""
public static int sum(List<Integer> numbers) {
        int sum = 0;
        for (int num : numbers) {
            sum += num;
        }
        return sum;
}
""","""
public static int product(List<Integer> numbers) {
        int product = 1;
        for (int num : numbers) {
            product *= num;
        }
        return product;
}
""")

Similarity by CodeGPT: 0.968
Similarity by CodeBERT: 0.998
Similarity by MiniLM: 0.601


In [15]:
# Pair 5: two methods that are both named sqrt but actually square their argument,
# once with a * a and once with Math.pow(a, 2).
compare_models("""
public static int sqrt(int a) {
    return a * a;
}
""","""
public static long sqrt(int a) {
    return Math.pow(a, 2);
}
""")

Similarity by CodeGPT: 0.840
Similarity by CodeBERT: 0.973
Similarity by MiniLM: 0.768


### Summary

I compared three models on pairs of Java methods.
- all-MiniLM-L6-v2 from the Sentence Transformers library is not the best choice for this task: it was trained on large amounts of natural-language text to produce sentence embeddings that capture the semantic meaning of __text__, so it is not well suited to a code-similarity task.
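- CodeGPT seems to perform better than CodeBERT for this task. In the experiments above, CodeBERT scores every pair at 0.867 or higher, including the unrelated pairs, so it barely separates them. CodeGPT's scores range from 0.441 for an unrelated pair to 0.968 for the nearly identical `sum`/`product` pair. I will therefore use CodeGPT for the second task.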

In [18]:
!jupyter nbconvert --to html task1.ipynb

[NbConvertApp] Converting notebook task1.ipynb to html
[NbConvertApp] Writing 639698 bytes to task1.html
