#Install

In [1]:
!apt-get update
!pip install transformers[sentencepiece]
!pip install transformers[torch]
!pip install datasets
!pip install accelerate
!pip install transformers==4.30.2
!pip install evaluate
!pip install sigopt
!pip install pickle5
!pip install ray[tune]
!pip install javalang
!pip install sacrebleu

%env PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:10 http://security.ubuntu.com/ubuntu jammy-security InRelease
Reading package lists... Done
env: PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512


#Imports

In [2]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorWithPadding, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
import datasets
import torch
import numpy as np
import evaluate
import os
import re
import javalang
import pandas as pd

# Constants

In [3]:
mode = "load_dataset"
dir = '/content/drive/MyDrive/'
train_file_name = dir + 'train.csv'
validate_file_name = dir + 'validate.csv'
test_file_name = dir + 'test.csv'
folder_path = "drive/MyDrive/java-design-patterns"
checkpoint = 'Salesforce/codet5-base'
device = "cpu"
batch_size = 4
max_method_length = 1000
max_name_length = 50

  # Dataset

In [4]:
def __get_start_end_for_node(node_to_find, tree):
    start = None
    end = None
    for path, node in tree:
        if start is not None and node_to_find not in path:
            end = node.position
            return start, end
        if start is None and node == node_to_find:
            start = node.position
    return start, end


def __get_string(start, end, data):
    if start is None:
        return ""
    end_pos = None
    if end is not None:
        end_pos = end.line - 1
    lines = data.splitlines()
    string = "".join(lines[start.line:end_pos])
    string = lines[start.line - 1] + string
    if end is None:
        left = string.count("{")
        right = string.count("}")
        if right - left == 1:
            p = string.rfind("}")
            string = string[:p]
    return string

def extract_methods_and_names(src_code):
    tree = javalang.parse.parse(src_code)
    methods = []
    for path, node in tree.filter(javalang.tree.MethodDeclaration):
        start, end = __get_start_end_for_node(node, tree)
        methods.append((node.name, __get_string(start, end, src_code)))
    return methods

def read_src_files(folder_path):
    src_files = []
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith(".java"):
                src_files.append(os.path.join(root, filename))
    return src_files

def process_src_files(folder_path):
    dataset = {'method': [], 'name': []}

    src_files = read_src_files(folder_path)

    for src_file in src_files:
        with open(src_file, 'r', encoding='utf-8') as file:
            src_code = file.read()
            try:
                methods_and_names = extract_methods_and_names(src_code)
            except:
                continue

            for method_name, method_body in methods_and_names:
                dataset['method'].append(method_body.replace(method_name, '<extra_id_0>', 1).replace('\\n', ''))
                dataset['name'].append(method_name)

    return pd.DataFrame(dataset)

In [5]:
if mode == "create_dataset":
    df = process_src_files(folder_path)
    df.sample(frac=1)
    train, validate, test = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

    train.to_csv(train_file_name, encoding='utf-8')
    validate.to_csv(validate_file_name, encoding='utf-8')
    test.to_csv(test_file_name, encoding='utf-8')

    print(df.head())
    print(df.shape)


In [6]:
dataset = datasets.load_dataset('csv', data_files={"train": train_file_name, "validation": validate_file_name, "test": test_file_name}, keep_default_na=False)

# Train

In [7]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype=torch.float32, trust_remote_code=True,).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)

config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

In [8]:
def tokenize_batch(batch):
    encoding = tokenizer(batch['method'], max_length=max_method_length, padding="max_length", truncation=True, return_tensors='pt')
    labels = tokenizer(batch['name'], max_length=max_name_length, padding="max_length", truncation=True, return_tensors='pt')
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': labels['input_ids'].flatten()
    }

train_dataset = dataset["train"].map(tokenize_batch)
validation_dataset = dataset["validation"].map(tokenize_batch)
print(train_dataset)
print(validation_dataset)

Map:   0%|          | 0/2331 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

Dataset({
    features: ['Unnamed: 0', 'method', 'name', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2331
})
Dataset({
    features: ['Unnamed: 0', 'method', 'name', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 777
})


In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./method_name_prediction",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=2,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    save_strategy="steps",
)

In [10]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
)

In [11]:
trainer.train()



Step,Training Loss,Validation Loss
100,0.3678,0.150167
200,0.1654,0.133889
300,0.1332,0.127437
400,0.1464,0.119129
500,0.1302,0.113514
600,0.1162,0.108897
700,0.0735,0.107116
800,0.0751,0.107224
900,0.0691,0.10294
1000,0.0806,0.101994


TrainOutput(global_step=1749, training_loss=0.09742616393894656, metrics={'train_runtime': 1397.8593, 'train_samples_per_second': 5.003, 'train_steps_per_second': 1.251, 'total_flos': 8317270563840000.0, 'train_loss': 0.09742616393894656, 'epoch': 3.0})

In [12]:
trainer.evaluate()

{'eval_loss': 0.10080233961343765,
 'eval_runtime': 27.0412,
 'eval_samples_per_second': 28.734,
 'eval_steps_per_second': 7.211,
 'epoch': 3.0}

In [13]:
model.save_pretrained(dir + "fine_tuned_method_name_model")
tokenizer.save_pretrained(dir + "fine_tuned_method_name_model")

('/content/drive/MyDrive/fine_tuned_method_name_model/tokenizer_config.json',
 '/content/drive/MyDrive/fine_tuned_method_name_model/special_tokens_map.json',
 '/content/drive/MyDrive/fine_tuned_method_name_model/vocab.json',
 '/content/drive/MyDrive/fine_tuned_method_name_model/merges.txt',
 '/content/drive/MyDrive/fine_tuned_method_name_model/added_tokens.json',
 '/content/drive/MyDrive/fine_tuned_method_name_model/tokenizer.json')

# Results

In [7]:
fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(dir + "fine_tuned_method_name_model")
tokenizer = AutoTokenizer.from_pretrained(dir + "fine_tuned_method_name_model")

In [8]:
def tokenize_batch(batch):
    encoding = tokenizer(batch['method'], max_length=max_method_length, padding="max_length", truncation=True, return_tensors='pt')
    labels = tokenizer(batch['name'], max_length=max_name_length, padding="max_length", truncation=True, return_tensors='pt')
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': labels['input_ids'].flatten()
    }

In [9]:
test_dataset = dataset["test"].map(tokenize_batch)
def generate_predictions(dataset, model, tokenizer):
    input_ids = torch.tensor(dataset['input_ids'])
    attention_mask = torch.tensor(dataset['attention_mask'])
    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_name_length, num_beams=5)
    decoded_predictions = tokenizer.batch_decode(output, skip_special_tokens=True)
    return {'predicted_names': decoded_predictions}

In [10]:
def accuracy(predictions, answers):
  diff = []
  acc = 0
  for i in range(len(answers)):
    if predictions[i] == answers[i]:
      acc += 1
    else:
      diff.append((predictions[i], answers[i]))
  return f"{acc}/{len(answers)}", diff

In [11]:
raw_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype=torch.float32, trust_remote_code=True,).to(device)
raw_tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
test_dataset = test_dataset.shuffle(seed=42)[:10] # otherwise it takes too many memory, colab v100 crashes
predicted_method_names_fine_tuned = generate_predictions(test_dataset, fine_tuned_model, tokenizer)['predicted_names']
predicted_method_names_raw = generate_predictions(test_dataset, raw_model, tokenizer)['predicted_names']
print("Fine tuned model accuracy and difference with answer:", accuracy(predicted_method_names_fine_tuned, test_dataset['name']))
print("Raw model accuracy and difference with answer:", accuracy(predicted_method_names_raw, test_dataset['name']))

Fine tuned model accuracy and difference with answer: ('7/10', [('visit', 'render'), ('testUpdateForPatrollingRight', 'testUpdateForPatrollingLeft'), ('call', 'attemptRequest')])
Raw model accuracy and difference with answer: ('1/10', [('getShortName( ) { eturn unction', 'getName'), ('Foo < T >', 'render'), ('getMovement ( ) { if', 'getMovement'), ('testPatrollingLeft', 'testUpdateForPatrollingLeft'), ('executereturn result; }}', 'execute'), ('testPlaySoundWithVolume1public void playSoundWithVolume1() {', 'testPlaySound'), ('test_send_heartbeat_messagepublic void test_send_heartbeat_message() {', 'testSendHeartbeatMessage'), ('testNoMoreInteractions', 'testRun'), ('getRemoteHostAddress () { public RemoteHostAddress(String hostAddress)throwsRemoteServiceException,RemoteException{getRemoteHostAddress()}getRemoteHostAddress();getRemoteHostAddress();getRemoteHostAddress();getRemoteHostAddress', 'attemptRequest')])
