In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import math
import time, pickle, math, warnings, os, operator
import string 
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate import bleu_score
import time

The following path could be changed to your own file path

In [2]:
base = './dataset/'
path_train = base+'train.tsv'
path_test_source = base+'source.txt'
path_test_target = base+'target.txt'

train_dataset = [line.strip() for line in open(path_train)]
source_test = [line.strip() for line in open(path_test_source)]
target_test = [line.strip() for line in open(path_test_target)]

punctuations = string.punctuation.replace("\"","")

In [3]:
def processTrainDataset(train_dataset):
    source_train = []
    target_train = []
    for data in train_dataset:
        data_group = data.split("\t")
        target = data_group[1]
        source = data_group[0].translate(str.maketrans({key: " {0} ".format(key) for key in punctuations}))
        source_train.append(source)
        target_train.append(target)
    return source_train,target_train

In [4]:
def processTestDataset(source_test):
    new_source_test = []
    for data in source_test:
        new_data = data.split("code2comment :")[1].translate(str.maketrans({key: " {0} ".format(key) for key in punctuations}))
        new_source_test.append(new_data)
    return new_source_test

Process the raw data and transform into BOW vectors

In [5]:
source_train,target_train = processTrainDataset(train_dataset)
source_test = processTestDataset(source_test)

data_count_vect = CountVectorizer(max_df=0.5)
train_data_vect = data_count_vect.fit_transform(source_train)
test_data_vect = data_count_vect.transform(source_test)

Text similairty techniques: Gestalt Pattern Matching 

In [6]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [7]:
def predictionTopk(topk,similarity, similarity_time):
    print("processing:",topk)
    prediction = []  
    start_time = time.time()
    for index in range(len(similarity)):
        if index%1000 == 0:
            print("processing-instance ",index,"/16780")
#       find top-10 instances based on cosine distance
        index_nn = np.argpartition(similarity[index], -10)[-10:]
        similar_nn = []
        for idx in index_nn:
#       find best 10 candidates from top-10 instances based on text similarity score
            similar_score = similar(source_test[index], source_train[idx])
            similar_nn.append((idx, similar_score))
        similar_nn.sort(key=lambda x:x[1], reverse=True)
        similar_topk = similar_nn[:topk]
        current_prediction = []
        for element in similar_topk:
            current_prediction.append(target_train[element[0]])
        prediction.append(current_prediction)
    print(topk, " time cost:", time.time() - start_time + similarity_time,"s")
#   write the recommendation comments to the file named as "our_predictions_k.txt"
    with open(base+'our_predictions_'+str(topk) + '.txt', 'w') as f:
        for data in prediction:
            for element in data:
                f.write(element + '\n')

In [8]:
# Compute the cosine distance and its computational time
similarity_start_time = time.time()
similarity = cosine_similarity(test_data_vect, train_data_vect)
similarity_time = time.time() - similarity_start_time
# Compute the text similarity (GPM) and results
predictionTopk(1,similarity,similarity_time)
predictionTopk(3,similarity,similarity_time)
predictionTopk(5,similarity,similarity_time)
predictionTopk(10,similarity,similarity_time)

processing: 1
processing-instance  0 /16780
processing-instance  1000 /16780
processing-instance  2000 /16780
processing-instance  3000 /16780
processing-instance  4000 /16780
processing-instance  5000 /16780
processing-instance  6000 /16780
processing-instance  7000 /16780
processing-instance  8000 /16780
processing-instance  9000 /16780
processing-instance  10000 /16780
processing-instance  11000 /16780
processing-instance  12000 /16780
processing-instance  13000 /16780
processing-instance  14000 /16780
processing-instance  15000 /16780
processing-instance  16000 /16780
1  time cost: 240.7754499912262 s
processing: 3
processing-instance  0 /16780
processing-instance  1000 /16780
processing-instance  2000 /16780
processing-instance  3000 /16780
processing-instance  4000 /16780
processing-instance  5000 /16780
processing-instance  6000 /16780
processing-instance  7000 /16780
processing-instance  8000 /16780
processing-instance  9000 /16780
processing-instance  10000 /16780
processing-i

In [None]:
import statistics
from nltk.translate import bleu_score
from tqdm import tqdm
chencherry = bleu_score.SmoothingFunction()

# Evaluate perfect prediction & BLEU score of our approach
for k in [1, 3, 5, 10]:

    print('k candidates: ', k)
    path_targets = base + 'target.txt'
    path_predictions = base + 'our_predictions_' + str(k) + '.txt'

    tgt = [line.strip() for line in open(path_targets)]
    pred = [line.strip() for line in open(path_predictions)]

    count_perfect = 0
    BLEUscore = []
    for i in tqdm(range(len(tgt))):
        best_BLEU = 0
        target = tgt[i]
        for prediction in pred[i*k:i*k+k]:
            if " ".join(prediction.split()) == " ".join(target.split()):
                count_perfect += 1
                best_BLEU = bleu_score.sentence_bleu([target], prediction, smoothing_function=chencherry.method1)
                break
            current_BLEU = bleu_score.sentence_bleu([target], prediction, smoothing_function=chencherry.method1)
            if current_BLEU > best_BLEU:
                best_BLEU = current_BLEU
        BLEUscore.append(best_BLEU)

    print(f'PP    : %d/%d (%s%.2f)' % (count_perfect, len(tgt), '%', (count_perfect * 100) / len(tgt)))
    print(f'BLEU mean              : ', statistics.mean(BLEUscore))
    
    with open(base+"bleu_"+str(k) + '.txt', 'w') as fs:
        for bleu in BLEUscore:
            fs.write(str(bleu) + '\n')


k candidates:  1


100%|██████████| 16780/16780 [00:04<00:00, 4001.34it/s]


PP    : 470/16780 (%2.80)
BLEU mean              :  0.1241197518103609
k candidates:  3


100%|██████████| 16780/16780 [00:12<00:00, 1388.62it/s]


PP    : 566/16780 (%3.37)
BLEU mean              :  0.18330366896295064
k candidates:  5


100%|██████████| 16780/16780 [00:20<00:00, 834.90it/s]


PP    : 605/16780 (%3.61)
BLEU mean              :  0.20872034649762677
k candidates:  10


 67%|██████▋   | 11265/16780 [00:26<00:12, 432.31it/s]