In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import math
import time, pickle, math, warnings, os, operator
import string 
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate import bleu_score
import time

The following paths can be changed to point to your own dataset files.

In [2]:
# Locations of the training pairs and of the test sources/targets.
base = './dataset/'
path_train = base+'train.tsv'
path_test_source = base+'source.txt'
path_test_target = base+'target.txt'


def _read_stripped_lines(path):
    """Return every line of the text file at `path`, stripped of surrounding whitespace."""
    # The context manager closes the handle as soon as the lines are read,
    # so no file handle is left open for the lifetime of the kernel.
    with open(path) as handle:
        return [line.strip() for line in handle]


train_dataset = _read_stripped_lines(path_train)
source_test = _read_stripped_lines(path_test_source)
target_test = _read_stripped_lines(path_test_target)

# Every ASCII punctuation character except the double quote; these are the
# characters that get padded with spaces during preprocessing.
punctuations = string.punctuation.replace("\"","")

In [3]:
def processTrainDataset(train_dataset, punctuation_chars=None):
    """Split raw training lines into padded sources and their targets.

    Each line is split on tab characters: field 0 is the source and field 1
    is the target (any further fields are ignored). Every punctuation
    character in the source is surrounded with spaces; the target is kept
    unchanged.

    Args:
        train_dataset: iterable of strings of the form "<source>\\t<target>".
        punctuation_chars: characters to pad with spaces. Defaults to the
            notebook-level `punctuations` string when omitted.

    Returns:
        A `(source_train, target_train)` tuple of two equally long lists.

    Raises:
        IndexError: if a line contains no tab-separated target field.
    """
    if punctuation_chars is None:
        punctuation_chars = punctuations
    # The translation table does not depend on the line, so build it once
    # instead of once per training example.
    pad_table = str.maketrans({ch: " {0} ".format(ch) for ch in punctuation_chars})

    source_train = []
    target_train = []
    for line_no, data in enumerate(train_dataset):
        data_group = data.split("\t")
        if len(data_group) < 2:
            raise IndexError(
                "training line {0} has no tab-separated target: {1!r}".format(line_no, data)
            )
        source_train.append(data_group[0].translate(pad_table))
        target_train.append(data_group[1])
    return source_train, target_train

In [4]:
def processTestDataset(source_test, punctuation_chars=None):
    """Strip the task prefix from raw test sources and pad punctuation.

    Everything up to and including the first "code2comment :" marker is
    dropped from each line, and every punctuation character in the remainder
    is surrounded with spaces.

    Args:
        source_test: iterable of strings, each containing "code2comment :".
        punctuation_chars: characters to pad with spaces. Defaults to the
            notebook-level `punctuations` string when omitted.

    Returns:
        A list with one processed source string per input line.

    Raises:
        IndexError: if a line does not contain the "code2comment :" marker.
    """
    if punctuation_chars is None:
        punctuation_chars = punctuations
    # Built once: the table is identical for every line.
    pad_table = str.maketrans({ch: " {0} ".format(ch) for ch in punctuation_chars})
    marker = "code2comment :"

    new_source_test = []
    for line_no, data in enumerate(source_test):
        # partition() cuts at the first marker only, so a snippet that itself
        # contains the marker text is kept whole instead of being truncated
        # at the second occurrence.
        _, found, code = data.partition(marker)
        if not found:
            raise IndexError(
                "test line {0} is missing the {1!r} marker: {2!r}".format(line_no, marker, data)
            )
        new_source_test.append(code.translate(pad_table))
    return new_source_test

Process the raw data and transform it into bag-of-words (BOW) vectors.

In [5]:
# Strip the task prefix from the test sources and split the training lines
# into (source, target) lists.
# NOTE: `source_test` is overwritten with its processed form, so this cell is
# meant to be run once after the raw files have been loaded.
source_test = processTestDataset(source_test)
source_train, target_train = processTrainDataset(train_dataset)

# Learn the vocabulary on the training sources only (terms occurring in more
# than half of the training documents are dropped via max_df=0.5), then encode
# both splits as bag-of-words count matrices over that vocabulary.
data_count_vect = CountVectorizer(max_df=0.5)
train_data_vect = data_count_vect.fit_transform(source_train)
test_data_vect = data_count_vect.transform(source_test)

Text similarity techniques: Gestalt Pattern Matching

In [6]:
from strsimpy.levenshtein import Levenshtein
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.damerau import Damerau
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.metric_lcs import MetricLCS

In [7]:
def predictionResultByTextSimilarity(similarity, algorithm, name, time):
    """Re-rank the nearest training examples with a string-distance metric.

    For every test instance the (up to) 10 training examples with the highest
    value in `similarity` are taken as candidates. The candidate whose source
    has the smallest `algorithm().distance(...)` to the test source supplies
    the predicted target; on ties the candidate encountered first wins. The
    number of predictions that exactly match `target_test` and the total time
    cost are printed.

    Reads the notebook-level lists `source_test`, `source_train`,
    `target_train` and `target_test`.

    Args:
        similarity: 2-D array with one row per test instance and one column
            per training instance; larger values mean more similar.
        algorithm: zero-argument callable returning an object that exposes
            `distance(a, b)`.
        name: label used in the printed messages.
        time: number of seconds added to the measured prediction time in the
            final report (e.g. the time spent building `similarity`).

    Returns:
        None. Results are printed.
    """
    # The `time` parameter shadows the `time` module inside this function, so
    # the module is imported under a local alias for the wall-clock reads.
    import time as _clock

    print("start predicting", name)
    topk = 10
    start_time = _clock.time()
    prediction = []
    # Instantiate the selected text similarity technique.
    metric = algorithm()
    total = len(similarity)
    for index in range(total):
        row = similarity[index]
        # Never ask for more candidates than there are training examples.
        k = min(topk, len(row))
        index_nn = np.argpartition(row, -k)[-k:]
        if index % 1000 == 0:
            print("processing-instance ", index, "/" + str(total))
        # Start without a baseline so the best score always belongs to one of
        # this instance's own candidates.
        best_idx = None
        best_score = None
        for idx in index_nn:
            score = metric.distance(source_test[index], source_train[idx])
            if best_score is None or score < best_score:
                best_score = score
                best_idx = idx
        prediction.append(target_train[best_idx])

    # Count the perfect predictions (exact string match with the reference).
    perfect = sum(
        1 for position, predicted in enumerate(prediction)
        if predicted == target_test[position]
    )
    print(name, ":pp length is ", perfect, "time cost:", _clock.time() - start_time + time)

In [None]:
# RQ3: Run the text similarity component and report the perfect-prediction
# count and the computational time when best-candidate=1.

# Compute the cosine similarity between every test and training BOW vector.
cos_similarity_time = time.time()
similarity = cosine_similarity(test_data_vect, train_data_vect)
similarity_time = time.time() - cos_similarity_time

# Abbreviation of each technique in Figure 3 of the paper:
#   LD + Cosine   -> Levenshtein
#   NLD + Cosine  -> NormalizedLevenshtein
#   DLD + Cosine  -> Damerau
#   JWD + Cosine  -> JaroWinkler
#   MLCS + Cosine -> MetricLCS
# The elapsed duration (similarity_time) is passed, not the start timestamp
# (cos_similarity_time), so the reported time cost is the seconds actually
# spent on the similarity matrix plus the re-ranking.
for technique, label in (
    (Levenshtein, "Levenshtein"),
    (NormalizedLevenshtein, "NormalizedLevenshtein"),
    (Damerau, "Damerau"),
    (JaroWinkler, "JaroWinkler"),
    (MetricLCS, "MetricLCS"),
):
    predictionResultByTextSimilarity(similarity, technique, label, similarity_time)

In [6]:
# RQ3: Compute the distance-metric component and report the perfect-prediction count and computational time when best-candidate=1.
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib SequenceMatcher ratio of `a` and `b` (no junk filter)."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

def predictionResultByDistanceMetric(topk,similarity,name,distance_time,n_candidates=10):
    """Re-rank the nearest training examples with the `similar` ratio.

    For every test instance the `n_candidates` training examples with the
    smallest value in `similarity` are taken as candidates and ordered by
    descending `similar(test_source, train_source)`; candidates with equal
    ratios keep their distance order. The first of the `topk` best candidates
    supplies the predicted target. The number of predictions that exactly
    match `target_test` and the total time cost are printed.

    Reads the notebook-level lists `source_test`, `source_train`,
    `target_train` and `target_test`, and the `similar` function.

    Args:
        topk: how many re-ranked candidates to keep; only the first one is
            used as the prediction, so it must be at least 1.
        similarity: 2-D array of distances with one row per test instance and
            one column per training instance; smaller values mean closer.
        name: label used in the printed messages.
        distance_time: number of seconds added to the measured prediction
            time in the final report.
        n_candidates: size of the candidate pool taken from each row.

    Returns:
        None. Results are printed.

    Raises:
        IndexError: if no candidate remains after applying `topk`.
    """
    print("start predicting", name)
    prediction = []
    start_time = time.time()
    total = len(similarity)
    for index in range(total):
        if index % 1000 == 0:
            print("processing-instance ", index, "/" + str(total))
        # Ascending argsort: the closest training examples come first.
        index_nn = similarity[index].argsort()[:n_candidates]
        similar_nn = [
            (idx, similar(source_test[index], source_train[idx]))
            for idx in index_nn
        ]
        # Stable sort, so equally similar candidates stay in distance order.
        similar_nn.sort(key=lambda pair: pair[1], reverse=True)
        similar_topk = similar_nn[:topk]
        prediction.append(target_train[similar_topk[0][0]])

    # Count the perfect predictions (exact string match with the reference).
    perfect = sum(
        1 for position, predicted in enumerate(prediction)
        if predicted == target_test[position]
    )
    print(name, ":pp length is ", perfect, "time cost:", time.time() - start_time + distance_time)

In [7]:
# Euclidean distance + GPM: time the pairwise distance matrix, then re-rank
# the nearest candidates and keep the single best one.
distance_start_time = time.time()
similarity = euclidean_distances(test_data_vect, train_data_vect)
distance_time = time.time() - distance_start_time

predictionResultByDistanceMetric(
    topk=1,
    similarity=similarity,
    name="euc",
    distance_time=distance_time,
)

start predicting euc
processing-instance  0 /16780
processing-instance  1000 /16780
processing-instance  2000 /16780
processing-instance  3000 /16780
processing-instance  4000 /16780
processing-instance  5000 /16780
processing-instance  6000 /16780
processing-instance  7000 /16780
processing-instance  8000 /16780
processing-instance  9000 /16780
processing-instance  10000 /16780
processing-instance  11000 /16780
processing-instance  12000 /16780
processing-instance  13000 /16780
processing-instance  14000 /16780
processing-instance  15000 /16780
processing-instance  16000 /16780
euc :pp length is  414 time cost: 394.04774808883667


In [8]:
# Manhattan distance + GPM: time the pairwise distance matrix, then re-rank
# the nearest candidates and keep the single best one.
distance_start_time = time.time()
similarity = manhattan_distances(test_data_vect, train_data_vect)
distance_time = time.time() - distance_start_time

predictionResultByDistanceMetric(
    topk=1,
    similarity=similarity,
    name="manh",
    distance_time=distance_time,
)

start predicting manh
processing-instance  0 /16780
processing-instance  1000 /16780
processing-instance  2000 /16780
processing-instance  3000 /16780
processing-instance  4000 /16780
processing-instance  5000 /16780
processing-instance  6000 /16780
processing-instance  7000 /16780
processing-instance  8000 /16780
processing-instance  9000 /16780
processing-instance  10000 /16780
processing-instance  11000 /16780
processing-instance  12000 /16780
processing-instance  13000 /16780
processing-instance  14000 /16780
processing-instance  15000 /16780
processing-instance  16000 /16780
manh :pp length is  450 time cost: 392.708637714386
