# What is the impact of different dataset splitting strategies?

In [1]:
import pandas as pd
import os,json

# Models whose generated messages are evaluated; used as the DataFrame index
# in the split-by-timestamp and split-by-project cells.
model_name_list = [
    "CommitGen",
    "CoDiSum",
    "NMT",
    "NNGen",
    "PtrGNCMsg",
]
# Language directory names under MCMD/; the cells below pick one by position.
lan_list = [
    "java",
    "csharp",
    "cpp",
    "python",
    "javascript",
]

## Split by timestamp

### MCMD<sub>Java</sub>

In [2]:
# Score every model on the timestamp-split MCMD Java data (lan_list[0]) and
# collect the metrics in one table: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_time".format(lan_list[0])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,3.61,8.08,4.95,8.8,4.03,8.74,9.23
CoDiSum,1.64,12.71,4.97,14.94,4.09,14.72,10.17
NMT,6.09,9.5,7.47,11.21,4.75,11.13,13.86
NNGen,6.03,10.73,8.26,11.97,4.19,11.57,14.34
PtrGNCMsg,7.68,13.3,10.2,16.15,5.93,15.71,17.55


### MCMD<sub>C#</sub>

In [3]:
# Score every model on the timestamp-split MCMD C# data (lan_list[1]) and
# collect the metrics in one table: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_time".format(lan_list[1])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,2.84,4.53,2.7,7.07,3.86,7.04,5.97
CoDiSum,0.05,4.85,1.42,5.52,0.32,5.5,3.31
NMT,4.15,5.15,3.46,8.43,2.35,8.42,7.71
NNGen,5.33,3.08,5.38,8.86,3.5,8.67,10.45
PtrGNCMsg,6.54,3.28,6.27,11.78,4.36,11.55,12.18


### MCMD<sub>C++</sub>

In [4]:
# Score every model on the timestamp-split MCMD C++ data (lan_list[2]) and
# collect the metrics in one table: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_time".format(lan_list[2])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,3.88,7.08,5.3,9.84,6.21,9.8,8.96
CoDiSum,4.31,12.24,5.99,15.1,4.8,14.78,10.29
NMT,5.48,8.53,6.2,10.62,5.8,10.6,11.1
NNGen,4.81,9.3,6.92,10.83,4.49,10.53,12.29
PtrGNCMsg,4.62,10.94,7.91,13.93,5.34,13.5,14.33


### MCMD<sub>Python</sub>

In [5]:
# Score every model on the timestamp-split MCMD Python data (lan_list[3]) and
# collect the metrics in one table: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_time".format(lan_list[3])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,2.83,5.5,3.88,7.62,4.64,7.6,6.71
CoDiSum,1.23,12.46,4.62,16.66,4.48,16.19,10.06
NMT,4.87,7.31,6.17,8.45,5.06,8.42,11.17
NNGen,3.47,9.36,6.45,10.11,2.88,9.73,13.87
PtrGNCMsg,6.55,13.21,9.8,17.7,6.54,17.02,20.01


### MCMD<sub>JavaScript</sub>

In [6]:
# Score every model on the timestamp-split MCMD JavaScript data (lan_list[4])
# and collect the metrics in one table: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_time".format(lan_list[4])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,5.02,8.91,6.36,12.25,6.21,12.05,11.13
CoDiSum,1.16,11.17,3.62,12.74,1.97,12.43,7.81
NMT,7.89,11.58,9.61,14.58,7.87,14.44,15.54
NNGen,6.97,12.07,9.35,13.54,5.65,13.07,16.89
PtrGNCMsg,13.49,18.07,15.06,23.74,12.58,23.1,23.98


## Split by project

### MCMD<sub>Java</sub>

In [7]:
# Score every model on the project-split (split_repo) MCMD Java data
# (lan_list[0]) and collect the metrics: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_repo".format(lan_list[0])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,1.29,5.2,6.01,7.54,3.21,7.42,10.62
CoDiSum,1.7,10.23,4.04,12.65,2.75,12.47,8.29
NMT,8.49,7.94,7.25,11.41,7.86,11.37,10.51
NNGen,3.38,5.67,4.05,5.78,2.08,5.63,6.48
PtrGNCMsg,2.24,7.92,5.23,10.97,3.75,10.67,9.41


### MCMD<sub>C#</sub>

In [8]:
# Score every model on the project-split (split_repo) MCMD C# data
# (lan_list[1]) and collect the metrics: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_repo".format(lan_list[1])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,2.32,4.82,3.79,5.39,2.18,5.29,6.95
CoDiSum,1.03,8.43,2.91,9.74,1.96,9.57,6.44
NMT,8.41,5.95,4.94,8.22,4.25,8.19,7.92
NNGen,10.01,9.89,8.78,10.39,6.92,10.3,11.05
PtrGNCMsg,8.69,8.08,5.35,10.17,3.93,9.99,9.63


### MCMD<sub>C++</sub>

In [9]:
# Score every model on the project-split (split_repo) MCMD C++ data
# (lan_list[2]) and collect the metrics: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_repo".format(lan_list[2])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,2.28,4.47,3.5,4.9,3.12,4.89,5.15
CoDiSum,0.89,2.87,1.67,3.67,1.14,3.6,2.61
NMT,2.86,5.73,4.11,7.71,5.45,7.7,7.36
NNGen,0.88,3.9,2.3,3.89,0.68,3.79,4.91
PtrGNCMsg,1.52,6.29,3.84,8.9,2.43,8.61,8.03


### MCMD<sub>Python</sub>

In [10]:
# Score every model on the project-split (split_repo) MCMD Python data
# (lan_list[3]) and collect the metrics: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_repo".format(lan_list[3])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,4.02,7.61,4.55,7.9,4.26,7.88,9.31
CoDiSum,2.51,9.23,3.99,12.24,3.59,11.92,7.35
NMT,4.67,5.29,4.94,8.58,5.45,8.53,7.99
NNGen,0.92,4.66,2.6,4.8,0.88,4.69,5.4
PtrGNCMsg,2.42,8.79,5.17,13.46,4.85,13.03,10.93


### MCMD<sub>JavaScript</sub>

In [11]:
# Score every model on the project-split (split_repo) MCMD JavaScript data
# (lan_list[4]) and collect the metrics: rows = models, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=model_name_list)
dataset_name = "MCMD/{}/split_repo".format(lan_list[4])  # identical for every model
for model_name in model_name_list:
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    gen_path = "../experiment_results/model_{}/data_{}/gen.msg".format(model_name, dataset_name)
    # Values are stored with results.loc[row, column]: the chained form
    # results[column][row] writes into a temporary Series under pandas
    # Copy-on-Write and would leave the table empty. Every pipe is read inside
    # a `with` block so it is closed as soon as the output has been consumed.

    # B-Moses: the score is the text between the first "=" and the next ",".
    BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
    with os.popen(BM_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

    BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
    with os.popen(BN_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-Norm'] = float(pipe.read())

    BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(BC_evaluate_cmd) as pipe:
        results.loc[model_name, 'B-CC'] = float(pipe.read().strip())

    # Rouge output is presumably a single-quoted dict literal (TODO confirm);
    # the quotes are swapped so that json can parse it.
    Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Rouge_evaluate_cmd) as pipe:
        rouge_str = pipe.read().replace("'", "\"")
    rouge_dict = json.loads(rouge_str)
    results.loc[model_name, 'Rouge-1'] = rouge_dict['ROUGE-1']
    results.loc[model_name, 'Rouge-2'] = rouge_dict['ROUGE-2']
    results.loc[model_name, 'Rouge-L'] = rouge_dict['ROUGE-L']

    Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
    with os.popen(Meteor_evaluate_cmd) as pipe:
        results.loc[model_name, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
CommitGen,4.04,7.05,6.44,9.16,5.88,9.06,10.46
CoDiSum,3.63,8.02,4.59,10.77,3.78,10.61,7.54
NMT,7.12,7.39,7.09,11.1,7.6,11.01,11.14
NNGen,2.39,5.72,3.74,6.35,1.54,6.13,7.55
PtrGNCMsg,5.95,11.99,8.63,16.64,6.82,16.12,15.77


## NMT performance on a single repository of MCMD

### MCMD<sub>Java</sub>

In [12]:
# Score the single_repo MCMD Java outputs (lan_list[0]): three generated files
# ("in", "cross", "full") are each compared against the same reference file.
# Rows = settings, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=["in","cross", "full"])
# model_name_list[2:3] is a one-element slice, so only the third model in the
# list ("NMT" in the list defined at the top of the notebook) is evaluated.
for model_name in model_name_list[2:3]:
    dataset_name = "MCMD/{}/single_repo".format(lan_list[0])
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    for setting in ["in","cross", "full"]:
        gen_path = "../experiment_results/model_{}/data_{}/{}.gen.msg".format(model_name, dataset_name, setting)
        # Values are stored with results.loc[row, column]: the chained form
        # results[column][row] writes into a temporary Series under pandas
        # Copy-on-Write and would leave the table empty. Every pipe is read in
        # a `with` block so it is closed once the output has been consumed.

        # B-Moses: the score is the text between the first "=" and the next ",".
        BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
        with os.popen(BM_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

        BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
        with os.popen(BN_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Norm'] = float(pipe.read())

        BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(BC_evaluate_cmd) as pipe:
            results.loc[setting, 'B-CC'] = float(pipe.read().strip())

        # Rouge output is presumably a single-quoted dict literal (TODO
        # confirm); the quotes are swapped so that json can parse it.
        Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Rouge_evaluate_cmd) as pipe:
            rouge_str = pipe.read().replace("'", "\"")
        rouge_dict = json.loads(rouge_str)
        results.loc[setting, 'Rouge-1'] = rouge_dict['ROUGE-1']
        results.loc[setting, 'Rouge-2'] = rouge_dict['ROUGE-2']
        results.loc[setting, 'Rouge-L'] = rouge_dict['ROUGE-L']

        Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Meteor_evaluate_cmd) as pipe:
            results.loc[setting, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
in,3.79,9.35,6.74,9.62,2.44,9.32,13.02
cross,0.0,6.27,3.89,1.52,0.02,1.49,8.33
full,4.54,10.28,7.52,10.9,2.9,10.59,14.66


### MCMD<sub>C#</sub>

In [13]:
# Score the single_repo MCMD C# outputs (lan_list[1]): three generated files
# ("in", "cross", "full") are each compared against the same reference file.
# Rows = settings, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=["in","cross", "full"])
# model_name_list[2:3] is a one-element slice, so only the third model in the
# list ("NMT" in the list defined at the top of the notebook) is evaluated.
for model_name in model_name_list[2:3]:
    dataset_name = "MCMD/{}/single_repo".format(lan_list[1])
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    for setting in ["in","cross", "full"]:
        gen_path = "../experiment_results/model_{}/data_{}/{}.gen.msg".format(model_name, dataset_name, setting)
        # Values are stored with results.loc[row, column]: the chained form
        # results[column][row] writes into a temporary Series under pandas
        # Copy-on-Write and would leave the table empty. Every pipe is read in
        # a `with` block so it is closed once the output has been consumed.

        # B-Moses: the score is the text between the first "=" and the next ",".
        BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
        with os.popen(BM_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

        BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
        with os.popen(BN_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Norm'] = float(pipe.read())

        BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(BC_evaluate_cmd) as pipe:
            results.loc[setting, 'B-CC'] = float(pipe.read().strip())

        # Rouge output is presumably a single-quoted dict literal (TODO
        # confirm); the quotes are swapped so that json can parse it.
        Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Rouge_evaluate_cmd) as pipe:
            rouge_str = pipe.read().replace("'", "\"")
        rouge_dict = json.loads(rouge_str)
        results.loc[setting, 'Rouge-1'] = rouge_dict['ROUGE-1']
        results.loc[setting, 'Rouge-2'] = rouge_dict['ROUGE-2']
        results.loc[setting, 'Rouge-L'] = rouge_dict['ROUGE-L']

        Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Meteor_evaluate_cmd) as pipe:
            results.loc[setting, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
in,24.27,28.82,30.11,30.19,21.12,30.09,36.0
cross,0.0,6.48,3.17,1.58,0.0,1.55,6.72
full,35.99,40.04,42.52,39.69,35.47,39.6,42.83


### MCMD<sub>C++</sub>

In [14]:
# Score the single_repo MCMD C++ outputs (lan_list[2]): three generated files
# ("in", "cross", "full") are each compared against the same reference file.
# Rows = settings, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=["in","cross", "full"])
# model_name_list[2:3] is a one-element slice, so only the third model in the
# list ("NMT" in the list defined at the top of the notebook) is evaluated.
for model_name in model_name_list[2:3]:
    dataset_name = "MCMD/{}/single_repo".format(lan_list[2])
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    for setting in ["in","cross", "full"]:
        gen_path = "../experiment_results/model_{}/data_{}/{}.gen.msg".format(model_name, dataset_name, setting)
        # Values are stored with results.loc[row, column]: the chained form
        # results[column][row] writes into a temporary Series under pandas
        # Copy-on-Write and would leave the table empty. Every pipe is read in
        # a `with` block so it is closed once the output has been consumed.

        # B-Moses: the score is the text between the first "=" and the next ",".
        BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
        with os.popen(BM_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

        BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
        with os.popen(BN_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Norm'] = float(pipe.read())

        BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(BC_evaluate_cmd) as pipe:
            results.loc[setting, 'B-CC'] = float(pipe.read().strip())

        # Rouge output is presumably a single-quoted dict literal (TODO
        # confirm); the quotes are swapped so that json can parse it.
        Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Rouge_evaluate_cmd) as pipe:
            rouge_str = pipe.read().replace("'", "\"")
        rouge_dict = json.loads(rouge_str)
        results.loc[setting, 'Rouge-1'] = rouge_dict['ROUGE-1']
        results.loc[setting, 'Rouge-2'] = rouge_dict['ROUGE-2']
        results.loc[setting, 'Rouge-L'] = rouge_dict['ROUGE-L']

        Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Meteor_evaluate_cmd) as pipe:
            results.loc[setting, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
in,3.87,8.47,7.41,12.28,7.43,12.14,11.4
cross,0.11,2.08,0.76,0.54,0.07,0.54,1.62
full,3.72,8.45,7.38,13.1,8.96,13.03,11.0


### MCMD<sub>Python</sub>

In [15]:
# Score the single_repo MCMD Python outputs (lan_list[3]): three generated
# files ("in", "cross", "full") are each compared against the same reference
# file. Rows = settings, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=["in","cross", "full"])
# model_name_list[2:3] is a one-element slice, so only the third model in the
# list ("NMT" in the list defined at the top of the notebook) is evaluated.
for model_name in model_name_list[2:3]:
    dataset_name = "MCMD/{}/single_repo".format(lan_list[3])
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    for setting in ["in","cross", "full"]:
        gen_path = "../experiment_results/model_{}/data_{}/{}.gen.msg".format(model_name, dataset_name, setting)
        # Values are stored with results.loc[row, column]: the chained form
        # results[column][row] writes into a temporary Series under pandas
        # Copy-on-Write and would leave the table empty. Every pipe is read in
        # a `with` block so it is closed once the output has been consumed.

        # B-Moses: the score is the text between the first "=" and the next ",".
        BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
        with os.popen(BM_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

        BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
        with os.popen(BN_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Norm'] = float(pipe.read())

        BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(BC_evaluate_cmd) as pipe:
            results.loc[setting, 'B-CC'] = float(pipe.read().strip())

        # Rouge output is presumably a single-quoted dict literal (TODO
        # confirm); the quotes are swapped so that json can parse it.
        Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Rouge_evaluate_cmd) as pipe:
            rouge_str = pipe.read().replace("'", "\"")
        rouge_dict = json.loads(rouge_str)
        results.loc[setting, 'Rouge-1'] = rouge_dict['ROUGE-1']
        results.loc[setting, 'Rouge-2'] = rouge_dict['ROUGE-2']
        results.loc[setting, 'Rouge-L'] = rouge_dict['ROUGE-L']

        Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Meteor_evaluate_cmd) as pipe:
            results.loc[setting, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
in,1.43,7.28,4.96,7.2,1.77,7.0,9.18
cross,0.0,1.5,0.49,0.49,0.0,0.48,0.64
full,1.33,7.41,4.69,7.21,1.69,7.0,8.96


### MCMD<sub>JavaScript</sub>

In [16]:
# Score the single_repo MCMD JavaScript outputs (lan_list[4]): three generated
# files ("in", "cross", "full") are each compared against the same reference
# file. Rows = settings, columns = metrics.
results = pd.DataFrame(columns=['B-Moses', 'B-Norm', 'B-CC', 'Rouge-1', 'Rouge-2', 'Rouge-L', 'Meteor'], index=["in","cross", "full"])
# model_name_list[2:3] is a one-element slice, so only the third model in the
# list ("NMT" in the list defined at the top of the notebook) is evaluated.
for model_name in model_name_list[2:3]:
    dataset_name = "MCMD/{}/single_repo".format(lan_list[4])
    ref_path = "../experiment_results/model_{}/data_{}/ref.msg".format(model_name, dataset_name)
    for setting in ["in","cross", "full"]:
        gen_path = "../experiment_results/model_{}/data_{}/{}.gen.msg".format(model_name, dataset_name, setting)
        # Values are stored with results.loc[row, column]: the chained form
        # results[column][row] writes into a temporary Series under pandas
        # Copy-on-Write and would leave the table empty. Every pipe is read in
        # a `with` block so it is closed once the output has been consumed.

        # B-Moses: the score is the text between the first "=" and the next ",".
        BM_evaluate_cmd = "../metrics/B-Moses.perl {} < {}".format(ref_path, gen_path)
        with os.popen(BM_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Moses'] = float(pipe.read().split("=")[1].split(",")[0].strip())

        BN_evaluate_cmd = "python ../metrics/B-Norm.py {} < {}".format(ref_path, gen_path)
        with os.popen(BN_evaluate_cmd) as pipe:
            results.loc[setting, 'B-Norm'] = float(pipe.read())

        BC_evaluate_cmd = "python ../metrics/B-CC.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(BC_evaluate_cmd) as pipe:
            results.loc[setting, 'B-CC'] = float(pipe.read().strip())

        # Rouge output is presumably a single-quoted dict literal (TODO
        # confirm); the quotes are swapped so that json can parse it.
        Rouge_evaluate_cmd = "python ../metrics/Rouge.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Rouge_evaluate_cmd) as pipe:
            rouge_str = pipe.read().replace("'", "\"")
        rouge_dict = json.loads(rouge_str)
        results.loc[setting, 'Rouge-1'] = rouge_dict['ROUGE-1']
        results.loc[setting, 'Rouge-2'] = rouge_dict['ROUGE-2']
        results.loc[setting, 'Rouge-L'] = rouge_dict['ROUGE-L']

        Meteor_evaluate_cmd = "python ../metrics/Meteor.py --ref_path {} --gen_path {}".format(ref_path, gen_path)
        with os.popen(Meteor_evaluate_cmd) as pipe:
            results.loc[setting, 'Meteor'] = float(pipe.read().strip())
# Global pandas option: render floats with two decimals.
pd.options.display.float_format = '{:.2f}'.format
results

Unnamed: 0,B-Moses,B-Norm,B-CC,Rouge-1,Rouge-2,Rouge-L,Meteor
in,13.6,18.33,17.1,20.84,15.03,20.67,20.56
cross,0.0,2.23,0.56,1.24,0.0,1.23,1.47
full,13.98,18.92,17.69,21.42,15.77,21.25,21.07
