In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score  
from sklearn.metrics import recall_score  
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN 
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler 
import openpyxl

In [2]:
def get_the_predicted_bugs(name, num1, num2):
    All_Data = pd.read_csv(name, usecols = [i for i in range(num1, num2)]).values
    return All_Data

In [3]:
data_Eclipse_JDT_Core = get_the_predicted_bugs('Eclipse_JDT_Core_all_data.csv', 2, 45)
data_Eclipse_PDE_UI = get_the_predicted_bugs('Eclipse_PDE_UI_all_data.csv', 2, 45)
data_Equinox_Framework = get_the_predicted_bugs('Equinox_Framework_all_data.csv', 2, 45)
data_Lucene = get_the_predicted_bugs('Lucene_all_data.csv', 2, 45)
data_Mylyn = get_the_predicted_bugs('Mylyn_all_data.csv', 2, 45)
dic = {'Eclipse_JDT_Core' : data_Eclipse_JDT_Core,
       'Eclipse_PDE_UI' : data_Eclipse_PDE_UI,
       'Equinox_Framework' : data_Equinox_Framework,
       'Lucene' : data_Lucene,
       'Mylyn' : data_Mylyn}

## Try different algorithms

In [4]:
def change_to_binary(x):
    for i in range(len(x)):
        if x[i] == 0:
            continue
        elif x[i] > 0:
            x[i] = 1
        else:
            print('Wrong.')
            return
    return x

In [5]:
def no_train_data_resample(a, b):
    return a, b

In [6]:
def SMOTE_train_data_resample(a, b):
    return SMOTE().fit_resample(a, b)

In [7]:
def SMOTEENN_train_data_resample(a, b):
    return SMOTEENN().fit_resample(a, b)

In [8]:
def RandomOverSampler_train_data_resample(a, b):
    return RandomOverSampler(random_state = 0).fit_resample(a, b)

In [9]:
def train_test_same_version(data, times, method, train_data_resample):
    accuracy_score_list = []
    recall_score_list = []
    precision_score_list = []
    f1_score_list = []
    for i in range(times):
        data_random = np.random.permutation(data)
        x = data_random[:, 0 : -1]
        y = change_to_binary(data_random[:, -1])
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
        
        ##
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        scaler.fit(x_test)
        x_test = scaler.transform(x_test)
        ##
        
        x_train_new, y_train_new = train_data_resample(x_train, y_train)
        clf = method
        clf.fit(x_train_new, y_train_new)
        y_pre = clf.predict(x_test)
        accuracy_score_list.append(accuracy_score(y_test, y_pre))
        recall_score_list.append(recall_score(y_test, y_pre))
        precision_score_list.append(precision_score(y_test, y_pre))
        f1_score_list.append(f1_score(y_test, y_pre))
    return [accuracy_score_list, recall_score_list, precision_score_list, f1_score_list, 
            np.mean(accuracy_score_list), np.mean(recall_score_list), 
            np.mean(precision_score_list), np.mean(f1_score_list),
            np.mean([np.mean(accuracy_score_list), np.mean(recall_score_list), np.mean(precision_score_list), np.mean(f1_score_list)])]

In [10]:
def train_test_diff_version(data1, data2, times, method, train_data_resample):
    accuracy_score_list = []
    recall_score_list = []
    precision_score_list = []
    f1_score_list = []
    for i in range(times):
        data1_random = np.random.permutation(data1)
        data2_random = np.random.permutation(data2)
        x_train = data1_random[:, 0 : -1]
        y_train = change_to_binary(data1_random[:, -1])
        x_test = data2_random[:, 0 : -1]
        y_test = change_to_binary(data2_random[:, -1])
        
        ##
        scaler = StandardScaler()
        scaler.fit(x_train)
        x_train = scaler.transform(x_train)
        scaler.fit(x_test)
        x_test = scaler.transform(x_test)
        ##
        
        x_train_new, y_train_new = train_data_resample(x_train, y_train)  
        clf = method
        clf.fit(x_train_new, y_train_new)
        y_pre = clf.predict(x_test)
        accuracy_score_list.append(accuracy_score(y_test, y_pre))
        recall_score_list.append(recall_score(y_test, y_pre))
        precision_score_list.append(precision_score(y_test, y_pre))
        f1_score_list.append(f1_score(y_test, y_pre))
    return [accuracy_score_list, recall_score_list, precision_score_list, f1_score_list, 
            np.mean(accuracy_score_list), np.mean(recall_score_list), 
            np.mean(precision_score_list), np.mean(f1_score_list),
            np.mean([np.mean(accuracy_score_list), np.mean(recall_score_list), np.mean(precision_score_list), np.mean(f1_score_list)])]

## LogisticRegression

In [11]:
wb = openpyxl.Workbook()
ws1 = wb.create_sheet("LogisticRegression")    
ws1.cell(row = 1, column = 1).value ='train set'
ws1.cell(row = 1, column = 2).value ='test set'
ws1.cell(row = 1, column = 3).value ='accuracy score'
ws1.cell(row = 1, column = 4).value ='recall score'
ws1.cell(row = 1, column = 5).value ='precision score'
ws1.cell(row = 1, column = 6).value ='f1 score'
ws1.cell(row = 1, column = 7).value ='0.25(Accuracy+Recall+Precision+F1)'

for n1, i in enumerate(dic):
    for n2, j in enumerate(dic):
        if i == j:
            tmp1 = train_test_same_version(dic[i], 10, LogisticRegression(solver = 'liblinear', max_iter = 1000, class_weight = 'balanced'), no_train_data_resample)
            print(i, j)
            print(tmp1[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp1[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp1[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp1[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp1[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp1[8]
        else:
            tmp2 = train_test_diff_version(dic[i], dic[j], 10, LogisticRegression(solver = 'liblinear', max_iter = 1000, class_weight = 'balanced'), no_train_data_resample)
            print(i, j)
            print(tmp2[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp2[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp2[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp2[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp2[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp2[8]

wb.save("result.xlsx")

Eclipse_JDT_Core Eclipse_JDT_Core
[0.7975, 0.6483840640462876, 0.5378042196311597, 0.585533456881284, 0.6423054351396829]
Eclipse_JDT_Core Eclipse_PDE_UI
[0.72812291249165, 0.5550239234449761, 0.2697674418604651, 0.36306729264475746, 0.4789953926104621]
Eclipse_JDT_Core Equinox_Framework
[0.6913580246913579, 0.40310077519379844, 0.6933333333333334, 0.5098039215686274, 0.5743990136967793]
Eclipse_JDT_Core Lucene
[0.7221418234442838, 0.65625, 0.19811320754716982, 0.30434782608695654, 0.47021321426960255]
Eclipse_JDT_Core Mylyn
[0.7062298603651989, 0.5306122448979592, 0.23131672597864766, 0.322180916976456, 0.4475849370545654]
Eclipse_PDE_UI Eclipse_JDT_Core
[0.7632898696088264, 0.5776699029126213, 0.44402985074626866, 0.5021097046413503, 0.5717748319772666]
Eclipse_PDE_UI Eclipse_PDE_UI
[0.7383333333333334, 0.5876540426198552, 0.2942175564509385, 0.3914291288986057, 0.5029085153256831]
Eclipse_PDE_UI Equinox_Framework
[0.6913580246913579, 0.4108527131782946, 0.6883116883116883, 0.5145631

## MLPClassifier with RandomOverSampler

In [12]:
wb = openpyxl.load_workbook('result.xlsx')
ws1 = wb.create_sheet("MLP with OverSample")    
ws1.cell(row = 1, column = 1).value ='train set'
ws1.cell(row = 1, column = 2).value ='test set'
ws1.cell(row = 1, column = 3).value ='accuracy score'
ws1.cell(row = 1, column = 4).value ='recall score'
ws1.cell(row = 1, column = 5).value ='precision score'
ws1.cell(row = 1, column = 6).value ='f1 score'
ws1.cell(row = 1, column = 7).value ='0.25(Accuracy+Recall+Precision+F1)'

for n1, i in enumerate(dic):
    for n2, j in enumerate(dic):
        if i == j:
            tmp1 = train_test_same_version(dic[i], 10, MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes = (100, 100)), RandomOverSampler_train_data_resample)
            print(i, j)
            print(tmp1[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp1[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp1[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp1[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp1[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp1[8]
        else:
            tmp2 = train_test_same_version(dic[i], 10, MLPClassifier(solver = 'lbfgs', alpha = 1e-5, hidden_layer_sizes = (100, 100)), RandomOverSampler_train_data_resample)
            print(i, j)
            print(tmp2[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp2[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp2[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp2[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp2[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp2[8]

wb.save("result.xlsx")

Eclipse_JDT_Core Eclipse_JDT_Core
[0.8009999999999999, 0.5395853427808136, 0.5572785339519619, 0.5417842460337974, 0.6099120306916432]
Eclipse_JDT_Core Eclipse_PDE_UI
[0.8205, 0.5423524942595697, 0.5489717440650008, 0.5417297729488821, 0.6133885028183632]
Eclipse_JDT_Core Equinox_Framework
[0.8130000000000001, 0.5894711113704367, 0.5897892604915561, 0.5819209244938405, 0.6435453240889583]
Eclipse_JDT_Core Lucene
[0.788, 0.5186468941035856, 0.5001655894765393, 0.507223166747973, 0.5785089125820244]
Eclipse_JDT_Core Mylyn
[0.7965000000000001, 0.5621662639685895, 0.5113030104452098, 0.5340986171690774, 0.6010169728957192]
Eclipse_PDE_UI Eclipse_JDT_Core
[0.8009999999999999, 0.3367782645189964, 0.3075603322581064, 0.3144195281577081, 0.4399395312337027]
Eclipse_PDE_UI Eclipse_PDE_UI
[0.7896666666666666, 0.3288676495978867, 0.280979558392148, 0.30169651526353514, 0.4253025974800591]
Eclipse_PDE_UI Equinox_Framework
[0.7936666666666666, 0.40147962143158356, 0.30183824310877966, 0.33807385893

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Mylyn Eclipse_JDT_Core
[0.7219839142091152, 0.40449111273519145, 0.29105391923227564, 0.28751161168533396, 0.4262601394654791]
Mylyn Eclipse_PDE_UI
[0.6319034852546916, 0.40151736949983763, 0.2494360518523445, 0.24950565737334865, 0.38309064099505563]
Mylyn Equinox_Framework
[0.7184986595174262, 0.3358164441495409, 0.22874006797907026, 0.24610627630873574, 0.3822903619886933]
Mylyn Lucene
[0.6345844504021447, 0.3848964528059035, 0.16615595433858749, 0.21626399338102525, 0.3504752127319153]
Mylyn Mylyn
[0.6951742627345844, 0.3431793260125301, 0.27057951295641725, 0.2679155688392983, 0.3942121676357075]


## RandomForest with RandomOverSampler

In [13]:
wb = openpyxl.load_workbook('result.xlsx')
ws1 = wb.create_sheet("RF with OverSample")    
ws1.cell(row = 1, column = 1).value ='train set'
ws1.cell(row = 1, column = 2).value ='test set'
ws1.cell(row = 1, column = 3).value ='accuracy score'
ws1.cell(row = 1, column = 4).value ='recall score'
ws1.cell(row = 1, column = 5).value ='precision score'
ws1.cell(row = 1, column = 6).value ='f1 score'
ws1.cell(row = 1, column = 7).value ='0.25(Accuracy+Recall+Precision+F1)'

for n1, i in enumerate(dic):
    for n2, j in enumerate(dic):
        if i == j:
            tmp1 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), RandomOverSampler_train_data_resample)
            print(i, j)
            print(tmp1[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp1[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp1[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp1[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp1[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp1[8]
        else:
            tmp2 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), RandomOverSampler_train_data_resample)
            print(i, j)
            print(tmp2[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp2[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp2[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp2[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp2[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp2[8]

wb.save("result.xlsx")

Eclipse_JDT_Core Eclipse_JDT_Core
[0.8545, 0.5668764959183883, 0.7192274092201062, 0.6296454593759491, 0.6925623411286108]
Eclipse_JDT_Core Eclipse_PDE_UI
[0.8574999999999999, 0.559179134475326, 0.7106758993077125, 0.6215448841882549, 0.6872249794928234]
Eclipse_JDT_Core Equinox_Framework
[0.8535, 0.5425608609606891, 0.6988812228063795, 0.6067452823075075, 0.6754218415186442]
Eclipse_JDT_Core Lucene
[0.86, 0.5799618612008337, 0.7029630236591188, 0.6329824178119695, 0.6939768256679805]
Eclipse_JDT_Core Mylyn
[0.851, 0.5363244258708934, 0.7094135749308162, 0.6079542745485963, 0.6761730688375764]
Eclipse_PDE_UI Eclipse_JDT_Core
[0.8700000000000001, 0.26973775017253276, 0.61638701700776, 0.36914415483406837, 0.5313172305035903]
Eclipse_PDE_UI Eclipse_PDE_UI
[0.8693333333333333, 0.26961044651960514, 0.5811230150475002, 0.3664455653981793, 0.5216280900746545]
Eclipse_PDE_UI Equinox_Framework
[0.8653333333333333, 0.2345142185163393, 0.5777611531616108, 0.32997588638519354, 0.5018961478491192]

## RandomForest with SMOTEENN

In [14]:
wb = openpyxl.load_workbook('result.xlsx')
ws1 = wb.create_sheet("RF with SMOTEENN")    
ws1.cell(row = 1, column = 1).value ='train set'
ws1.cell(row = 1, column = 2).value ='test set'
ws1.cell(row = 1, column = 3).value ='accuracy score'
ws1.cell(row = 1, column = 4).value ='recall score'
ws1.cell(row = 1, column = 5).value ='precision score'
ws1.cell(row = 1, column = 6).value ='f1 score'
ws1.cell(row = 1, column = 7).value ='0.25(Accuracy+Recall+Precision+F1)'

for n1, i in enumerate(dic):
    for n2, j in enumerate(dic):
        if i == j:
            tmp1 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), SMOTEENN_train_data_resample)
            print(i, j)
            print(tmp1[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp1[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp1[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp1[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp1[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp1[8]
        else:
            tmp2 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), SMOTEENN_train_data_resample)
            print(i, j)
            print(tmp2[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp2[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp2[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp2[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp2[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp2[8]

wb.save("result.xlsx")

Eclipse_JDT_Core Eclipse_JDT_Core
[0.7795, 0.779454294279846, 0.4741137081518286, 0.5864285733508326, 0.6548741439456268]
Eclipse_JDT_Core Eclipse_PDE_UI
[0.782, 0.7505944478312899, 0.4759622715659172, 0.580553834797118, 0.6472776385485812]
Eclipse_JDT_Core Equinox_Framework
[0.7869999999999999, 0.7456545898815989, 0.5030132493418144, 0.5940094144585232, 0.6574193134204841]
Eclipse_JDT_Core Lucene
[0.7789999999999999, 0.7654648537329868, 0.4700438130214426, 0.5810104615312911, 0.64887978207143]
Eclipse_JDT_Core Mylyn
[0.784, 0.7685986237193381, 0.5069612663954082, 0.6083593223216547, 0.6669798031091003]
Eclipse_PDE_UI Eclipse_JDT_Core
[0.6716666666666666, 0.7174698563810827, 0.27238600431202603, 0.39346017687277435, 0.5137456760581374]
Eclipse_PDE_UI Eclipse_PDE_UI
[0.679, 0.6838411715557604, 0.26306998226656375, 0.3777785006085469, 0.5009224136077177]
Eclipse_PDE_UI Equinox_Framework
[0.6733333333333333, 0.7342323710424384, 0.25519721566665815, 0.37650845009462786, 0.5098178425342645]

## RandomForest with SMOTE

In [15]:
wb = openpyxl.load_workbook('result.xlsx')
ws1 = wb.create_sheet("RF with SMOTE")    
ws1.cell(row = 1, column = 1).value ='train set'
ws1.cell(row = 1, column = 2).value ='test set'
ws1.cell(row = 1, column = 3).value ='accuracy score'
ws1.cell(row = 1, column = 4).value ='recall score'
ws1.cell(row = 1, column = 5).value ='precision score'
ws1.cell(row = 1, column = 6).value ='f1 score'
ws1.cell(row = 1, column = 7).value ='0.25(Accuracy+Recall+Precision+F1)'

for n1, i in enumerate(dic):
    for n2, j in enumerate(dic):
        if i == j:
            tmp1 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), SMOTE_train_data_resample)
            print(i, j)
            print(tmp1[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp1[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp1[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp1[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp1[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp1[8]
        else:
            tmp2 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), SMOTE_train_data_resample)
            print(i, j)
            print(tmp2[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp2[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp2[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp2[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp2[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp2[8]

wb.save("result.xlsx")

Eclipse_JDT_Core Eclipse_JDT_Core
[0.8254999999999999, 0.635311109758361, 0.5890856929402533, 0.6076647171854398, 0.6643903799710135]
Eclipse_JDT_Core Eclipse_PDE_UI
[0.834, 0.6432529679675102, 0.607144162478811, 0.6158781297486741, 0.6750688150487488]
Eclipse_JDT_Core Equinox_Framework
[0.8409999999999999, 0.6587850310605727, 0.6039659979096925, 0.6269725482656316, 0.6826808943089742]
Eclipse_JDT_Core Lucene
[0.8345, 0.6646196049470031, 0.5713313925547967, 0.6086433395079698, 0.6697735842524424]
Eclipse_JDT_Core Mylyn
[0.8324999999999999, 0.6634724998939362, 0.6113441275118084, 0.6295934219898297, 0.6842275123488936]
Eclipse_PDE_UI Eclipse_JDT_Core
[0.727, 0.6115897097267071, 0.27340352213721403, 0.3691941389480643, 0.49529684270299634]
Eclipse_PDE_UI Eclipse_PDE_UI
[0.7646666666666667, 0.5273882498265323, 0.32523687394862666, 0.3922233564131833, 0.5023787867137522]
Eclipse_PDE_UI Equinox_Framework
[0.7256666666666666, 0.5823901661300855, 0.30286919669984985, 0.39087309662741926, 0.50

## Just RandomForest

In [16]:
wb = openpyxl.load_workbook('result.xlsx')
ws1 = wb.create_sheet("RF")    
ws1.cell(row = 1, column = 1).value ='train set'
ws1.cell(row = 1, column = 2).value ='test set'
ws1.cell(row = 1, column = 3).value ='accuracy score'
ws1.cell(row = 1, column = 4).value ='recall score'
ws1.cell(row = 1, column = 5).value ='precision score'
ws1.cell(row = 1, column = 6).value ='f1 score'
ws1.cell(row = 1, column = 7).value ='0.25(Accuracy+Recall+Precision+F1)'

for n1, i in enumerate(dic):
    for n2, j in enumerate(dic):
        if i == j:
            tmp1 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), no_train_data_resample)
            print(i, j)
            print(tmp1[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp1[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp1[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp1[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp1[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp1[8]
        else:
            tmp2 = train_test_same_version(dic[i], 10, RandomForestClassifier(n_estimators = 200, criterion = 'entropy'), no_train_data_resample)
            print(i, j)
            print(tmp2[4:])
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 1).value = i
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 2).value = j
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 3).value = tmp2[4]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 4).value = tmp2[5]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 5).value = tmp2[6]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 6).value = tmp2[7]
            ws1.cell(row = 2 + len(dic) * n1 + n2, column = 7).value = tmp2[8]

wb.save("result.xlsx")

Eclipse_JDT_Core Eclipse_JDT_Core
[0.867, 0.4991913761925891, 0.7804859933064703, 0.6070869147238698, 0.6884410710557323]
Eclipse_JDT_Core Eclipse_PDE_UI
[0.8515, 0.4790927612975078, 0.7471144743064666, 0.5816268826945235, 0.6648335295746246]
Eclipse_JDT_Core Equinox_Framework
[0.8535, 0.4739667650169337, 0.7352746939337595, 0.5737697642745224, 0.6591278058063039]
Eclipse_JDT_Core Lucene
[0.8615, 0.496245013558949, 0.7538457617651166, 0.5935012678689034, 0.6762730107982423]
Eclipse_JDT_Core Mylyn
[0.867, 0.5156706254170953, 0.7125671868980692, 0.5954432103839953, 0.6726702556747899]
Eclipse_PDE_UI Eclipse_JDT_Core
[0.8676666666666666, 0.1773134061221642, 0.7100310873337189, 0.27687490957614797, 0.5079715174246744]
Eclipse_PDE_UI Eclipse_PDE_UI
[0.8756666666666668, 0.19963871418971874, 0.6345323149502716, 0.29798092102126705, 0.501954654206981]
Eclipse_PDE_UI Equinox_Framework
[0.8616666666666667, 0.16487704245969154, 0.5675775613275614, 0.247902810387094, 0.4605060202102534]
Eclipse_PD