In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score  
from sklearn.metrics import recall_score  
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN 
from imblearn.combine import SMOTETomek

In [3]:
def get_the_predicted_bugs(name, num1, num2):
    """Read a slice of columns from a CSV file and return it as a NumPy array.

    Parameters
    ----------
    name : path or file-like object accepted by ``pd.read_csv``.
    num1 : zero-based position of the first column to keep (inclusive).
    num2 : zero-based position one past the last column to keep (exclusive).

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the loaded frame: one row per CSV record and one
        column per selected position.
    """
    wanted_positions = list(range(num1, num2))
    frame = pd.read_csv(name, usecols=wanted_positions)
    return frame.values

In [4]:
# Load column positions 2..201 of the metrics file of each release
# (2.0, 2.1 and 3.0) into its own NumPy array.
metrics_csv_files = (
    'modified-eclipse-metrics-files-2.0.csv',
    'modified-eclipse-metrics-files-2.1.csv',
    'modified-eclipse-metrics-files-3.0.csv',
)
data_20, data_21, data_30 = [
    get_the_predicted_bugs(csv_file, 2, 202) for csv_file in metrics_csv_files
]

## Remove columns whose values are all identical

In [5]:
def all_same(x):
    """Return True when the iterable ``x`` holds exactly one distinct value.

    An empty iterable has zero distinct values, so it yields False.
    """
    distinct_values = set(x)
    return len(distinct_values) == 1

In [6]:
def remove_col(data):
    """Drop every column of a 2-D array whose entries are all the same.

    Parameters
    ----------
    data : 2-D NumPy array; its first row is used to determine the number
        of columns.

    Returns
    -------
    list
        ``[all_same_index_list, data_new]`` where ``all_same_index_list``
        holds the positions of the dropped columns (ascending) and
        ``data_new`` is a copy of ``data`` without those columns.
    """
    column_count = len(data[0])
    all_same_index_list = [
        col for col in range(column_count) if all_same(data[:, col])
    ]
    data_new = np.delete(data, all_same_index_list, axis=1)
    return [all_same_index_list, data_new]

In [7]:
# Constant columns of each release, detected independently (kept so the
# per-release lists can still be inspected).
all_same_index_list_20 = remove_col(data_20)[0]
all_same_index_list_21 = remove_col(data_21)[0]
all_same_index_list_30 = remove_col(data_30)[0]

# Drop the SAME columns from every release. Filtering each release with its
# own list would shift the remaining feature columns differently whenever the
# lists disagree, so a model trained on one release would be scored on
# misaligned features of another. Using the union keeps all three arrays
# column-aligned; when the three lists are already identical the result is
# exactly what per-release filtering produced.
all_same_index_list_shared = sorted(
    set(all_same_index_list_20)
    | set(all_same_index_list_21)
    | set(all_same_index_list_30)
)
data_new_20 = np.delete(data_20, all_same_index_list_shared, axis=1)
data_new_21 = np.delete(data_21, all_same_index_list_shared, axis=1)
data_new_30 = np.delete(data_30, all_same_index_list_shared, axis=1)

## Try different algorithms

In [8]:
def change_to_binary(x):
    """Turn non-negative counts into binary labels: 0 stays 0, anything > 0 becomes 1.

    Parameters
    ----------
    x : array-like of numbers (the callers in this notebook pass one column
        of a NumPy array).

    Returns
    -------
    numpy.ndarray
        A new array with the same shape and dtype as ``np.asarray(x)``
        containing only 0 and 1. The argument itself is left untouched.

    Raises
    ------
    ValueError
        If any entry is negative or NaN. Previously the function printed
        'Wrong.' and returned None after having already overwritten part of
        the input, which only surfaced later as an unrelated error.
    """
    counts = np.asarray(x)
    # `~(counts >= 0)` flags negatives and also NaN, since NaN fails every
    # comparison.
    invalid = ~(counts >= 0)
    if invalid.any():
        first_bad = int(np.flatnonzero(invalid)[0])
        raise ValueError(
            'change_to_binary expects non-negative values; '
            'found %r at position %d' % (counts.ravel()[first_bad], first_bad)
        )
    return np.where(counts > 0, 1, 0).astype(counts.dtype)

In [9]:
def alg_try(x_train, x_test, y_train, y_test, times, method):
    """Fit ``method`` on the training data ``times`` times and score every fit.

    The same estimator object is re-fitted on each pass and evaluated on
    ``x_test`` / ``y_test``.

    Returns
    -------
    list
        Eight items: the per-run lists of accuracy, recall, precision and F1
        (in that order), followed by the mean of each of those four lists in
        the same order.
    """
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    per_run_scores = [[] for _ in scorers]
    for _ in range(times):
        model = method
        model.fit(x_train, y_train)
        predicted = model.predict(x_test)
        for scores, scorer in zip(per_run_scores, scorers):
            scores.append(scorer(y_test, predicted))
    return per_run_scores + [np.mean(scores) for scores in per_run_scores]

In [10]:
def alg_try_sm(x_train, x_test, y_train, y_test, times, method):
    """Like a plain repeated fit/score loop, but resample with SMOTEENN first.

    On each of the ``times`` passes a fresh ``SMOTEENN()`` resamples the
    training data, ``method`` is fitted on the resampled set, and the
    predictions for the untouched ``x_test`` are scored against ``y_test``.

    Returns
    -------
    list
        Eight items: the per-run lists of accuracy, recall, precision and F1
        (in that order), followed by the mean of each of those four lists in
        the same order.
    """
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    per_run_scores = [[] for _ in scorers]
    for _ in range(times):
        resampler = SMOTEENN()
        x_balanced, y_balanced = resampler.fit_resample(x_train, y_train)
        model = method
        model.fit(x_balanced, y_balanced)
        predicted = model.predict(x_test)
        for scores, scorer in zip(per_run_scores, scorers):
            scores.append(scorer(y_test, predicted))
    return per_run_scores + [np.mean(scores) for scores in per_run_scores]

In [11]:
def train_test_same_version(data, func, times, method):
    """Train and test on a single data set using an 80/20 split.

    The rows of ``data`` are shuffled with NumPy's global random generator,
    column 1 is converted to binary labels via ``change_to_binary``, columns
    3 onward are used as features, and the result is split with
    ``test_size=0.2`` and ``random_state=0``.

    Parameters
    ----------
    data : 2-D array.
    func : evaluation routine called as
        ``func(x_train, x_test, y_train, y_test, times, method)``.
    times, method : passed through to ``func`` unchanged.

    Returns
    -------
    Whatever ``func`` returns.
    """
    shuffled = np.random.permutation(data)
    labels = change_to_binary(shuffled[:, 1])
    features = shuffled[:, 3:]
    # train_test_split yields x_train, x_test, y_train, y_test in that order.
    split_parts = train_test_split(features, labels, test_size=0.2, random_state=0)
    return func(*split_parts, times, method)

In [12]:
def train_test_diff_version(data1, data2, func, times, method):
    """Train on one data set and test on another.

    Both arrays are shuffled with NumPy's global random generator (``data1``
    first, then ``data2``). In each, column 1 is converted to binary labels
    via ``change_to_binary`` and columns 3 onward are used as features.

    Parameters
    ----------
    data1 : 2-D array used for training.
    data2 : 2-D array used for testing.
    func : evaluation routine called as
        ``func(x_train, x_test, y_train, y_test, times, method)``.
    times, method : passed through to ``func`` unchanged.

    Returns
    -------
    Whatever ``func`` returns.
    """
    train_rows = np.random.permutation(data1)
    test_rows = np.random.permutation(data2)
    x_train, y_train = train_rows[:, 3:], change_to_binary(train_rows[:, 1])
    x_test, y_test = test_rows[:, 3:], change_to_binary(test_rows[:, 1])
    return func(x_train, x_test, y_train, y_test, times, method)

## LogisticRegression

In [114]:
# Evaluate class-weighted logistic regression for every (train, test) release
# pair. A pair with the same release on both sides uses the 80/20 split
# helper; every other pair trains on one release and tests on the other.
# A fresh estimator is built for each pair.
release_data = [('2.0', data_new_20), ('2.1', data_new_21), ('3.0', data_new_30)]
for train_release, train_data in release_data:
    for test_release, test_data in release_data:
        print('train ' + train_release + '; test ' + test_release)
        estimator = LogisticRegression(solver = 'liblinear', max_iter = 1000,
                                       class_weight = {0: 0.125, 1: 0.875})
        if train_release == test_release:
            outcome = train_test_same_version(train_data, alg_try, 10, estimator)
        else:
            outcome = train_test_diff_version(train_data, test_data, alg_try, 10, estimator)
        # Items 4..7 are the mean accuracy, recall, precision and F1.
        print(outcome[4:])

train 2.0; test 2.0
[0.7852897473997028, 0.7066666666666668, 0.4162303664921466, 0.5238879736408568]
train 2.0; test 2.1
[0.7184330628803244, 0.6288056206088993, 0.21999180663662435, 0.32594840667678293]
train 2.0; test 3.0
[0.7240630605116587, 0.6390306122448981, 0.2983030663888062, 0.40673838035315607]
train 2.1; test 2.0
[0.815722990043097, 0.5538461538461539, 0.4014869888475837, 0.4655172413793104]
train 2.1; test 2.1
[0.805449936628644, 0.5428571428571428, 0.2950310559006212, 0.38229376257545267]
train 2.1; test 3.0
[0.778344189559143, 0.48788265306122447, 0.3311688311688311, 0.39453326456936566]
train 3.0; test 2.0
[0.7278941893297668, 0.6964102564102566, 0.30668473351400183, 0.42583882094700537]
train 3.0; test 2.1
[0.6885141987829615, 0.6604215456674474, 0.20651775906261444, 0.31464435146443515]
train 3.0; test 3.0
[0.7074091552619158, 0.7060702875399361, 0.29506008010680906, 0.41619585687382293]


## RandomForest with SMOTEENN

In [123]:
# Evaluate a random forest trained on SMOTEENN-resampled data for every
# (train, test) release pair. A pair with the same release on both sides uses
# the 80/20 split helper; every other pair trains on one release and tests on
# the other. A fresh estimator is built for each pair.
release_data = [('2.0', data_new_20), ('2.1', data_new_21), ('3.0', data_new_30)]
for train_release, train_data in release_data:
    for test_release, test_data in release_data:
        print('train ' + train_release + '; test ' + test_release)
        estimator = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
        if train_release == test_release:
            outcome = train_test_same_version(train_data, alg_try_sm, 10, estimator)
        else:
            outcome = train_test_diff_version(train_data, test_data, alg_try_sm, 10, estimator)
        # Items 4..7 are the mean accuracy, recall, precision and F1.
        print(outcome[4:])

train 2.0; test 2.0
[0.8003714710252601, 0.7867052023121388, 0.3700141072581103, 0.5032695753311829]
train 2.0; test 2.1
[0.7578220081135904, 0.5798594847775175, 0.2419651048110132, 0.3414410142094335]
train 2.0; test 3.0
[0.7648730293590107, 0.5829081632653061, 0.3323012287021665, 0.42327645314914736]
train 2.1; test 2.0
[0.7957200178332592, 0.5346666666666666, 0.3614831817365686, 0.43132818789954985]
train 2.1; test 2.1
[0.7812420785804817, 0.6313725490196077, 0.25073963224075463, 0.3589043358531363]
train 2.1; test 3.0
[0.7710469177758897, 0.5131377551020407, 0.3262482372287185, 0.3988613155763586]
train 3.0; test 2.0
[0.7672016644375093, 0.6415384615384615, 0.33947535343434665, 0.44398669706289323]
train 3.0; test 2.1
[0.7360420892494929, 0.5903981264637002, 0.2254689854468806, 0.32630790429920886]
train 3.0; test 3.0
[0.7663048607833884, 0.6545180722891566, 0.3635305680662019, 0.467407268932023]


## RandomForest with SMOTE

In [13]:
def alg_try_smote(x_train, x_test, y_train, y_test, times, method):
    """Repeated fit/score loop that resamples the training data with SMOTE.

    On each of the ``times`` passes a fresh ``SMOTE()`` resamples the
    training data, ``method`` is fitted on the resampled set, and the
    predictions for the untouched ``x_test`` are scored against ``y_test``.

    Returns
    -------
    list
        Eight items: the per-run lists of accuracy, recall, precision and F1
        (in that order), followed by the mean of each of those four lists in
        the same order.
    """
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    per_run_scores = [[] for _ in scorers]
    for _ in range(times):
        resampler = SMOTE()
        x_balanced, y_balanced = resampler.fit_resample(x_train, y_train)
        model = method
        model.fit(x_balanced, y_balanced)
        predicted = model.predict(x_test)
        for scores, scorer in zip(per_run_scores, scorers):
            scores.append(scorer(y_test, predicted))
    return per_run_scores + [np.mean(scores) for scores in per_run_scores]

In [14]:
# Evaluate a random forest trained on SMOTE-resampled data for every
# (train, test) release pair. A pair with the same release on both sides uses
# the 80/20 split helper; every other pair trains on one release and tests on
# the other. A fresh estimator is built for each pair.
release_data = [('2.0', data_new_20), ('2.1', data_new_21), ('3.0', data_new_30)]
for train_release, train_data in release_data:
    for test_release, test_data in release_data:
        print('train ' + train_release + '; test ' + test_release)
        estimator = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
        if train_release == test_release:
            outcome = train_test_same_version(train_data, alg_try_smote, 10, estimator)
        else:
            outcome = train_test_diff_version(train_data, test_data, alg_try_smote, 10, estimator)
        # Items 4..7 are the mean accuracy, recall, precision and F1.
        print(outcome[4:])

train 2.0; test 2.0
[0.8844725111441308, 0.5203883495145631, 0.6542679666101474, 0.5796191986864663]
train 2.0; test 2.1
[0.8441556795131845, 0.35960187353629974, 0.31035943082922757, 0.33316731423771234]
train 2.0; test 3.0
[0.8377607854243369, 0.3683035714285714, 0.44232061146348833, 0.4019231773202324]
train 2.1; test 2.0
[0.8462178629811264, 0.22953846153846152, 0.4410700571649849, 0.3019398858736281]
train 2.1; test 2.1
[0.8927122940430925, 0.305421686746988, 0.48453110751653605, 0.37459323952008927]
train 2.1; test 3.0
[0.840016992353441, 0.2257015306122449, 0.42409156531866454, 0.29460630904852736]
train 3.0; test 2.0
[0.845905780948135, 0.31158974358974356, 0.45381780974190145, 0.3694682172014726]
train 3.0; test 2.1
[0.8481364097363082, 0.30597189695550353, 0.3015462398592653, 0.30374018570210215]
train 3.0; test 3.0
[0.855592260500236, 0.4162721893491125, 0.5642388662783888, 0.4790281762902593]
