In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score  
from sklearn.metrics import recall_score  
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.svm import OneClassSVM
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

In [2]:
def get_the_predicted_bugs(name, num1, num2):
    """Read a contiguous run of CSV columns and return them as an array.

    Parameters
    ----------
    name : path or buffer handed straight to ``pandas.read_csv``.
    num1 : position of the first column to keep (inclusive).
    num2 : position one past the last column to keep (exclusive).

    Returns
    -------
    The ``.values`` array of the loaded frame, one row per CSV record.
    """
    wanted_columns = list(range(num1, num2))
    frame = pd.read_csv(name, usecols=wanted_columns)
    return frame.values

In [3]:
# Keep column positions 2 through 201 of the metrics file (the upper bound is exclusive).
data = get_the_predicted_bugs(
    name='modified-eclipse-metrics-files-3.0.csv',
    num1=2,
    num2=202,
)

## Remove columns whose elements are all the same

In [4]:
def all_same(x):
    """Return True when ``x`` contains exactly one distinct value, else False."""
    distinct_values = set(x)
    return len(distinct_values) == 1

In [5]:
# Positions of the columns in `data` that hold one single repeated value.
all_same_index_list = [
    col for col in range(len(data[0])) if all_same(data[:, col])
]
print(all_same_index_list)
# Drop those constant columns; `data` itself is left untouched.
data_new = np.delete(data, all_same_index_list, axis=1)

[38, 47, 67, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 122, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198]


## Try different algorithms

In [6]:
def change_to_binary(x):
    """Binarise a sequence of non-negative counts in place.

    Every strictly positive entry is overwritten with 1; zeros are kept.

    Parameters
    ----------
    x : mutable, indexable sequence of numbers (e.g. a list or 1-D array).

    Returns
    -------
    The same object ``x`` after conversion, or ``None`` (after printing
    ``'Wrong.'``) when any entry is neither zero nor positive, i.e. is
    negative or NaN.

    Notes
    -----
    All entries are validated before anything is written, so an invalid
    input is left exactly as it was passed in instead of half-converted.
    """
    # `not (value >= 0)` rejects negatives and also NaN, which compares False
    # against everything.
    if any(not (value >= 0) for value in x):
        print('Wrong.')
        return None

    for position in range(len(x)):
        if x[position] > 0:
            x[position] = 1
    return x

In [7]:
def alg_try(times, required_data, method):
    """Repeatedly shuffle, split, fit and score a classifier.

    Each round permutes the rows of ``required_data``, takes columns 3 onward
    as features and column 1 (binarised with ``change_to_binary``) as the
    label, holds out 20% of the rows with ``train_test_split``, fits
    ``method`` on the rest and scores its predictions on the held-out part.

    Parameters
    ----------
    times : number of shuffle/fit/score rounds.
    required_data : 2-D array of samples.
    method : estimator exposing ``fit`` and ``predict``; the same object is
        refit on every round.

    Returns
    -------
    An 8-element list: the per-round accuracy, recall, precision and F1 lists
    (in that order), followed by the mean of each of those four lists.
    """
    # One history list per metric, in return order: accuracy, recall, precision, f1.
    histories = [[], [], [], []]
    for _ in range(times):
        shuffled = np.random.permutation(required_data)
        features = shuffled[:, 3:]
        labels = change_to_binary(shuffled[:, 1])
        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=0
        )
        method.fit(x_train, y_train)
        predictions = method.predict(x_test)
        round_scores = (
            accuracy_score(y_test, predictions),
            recall_score(y_test, predictions),
            precision_score(y_test, predictions),
            f1_score(y_test, predictions),
        )
        for history, score in zip(histories, round_scores):
            history.append(score)
    return histories + [np.mean(history) for history in histories]

## K-nearest neighbors

In [8]:
# Ten rounds with a k-nearest-neighbours classifier built with its default arguments.
alg_try(times=10, required_data=data_new, method=KNeighborsClassifier())

[[0.8404907975460123,
  0.8419065596979708,
  0.8338839075035394,
  0.8381311939594148,
  0.8489853704577631,
  0.8362435110901368,
  0.8461538461538461,
  0.8244454931571495,
  0.8409627182633318,
  0.8489853704577631],
 [0.22955974842767296,
  0.2392638036809816,
  0.22884012539184953,
  0.23624595469255663,
  0.21666666666666667,
  0.20967741935483872,
  0.24444444444444444,
  0.18911174785100288,
  0.24367088607594936,
  0.22330097087378642],
 [0.4397590361445783,
  0.4727272727272727,
  0.40782122905027934,
  0.40555555555555556,
  0.43333333333333335,
  0.38922155688622756,
  0.4666666666666667,
  0.4258064516129032,
  0.44,
  0.46308724832214765],
 [0.30165289256198347,
  0.31771894093686354,
  0.2931726907630522,
  0.29856850715746425,
  0.2888888888888889,
  0.2725366876310273,
  0.3208333333333333,
  0.26190476190476186,
  0.3136456211812627,
  0.3013100436681223],
 0.8400188768286927,
 0.2260781767459749,
 0.4343978350298965,
 0.297023236802676]

## Naïve Bayes

In [9]:
# Ten rounds with a Gaussian naive Bayes classifier built with its default arguments.
alg_try(times=10, required_data=data_new, method=GaussianNB())

[[0.8348277489381783,
  0.8409627182633318,
  0.8437942425672487,
  0.8390750353940538,
  0.8456819254365266,
  0.8419065596979708,
  0.8367154318074563,
  0.8480415290231241,
  0.8423784804152902,
  0.8480415290231241],
 [0.29780564263322884,
  0.28700906344410876,
  0.3256578947368421,
  0.29393939393939394,
  0.3108974358974359,
  0.2627986348122867,
  0.30618892508143325,
  0.29900332225913623,
  0.33440514469453375,
  0.3108974358974359],
 [0.4298642533936652,
  0.4846938775510204,
  0.44,
  0.47317073170731705,
  0.46411483253588515,
  0.39285714285714285,
  0.41409691629955947,
  0.44776119402985076,
  0.45021645021645024,
  0.47549019607843135],
 [0.35185185185185186,
  0.3605313092979127,
  0.3742911153119093,
  0.3626168224299065,
  0.3723608445297505,
  0.3149284253578732,
  0.35205992509363293,
  0.35856573705179284,
  0.3837638376383764,
  0.375968992248062],
 0.8421425200566304,
 0.3028602893395836,
 0.4472265594669322,
 0.3606938860811068]

In [10]:
# Ten rounds with a Bernoulli naive Bayes classifier built with its default arguments.
alg_try(times=10, required_data=data_new, method=BernoulliNB())

[[0.6210476639924493,
  0.6097215667767815,
  0.6186880604058518,
  0.6271826333176026,
  0.6134969325153374,
  0.6116092496460594,
  0.6262387918829636,
  0.6224634261444077,
  0.6172722982538933,
  0.6361491269466729],
 [0.8070175438596491,
  0.7523809523809524,
  0.7610062893081762,
  0.7645161290322581,
  0.7941176470588235,
  0.7573770491803279,
  0.7597402597402597,
  0.7785016286644951,
  0.7327044025157232,
  0.7886904761904762],
 [0.2724580454096742,
  0.24036511156186613,
  0.24845995893223818,
  0.24842767295597484,
  0.24324324324324326,
  0.2357142857142857,
  0.24579831932773108,
  0.24613800205973224,
  0.2429614181438999,
  0.27461139896373055],
 [0.407380073800738,
  0.36433512682551883,
  0.37461300309597517,
  0.375,
  0.37241379310344824,
  0.3595330739299611,
  0.37142857142857144,
  0.37402190923317685,
  0.36491777603758807,
  0.40737893927747887],
 0.620386974988202,
 0.7696052377931142,
 0.2498177456312376,
 0.37710222667324567]

## Logistic Regression

In [16]:
# Ten rounds of logistic regression; class 1 is weighted 7x class 0 (0.875 vs 0.125).
alg_try(
    times=10,
    required_data=data_new,
    method=LogisticRegression(class_weight={0: 0.125, 1: 0.875}),
)



[[0.7031618688060406,
  0.7229825389334592,
  0.7036337895233601,
  0.7064653138272771,
  0.7078810759792354,
  0.7182633317602642,
  0.7239263803680982,
  0.6956111373289288,
  0.7229825389334592,
  0.7210948560641812],
 [0.7329192546583851,
  0.6586102719033232,
  0.7142857142857143,
  0.75,
  0.706060606060606,
  0.7217125382262997,
  0.7018072289156626,
  0.7033333333333334,
  0.670846394984326,
  0.6962025316455697],
 [0.3029525032092426,
  0.315028901734104,
  0.3002610966057441,
  0.3189873417721519,
  0.3086092715231788,
  0.31805929919137466,
  0.3240611961057024,
  0.27509778357235987,
  0.3074712643678161,
  0.3076923076923077],
 [0.4287011807447775,
  0.426197458455523,
  0.4227941176470589,
  0.4476021314387212,
  0.4294930875576037,
  0.44153414405986907,
  0.44338725023786874,
  0.3955014058106842,
  0.4216748768472906,
  0.42677012609117365],
 0.7126002831524304,
 0.705577787401322,
 0.30782209657739823,
 0.42836557788905705]

## Decision Tree

In [21]:
# Ten rounds with a decision tree classifier built with its default arguments.
alg_try(times=10, required_data=data_new, method=DecisionTreeClassifier())

[[0.8013213780084946,
  0.7833883907503539,
  0.8065125058990089,
  0.7999056158565361,
  0.8069844266163284,
  0.7951864086833412,
  0.7970740915526192,
  0.8079282680509674,
  0.7975460122699386,
  0.8008494572911751],
 [0.33430232558139533,
  0.33762057877813506,
  0.3431372549019608,
  0.36923076923076925,
  0.3787878787878788,
  0.37216828478964403,
  0.3333333333333333,
  0.3825301204819277,
  0.3717948717948718,
  0.3384146341463415],
 [0.3745928338762215,
  0.29329608938547486,
  0.3343949044585987,
  0.35398230088495575,
  0.3799392097264438,
  0.323943661971831,
  0.33126934984520123,
  0.3860182370820669,
  0.332378223495702,
  0.3512658227848101],
 [0.3533026113671275,
  0.31390134529147984,
  0.33870967741935487,
  0.36144578313253006,
  0.37936267071320184,
  0.3463855421686747,
  0.3322981366459627,
  0.3842662632375189,
  0.35098335854765506,
  0.3447204968944099],
 0.7996696554978764,
 0.3561320051826257,
 0.3461080633511306,
 0.3505375885417915]

## Neural Networks  

In [80]:
def alg_try_sm(times, required_data, method):
    """Repeatedly shuffle, split, SMOTE-resample, fit and score a classifier.

    Each round permutes the rows of ``required_data``, takes columns 3 onward
    as features and column 1 (binarised with ``change_to_binary``) as the
    label, and holds out 20% of the rows with ``train_test_split``. Only the
    training part is passed through ``SMOTE().fit_resample`` before
    ``method`` is fitted; scoring uses the untouched held-out part.

    Parameters
    ----------
    times : number of shuffle/resample/fit/score rounds.
    required_data : 2-D array of samples.
    method : estimator exposing ``fit`` and ``predict``; the same object is
        refit on every round.

    Returns
    -------
    An 8-element list: the per-round accuracy, recall, precision and F1 lists
    (in that order), followed by the mean of each of those four lists.
    """
    # One history list per metric, in return order: accuracy, recall, precision, f1.
    histories = [[], [], [], []]
    for _ in range(times):
        shuffled = np.random.permutation(required_data)
        features = shuffled[:, 3:]
        labels = change_to_binary(shuffled[:, 1])
        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, test_size=0.2, random_state=0
        )
        # A new sampler per round, applied to the training split only.
        oversampler = SMOTE()
        x_balanced, y_balanced = oversampler.fit_resample(x_train, y_train)
        method.fit(x_balanced, y_balanced)
        predictions = method.predict(x_test)
        round_scores = (
            accuracy_score(y_test, predictions),
            recall_score(y_test, predictions),
            precision_score(y_test, predictions),
            f1_score(y_test, predictions),
        )
        for history, score in zip(histories, round_scores):
            history.append(score)
    return histories + [np.mean(history) for history in histories]

In [81]:
# Ten SMOTE-balanced rounds with a multilayer perceptron using two hidden layers of 50 units.
alg_try_sm(
    times=10,
    required_data=data_new,
    method=MLPClassifier(hidden_layer_sizes=(50, 50)),
)



[[0.7711184521000471,
  0.7645115620575743,
  0.7347805568664464,
  0.7914110429447853,
  0.7404436054742803,
  0.732420953279849,
  0.6956111373289288,
  0.7479943369513922,
  0.6479471448796602,
  0.7371401604530439],
 [0.4862068965517241,
  0.645367412140575,
  0.6171617161716172,
  0.6021505376344086,
  0.5785953177257525,
  0.5894039735099338,
  0.7207792207792207,
  0.6451612903225806,
  0.7516129032258064,
  0.6148648648648649],
 [0.29559748427672955,
  0.3423728813559322,
  0.29541864139020535,
  0.3366733466933868,
  0.2897822445561139,
  0.286634460547504,
  0.28425096030729835,
  0.32051282051282054,
  0.2583148558758315,
  0.2912],
 [0.36766623207301175,
  0.4473975636766334,
  0.39957264957264954,
  0.4318766066838046,
  0.38616071428571425,
  0.3856988082340195,
  0.4077134986225895,
  0.4282655246252677,
  0.3844884488448845,
  0.3952225841476656],
 0.7363378952336007,
 0.6251304132926485,
 0.3000757695515822,
 0.4034062630766241]