In [27]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score  
from sklearn.metrics import recall_score  
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn import neighbors
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
import matplotlib.pyplot as plt

In [2]:
def get_the_predicted_bugs(name, num1, num2):
    """Read a contiguous range of columns from a CSV file as a NumPy array.

    Parameters
    ----------
    name : str or file-like
        Passed unchanged to ``pandas.read_csv``.
    num1 : int
        Positional index of the first column to keep (inclusive).
    num2 : int
        Positional index one past the last column to keep (exclusive).

    Returns
    -------
    numpy.ndarray
        The values of the selected columns, one row per CSV data row.
    """
    wanted_columns = list(range(num1, num2))
    frame = pd.read_csv(name, usecols=wanted_columns)
    return frame.values

In [6]:
# Load CSV columns 2..201 (200 columns) as a NumPy array; later cells index it by position.
data = get_the_predicted_bugs('modified-eclipse-metrics-files-3.0.csv', 2, 202)

In [7]:
# Preview the loaded array (NumPy abbreviates large arrays in the repr).
data

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.02631579],
       [4.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.02757916],
       [2.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00821596],
       ...,
       [6.        , 0.        , 3.        , ..., 0.        , 0.        ,
        0.035     ],
       [0.        , 2.        , 0.        , ..., 0.        , 0.        ,
        0.02631579],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.0625    ]])

## Logistic Regression

## best f1_score

In [19]:
def change_to_binary(x):
    """Binarise a vector of non-negative counts: 0 stays 0, anything > 0 becomes 1.

    The input is left untouched; a new array with the same dtype is returned.
    (The previous loop-based version overwrote the caller's array in place and,
    on invalid input, printed a message and returned ``None`` after having
    already modified part of the array.)

    Parameters
    ----------
    x : array-like of numbers
        Values to binarise, e.g. one column of a data matrix.

    Returns
    -------
    numpy.ndarray
        Array of the same shape and dtype as ``np.asarray(x)`` containing only
        0 and 1.

    Raises
    ------
    ValueError
        If any entry is negative or NaN.
    """
    values = np.asarray(x)
    # `>= 0` is False for NaN as well as for negatives, so both are rejected here.
    if not np.all(values >= 0):
        raise ValueError('change_to_binary expects non-negative values only.')
    # Cast the boolean mask back so the labels keep the input's numeric dtype.
    return (values > 0).astype(values.dtype)

In [20]:
# Shuffle the rows with a fixed seed so this cell gives the same split after a
# kernel restart. `permutation` returns a shuffled copy, so `data` is left untouched.
data_random = np.random.RandomState(0).permutation(data)
x = data_random[:, 3:]  # features: every column from index 3 onward
# Column 1 is the one the author tagged "post"; it is binarised into the label.
y = change_to_binary(data_random[:, 1]) #post
# 80/20 hold-out split; the test part is only used for the final scoring cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
lr_clf = LogisticRegression()

In [23]:
# Two sub-grids because the L1 penalty is only paired with liblinear here, while
# L2 is tried with four solvers. Both share the same C values and class weights.
param_grid = [
    {
        'penalty' : [penalty_name],
        'C' : [0.1, 1, 10],
        'solver' : solver_names,
        'class_weight' : [
            {0 : weight_neg, 1 : weight_pos}
            for weight_neg, weight_pos in [(0.1, 0.9), (0.2, 0.8), (0.3, 0.7), (0.4, 0.6), (0.5, 0.5)]
        ],
    }
    for penalty_name, solver_names in [
        ('l1', ['liblinear']),
        ('l2', ['newton-cg', 'lbfgs', 'sag', 'liblinear']),
    ]
]
# 5-fold cross-validated search, ranked by F1, using all available cores.
grid_search = GridSearchCV(
    lr_clf,
    param_grid,
    scoring = 'f1',
    n_jobs = -1,
    cv = 5,
)

In [24]:
# Run the cross-validated search on the training split; the fitted search object is echoed below.
grid_search.fit(x_train, y_train)



GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'penalty': ['l1'], 'C': [0.1, 1, 10], 'solver': ['liblinear'], 'class_weight': [{0: 0.1, 1: 0.9}, {0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}, {0: 0.4, 1: 0.6}, {0: 0.5, 1: 0.5}]}, {'penalty': ['l2'], 'C': [0.1, 1, 10], 'solver': ['newton-cg', 'lbfgs', 'sag', 'liblinear'], 'class_weight': [{0: 0.1, 1: 0.9}, {0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}, {0: 0.4, 1: 0.6}, {0: 0.5, 1: 0.5}]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1', verbose=0)

In [25]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'C': 10,
 'class_weight': {0: 0.2, 1: 0.8},
 'penalty': 'l2',
 'solver': 'newton-cg'}

In [26]:
# The model refitted on the full training split with the best parameters.
grid_search.best_estimator_

LogisticRegression(C=10, class_weight={0: 0.2, 1: 0.8}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [28]:
# Predict on the held-out split with the refitted best model, then show
# (accuracy, recall, precision, F1) as a single tuple.
y_pre = grid_search.best_estimator_.predict(x_test)
tuple(
    metric(y_test, y_pre)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

(0.8206701274185937,
 0.5254777070063694,
 0.4166666666666667,
 0.4647887323943662)

## Average the metrics over 10 random splits

In [39]:
# Repeat the hold-out evaluation on 10 different shuffles of the data and collect
# one value per run for each metric.
accuracy_score_list = []
recall_score_list = []
precision_score_list = []
f1_score_list = []
# One seeded generator for the whole loop: each iteration draws a different
# permutation, but the sequence of permutations is the same on every run.
rng = np.random.RandomState(0)
for i in range(10):
    data_random = rng.permutation(data)
    x = data_random[:, 3:]
    y = change_to_binary(data_random[:, 1])
    # NOTE(review): this uses a 60/40 split, whereas the grid-search cell used 80/20.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
    # Parameters reported by the F1 grid search, with max_iter raised to 1000.
    # Only non-default arguments are passed; the constructor pasted from the repr
    # also spelled out defaults such as multi_class, which recent scikit-learn
    # releases deprecate.
    lr_clf = LogisticRegression(C=10, class_weight={0: 0.2, 1: 0.8}, max_iter=1000,
                                penalty='l2', solver='newton-cg')
    lr_clf.fit(x_train, y_train)
    y_pre = lr_clf.predict(x_test)
    accuracy_score_list.append(accuracy_score(y_test, y_pre))
    recall_score_list.append(recall_score(y_test, y_pre))
    precision_score_list.append(precision_score(y_test, y_pre))
    f1_score_list.append(f1_score(y_test, y_pre))
    print(i)  # progress indicator

0
1
2
3
4
5
6
7
8
9


In [40]:
# Mean accuracy over the runs collected in the loop.
np.mean(accuracy_score_list)

0.8074563473336479

In [41]:
# Mean recall over the runs collected in the loop.
np.mean(recall_score_list)

0.524849827575472

In [42]:
# Mean precision over the runs collected in the loop.
np.mean(precision_score_list)

0.3901973863632261

In [43]:
# Mean F1 over the runs collected in the loop.
np.mean(f1_score_list)

0.44716659832148836

In [45]:
# Share of positive (1) labels in the most recently built label vector `y`.
np.mean(y)

0.14802227886340036

## best accuracy_score

In [46]:
# NOTE(review): this name is also defined in an earlier cell of this notebook;
# this later definition shadows it for all cells that run afterwards.
def change_to_binary(x):
    """Binarise a vector of non-negative counts: 0 stays 0, anything > 0 becomes 1.

    The input is left untouched; a new array with the same dtype is returned.
    (The previous loop-based version overwrote the caller's array in place and,
    on invalid input, printed a message and returned ``None`` after having
    already modified part of the array.)

    Parameters
    ----------
    x : array-like of numbers
        Values to binarise, e.g. one column of a data matrix.

    Returns
    -------
    numpy.ndarray
        Array of the same shape and dtype as ``np.asarray(x)`` containing only
        0 and 1.

    Raises
    ------
    ValueError
        If any entry is negative or NaN.
    """
    values = np.asarray(x)
    # `>= 0` is False for NaN as well as for negatives, so both are rejected here.
    if not np.all(values >= 0):
        raise ValueError('change_to_binary expects non-negative values only.')
    # Cast the boolean mask back so the labels keep the input's numeric dtype.
    return (values > 0).astype(values.dtype)

In [81]:
# Shuffle the rows with a fixed seed so this cell gives the same split after a
# kernel restart. `permutation` returns a shuffled copy, so `data` is left untouched.
data_random = np.random.RandomState(0).permutation(data)
x = data_random[:, 3:]  # features: every column from index 3 onward
# Column 1 is the one the author tagged "post"; it is binarised into the label.
y = change_to_binary(data_random[:, 1]) #post
# 80/20 hold-out split; the test part is only used for the final scoring cell.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)
lr_clf = LogisticRegression()

In [82]:
# Single-configuration grid (L2, C=1, newton-cg, max_iter=1000); only the class
# weights vary, sweeping from favouring class 1 to favouring class 0.
param_grid = [
    {
        'max_iter' : [1000],
        'penalty' : ['l2'],
        'C' : [1],
        'solver' : ['newton-cg'],
        'class_weight' : [
            {0 : weight_neg, 1 : weight_pos}
            for weight_neg, weight_pos in [
                (0.1, 0.9), (0.2, 0.8), (0.3, 0.7), (0.4, 0.6), (0.5, 0.5),
                (0.6, 0.4), (0.7, 0.3), (0.8, 0.2), (0.9, 0.1),
            ]
        ],
    }
]
# 5-fold cross-validated search, ranked by accuracy, using all available cores.
grid_search = GridSearchCV(
    lr_clf,
    param_grid,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5,
)

In [83]:
# Run the cross-validated search on the training split; the fitted search object is echoed below.
grid_search.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'max_iter': [1000], 'penalty': ['l2'], 'C': [1], 'solver': ['newton-cg'], 'class_weight': [{0: 0.1, 1: 0.9}, {0: 0.2, 1: 0.8}, {0: 0.3, 1: 0.7}, {0: 0.4, 1: 0.6}, {0: 0.5, 1: 0.5}, {0: 0.6, 1: 0.4}, {0: 0.7, 1: 0.3}, {0: 0.8, 1: 0.2}, {0: 0.9, 1: 0.1}]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [84]:
# Parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'C': 1,
 'class_weight': {0: 0.5, 1: 0.5},
 'max_iter': 1000,
 'penalty': 'l2',
 'solver': 'newton-cg'}

In [85]:
# The model refitted on the full training split with the best parameters.
grid_search.best_estimator_

LogisticRegression(C=1, class_weight={0: 0.5, 1: 0.5}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=1000,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [86]:
# Predict on the held-out split with the refitted best model, then show
# (accuracy, recall, precision, F1) as a single tuple.
y_pre = grid_search.best_estimator_.predict(x_test)
tuple(
    metric(y_test, y_pre)
    for metric in (accuracy_score, recall_score, precision_score, f1_score)
)

(0.8617272298253893,
 0.21818181818181817,
 0.6728971962616822,
 0.3295194508009153)

## Average the metrics over 10 random splits

In [92]:
# Repeat the hold-out evaluation on 10 different shuffles of the data and collect
# one value per run for each metric.
accuracy_score_list = []
recall_score_list = []
precision_score_list = []
f1_score_list = []
# One seeded generator for the whole loop: each iteration draws a different
# permutation, but the sequence of permutations is the same on every run.
rng = np.random.RandomState(0)
for i in range(10):
    data_random = rng.permutation(data)
    x = data_random[:, 3:]
    y = change_to_binary(data_random[:, 1])
    # NOTE(review): this uses a 60/40 split, whereas the grid-search cell used 80/20.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.4, random_state = 0)
    # Parameters reported by the accuracy-scored grid search.
    lr_clf = LogisticRegression(C = 1, class_weight={0: 0.5, 1: 0.5}, max_iter=1000, penalty='l2', solver='newton-cg')
    lr_clf.fit(x_train, y_train)
    y_pre = lr_clf.predict(x_test)
    accuracy_score_list.append(accuracy_score(y_test, y_pre))
    recall_score_list.append(recall_score(y_test, y_pre))
    precision_score_list.append(precision_score(y_test, y_pre))
    f1_score_list.append(f1_score(y_test, y_pre))

In [93]:
# Per-run accuracy values followed by their mean.
accuracy_score_list, np.mean(accuracy_score_list)

([0.8603114676734309,
  0.8629070316186881,
  0.856536101934875,
  0.8588957055214724,
  0.8638508730533271,
  0.856536101934875,
  0.8581878244454931,
  0.8631429919773478,
  0.8591316658801321,
  0.8624351109013686],
 0.8601934874941011)

In [94]:
# Per-run recall values followed by their mean.
recall_score_list, np.mean(recall_score_list)

([0.19393939393939394,
  0.20294599018003273,
  0.19254658385093168,
  0.19937694704049844,
  0.2082018927444795,
  0.1951588502269289,
  0.2059282371294852,
  0.1858974358974359,
  0.1796875,
  0.21565495207667731],
 0.19793377830858636)

In [95]:
# Per-run precision values followed by their mean.
precision_score_list, np.mean(precision_score_list)

([0.6808510638297872,
  0.5688073394495413,
  0.5849056603773585,
  0.6037735849056604,
  0.6376811594202898,
  0.6292682926829268,
  0.5892857142857143,
  0.6170212765957447,
  0.6149732620320856,
  0.5947136563876652],
 0.6121281009966774)

In [96]:
# Per-run F1 values followed by their mean.
f1_score_list, np.mean(f1_score_list)

([0.3018867924528302,
  0.2991556091676719,
  0.28971962616822433,
  0.2997658079625293,
  0.3139120095124851,
  0.29792147806004615,
  0.30520231213872834,
  0.2857142857142857,
  0.2781136638452237,
  0.31652989449003516],
 0.29879214795120596)