# ECIP Models for Software Defect Prediction Dataset

## Import

In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import pandas as pd
import numpy as np
from collections import Counter
import random
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

import math


In [2]:
df = pd.read_csv("./nasa.csv")
df.head()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
2,72.0,7.0,1.0,6.0,198.0,1134.13,0.05,20.31,55.85,23029.1,...,51,10,8,1,17.0,36.0,112.0,86.0,13.0,True
3,190.0,3.0,1.0,3.0,600.0,4348.76,0.06,17.06,254.87,74202.67,...,129,29,28,2,17.0,135.0,329.0,271.0,5.0,True
4,37.0,4.0,1.0,4.0,126.0,599.12,0.06,17.19,34.86,10297.3,...,28,1,6,0,11.0,16.0,76.0,50.0,7.0,True


## Data Preprocessing

In [3]:
df.corr()

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,b,t,lOCode,lOComment,lOBlank,locCodeAndComment,defects
loc,1.0,0.817757,0.517551,0.784057,0.881795,0.900293,-0.286587,0.689543,0.499946,0.750564,0.899965,0.750564,0.921918,0.612858,0.803573,0.278119,0.245388
v(g),0.817757,1.0,0.70171,0.85959,0.730781,0.759881,-0.252902,0.669057,0.303031,0.709501,0.759635,0.709501,0.799915,0.384506,0.538366,0.209811,0.208644
ev(g),0.517551,0.70171,1.0,0.639574,0.465992,0.445902,-0.233982,0.434009,0.213211,0.315538,0.445693,0.315538,0.454604,0.294208,0.338243,0.190911,0.172973
iv(g),0.784057,0.85959,0.639574,1.0,0.702415,0.743193,-0.197736,0.575369,0.309717,0.757702,0.743013,0.757702,0.775873,0.351583,0.541296,0.207028,0.181984
n,0.881795,0.730781,0.465992,0.702415,1.0,0.984276,-0.240749,0.808113,0.651209,0.716536,0.983938,0.716536,0.944383,0.596374,0.798561,0.284391,0.204143
v,0.900293,0.759881,0.445902,0.743193,0.984276,1.0,-0.198104,0.752206,0.598743,0.8,0.999696,0.8,0.962078,0.576844,0.79233,0.266537,0.189136
l,-0.286587,-0.252902,-0.233982,-0.197736,-0.240749,-0.198104,1.0,-0.347215,-0.166801,-0.062026,-0.196147,-0.062026,-0.218373,-0.165885,-0.22367,-0.106117,-0.164917
d,0.689543,0.669057,0.434009,0.575369,0.808113,0.752206,-0.347215,1.0,0.398162,0.574298,0.751835,0.574298,0.768188,0.502121,0.637211,0.253793,0.169629
i,0.499946,0.303031,0.213211,0.309717,0.651209,0.598743,-0.166801,0.398162,1.0,0.209268,0.598341,0.209268,0.56392,0.392551,0.572352,0.21781,0.192831
e,0.750564,0.709501,0.315538,0.757702,0.716536,0.8,-0.062026,0.574298,0.209268,1.0,0.799868,1.0,0.80907,0.384806,0.600649,0.148693,0.086036


In [4]:
for x in df:
    df.drop(df.loc[df[x]=='?'].index, inplace=True)
            

In [5]:
for i, row in df.iterrows():
    if df.at[i,'defects'] == True:
        a = random.choice([0,1])
        if a==0:
            df.drop(i, inplace=True)

print(Counter(df["defects"]))
df["defects"].replace({False: 0, True: 1}, inplace=True)

Counter({False: 8777, True: 1019})


In [6]:
(Counter(df['defects'])[1]/Counter(df['defects'])[0])*100

11.60988948387832

In [8]:
X = df.drop(columns=["defects"])
y = df.defects
print(type(y[0]))
print(y[0])

<class 'numpy.int64'>
0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
print(y_test.value_counts())
print(y_train.value_counts())

0    1752
1     108
Name: defects, dtype: int64
0    7025
1     414
Name: defects, dtype: int64


## Data Resampling Techniques

Random Oversampling

In [11]:
oversampler = RandomOverSampler(sampling_strategy=.5)
X_over, y_over = oversampler.fit_resample(X_train, y_train)
print(Counter(y_over))

Counter({0: 7025, 1: 3512})


Random Undersampling

In [12]:
undersampler = RandomUnderSampler(sampling_strategy=.5)
X_under, y_under = undersampler.fit_resample(X_train, y_train)
print(Counter(y_under))

Counter({0: 828, 1: 414})


Synthetic Minority Over Sampling

In [13]:
X_train_smote = np.asarray(X_train)
y_train_smote = np.asarray(y_train)


In [14]:
smotesampler = SMOTE()
X_smote, y_smote = smotesampler.fit_resample(X_train_smote, y_train_smote)
print(Counter(y_smote))

Counter({0: 7025, 1: 7025})


## Decision Tree

In [15]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [16]:
pred = clf.predict(X_test)

In [17]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1651  101]
 [  90   18]]


In [18]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_dec = math.sqrt(true_positive_rate*true_negative_rate)

In [19]:
f1_dec = f1_score(y_test, pred)

In [20]:
print("g-mean = ",g_mean_dec)
print("F1-score = ",f1_dec)

g-mean =  0.39630619436943704
F1-score =  0.15859030837004404


## Bagging 

In [21]:
bag = BaggingClassifier()
bag.fit(X_train, y_train)

BaggingClassifier()

In [22]:
pred = bag.predict(X_test)

In [23]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1749    3]
 [ 104    4]]


In [24]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_bag = math.sqrt(true_positive_rate*true_negative_rate)

In [25]:
f1_bag = f1_score(y_test, pred)

In [26]:
print("g-mean = ",g_mean_bag)
print("F1-score = ",f1_bag)

g-mean =  0.19228525022234416
F1-score =  0.06956521739130435


## AdaBoost

In [27]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [28]:
pred = ada.predict(X_test)

In [29]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1751    1]
 [ 107    1]]


In [30]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ada = math.sqrt(true_positive_rate*true_negative_rate)

In [31]:
f1_ada = f1_score(y_test, pred)

In [32]:
print("g-mean = ",g_mean_ada)
print("F1-score = ",f1_ada)

g-mean =  0.0961975794598213
F1-score =  0.01818181818181818


## Easy Ensemble

In [33]:
easy_ensemble = EasyEnsembleClassifier(random_state=42)
easy_ensemble.fit(X_smote, y_smote)

EasyEnsembleClassifier(random_state=42)

In [34]:
pred = easy_ensemble.predict(X_test)

In [35]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1407  345]
 [  57   51]]


In [36]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ez = math.sqrt(true_positive_rate*true_negative_rate)

In [37]:
f1_ez = f1_score(y_test, pred)

In [38]:
print("g-mean = ",g_mean_ez)
print("F1-score = ",f1_ez)

g-mean =  0.615819175756839
F1-score =  0.20238095238095236


## RusBoost

In [39]:
rus = AdaBoostClassifier()
rus.fit(X_under, y_under)

AdaBoostClassifier()

In [40]:
pred = rus.predict(X_test)

In [41]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1478  274]
 [  59   49]]


In [42]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_rus = math.sqrt(true_positive_rate*true_negative_rate)

In [43]:
f1_rus = f1_score(y_test, pred)

In [44]:
print("g-mean = ",g_mean_rus)
print("F1-score = ",f1_rus)

g-mean =  0.6186661128385001
F1-score =  0.2273781902552204


## Smote Boost

In [45]:
smote_boost = AdaBoostClassifier()
smote_boost.fit(X_smote,y_smote)

AdaBoostClassifier()

In [46]:
pred = smote_boost.predict(X_test)

In [47]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1407  345]
 [  57   51]]


In [48]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_smb = math.sqrt(true_positive_rate*true_negative_rate)

In [49]:
f1_smb = f1_score(y_test, pred)

In [50]:
print("g-mean = ",g_mean_smb)
print("F1-score = ",f1_smb)

g-mean =  0.615819175756839
F1-score =  0.20238095238095236


## Under Bagging

In [51]:
under_bagging = BaggingClassifier()
under_bagging.fit(X_under, y_under)

BaggingClassifier()

In [52]:
pred = under_bagging.predict(X_test)

In [53]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1466  286]
 [  68   40]]


In [54]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ubag = math.sqrt(true_positive_rate*true_negative_rate)

In [55]:
f1_ubag = f1_score(y_test, pred)

In [56]:
print("g-mean = ",g_mean_ubag)
print("F1-score = ",f1_ubag)

g-mean =  0.5566959376427967
F1-score =  0.18433179723502305


## Over Bagging

In [57]:
over_bagging = BaggingClassifier()
over_bagging.fit(X_over, y_over)

BaggingClassifier()

In [58]:
pred = over_bagging.predict(X_test)

In [59]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[1721   31]
 [ 103    5]]


In [60]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_obag = math.sqrt(true_positive_rate*true_negative_rate)

In [61]:
f1_obag = f1_score(y_test, pred)

In [62]:
print("g-mean = ",g_mean_obag)
print("F1-score = ",f1_obag)

g-mean =  0.21325366742461674
F1-score =  0.06944444444444445


## Results

In [63]:
models = ["Decision Tree","Bagging", "AdaBoost", "Easy Ensemble","RusBoost","Smote Boost","Under Bagging", "Over Bagging"]
g_mean_score = [g_mean_dec,g_mean_bag,g_mean_ada,g_mean_ez,g_mean_rus,g_mean_smb,g_mean_ubag,g_mean_obag]
F1_score = [f1_dec,f1_bag,f1_ada,f1_ez,f1_rus,f1_smb,f1_ubag,f1_obag]

In [64]:
result_data = {
    'Models': models,
    'g-mean': g_mean_score,
    'F1-score': F1_score
}
result_df = pd.DataFrame(result_data)

In [65]:
result_df

Unnamed: 0,Models,g-mean,F1-score
0,Decision Tree,0.396306,0.15859
1,Bagging,0.192285,0.069565
2,AdaBoost,0.096198,0.018182
3,Easy Ensemble,0.615819,0.202381
4,RusBoost,0.618666,0.227378
5,Smote Boost,0.615819,0.202381
6,Under Bagging,0.556696,0.184332
7,Over Bagging,0.213254,0.069444
