# ECIP Models for Software Defect Prediction Dataset

## Import

In [29]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import pandas as pd
import numpy as np
from collections import Counter
import random
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.metrics import f1_score

import math


In [30]:
df = pd.read_csv("./heart_failure.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [31]:
le = LabelEncoder()
df['Sex'] = le.fit_transform(df["Sex"])
df["ChestPainType"] = le.fit_transform(df["ChestPainType"])
df["RestingECG"] = le.fit_transform(df["RestingECG"])
df["ExerciseAngina"] = le.fit_transform(df["ExerciseAngina"])
df["ST_Slope"] = le.fit_transform(df["ST_Slope"])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0


## Data Preprocessing

In [32]:
df.corr()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,0.05575,-0.07715,0.254399,-0.095282,0.198039,-0.007484,-0.382045,0.215793,0.258612,-0.268264,0.282039
Sex,0.05575,1.0,-0.126559,0.005133,-0.200092,0.120076,0.071552,-0.189186,0.190664,0.105734,-0.150693,0.305445
ChestPainType,-0.07715,-0.126559,1.0,-0.020647,0.06788,-0.073151,-0.072537,0.289123,-0.354727,-0.177377,0.213521,-0.386828
RestingBP,0.254399,0.005133,-0.020647,1.0,0.100893,0.070193,0.022656,-0.112135,0.155101,0.164803,-0.075162,0.107589
Cholesterol,-0.095282,-0.200092,0.06788,0.100893,1.0,-0.260974,-0.196544,0.235792,-0.034166,0.050148,0.111471,-0.232741
FastingBS,0.198039,0.120076,-0.073151,0.070193,-0.260974,1.0,0.08705,-0.131438,0.060451,0.052698,-0.175774,0.267291
RestingECG,-0.007484,0.071552,-0.072537,0.022656,-0.196544,0.08705,1.0,-0.179276,0.0775,-0.020438,-0.006778,0.057384
MaxHR,-0.382045,-0.189186,0.289123,-0.112135,0.235792,-0.131438,-0.179276,1.0,-0.370425,-0.160691,0.343419,-0.400421
ExerciseAngina,0.215793,0.190664,-0.354727,0.155101,-0.034166,0.060451,0.0775,-0.370425,1.0,0.408752,-0.428706,0.494282
Oldpeak,0.258612,0.105734,-0.177377,0.164803,0.050148,0.052698,-0.020438,-0.160691,0.408752,1.0,-0.501921,0.403951


In [33]:
for x in df:
    df.drop(df.loc[df[x]=='?'].index, inplace=True)

print(df.isna().sum())      

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64


In [34]:
Counter(df['HeartDisease'])

Counter({0: 410, 1: 508})

In [41]:
for i, row in df.iterrows():
    if df.at[i,'HeartDisease'] == True:
        a = random.choice([0,1])
        if a==0:
            df.drop(i, inplace=True)

print(Counter(df["HeartDisease"]))
df["HeartDisease"].replace({False: 0, True: 1}, inplace=True)

Counter({0: 410, 1: 59})


In [134]:
(Counter(df['HeartDisease'])[1]/Counter(df['HeartDisease'])[0])*100

14.390243902439023

In [42]:
X = df.drop(columns=["HeartDisease"])
y = df.HeartDisease
print(type(y[0]))
print(y[0])

<class 'numpy.int64'>
0


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
print(y_test.value_counts())
print(y_train.value_counts())

0    78
1    16
Name: HeartDisease, dtype: int64
0    332
1     43
Name: HeartDisease, dtype: int64


## Data Resampling Techniques

Random Oversampling

In [45]:
oversampler = RandomOverSampler(sampling_strategy=.5)
X_over, y_over = oversampler.fit_resample(X_train, y_train)
print(Counter(y_over))

Counter({0: 332, 1: 166})


Random Undersampling

In [46]:
undersampler = RandomUnderSampler(sampling_strategy=.5)
X_under, y_under = undersampler.fit_resample(X_train, y_train)
print(Counter(y_under))

Counter({0: 86, 1: 43})


Synthetic Minority Over Sampling

In [47]:
X_train_smote = np.asarray(X_train)
y_train_smote = np.asarray(y_train)


In [48]:
smotesampler = SMOTE()
X_smote, y_smote = smotesampler.fit_resample(X_train_smote, y_train_smote)
print(Counter(y_smote))

Counter({0: 332, 1: 332})


## Decision Tree

In [49]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [50]:
pred = clf.predict(X_test)

In [51]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[75  3]
 [10  6]]


In [52]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_dec = math.sqrt(true_positive_rate*true_negative_rate)

In [53]:
f1_dec = f1_score(y_test, pred)

In [54]:
print("g-mean = ",g_mean_dec)
print("F1-score = ",f1_dec)

g-mean =  0.6004805767690767
F1-score =  0.4800000000000001


## Bagging 

In [55]:
bag = BaggingClassifier()
bag.fit(X_train, y_train)

BaggingClassifier()

In [56]:
pred = bag.predict(X_test)

In [57]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[75  3]
 [ 8  8]]


In [58]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_bag = math.sqrt(true_positive_rate*true_negative_rate)

In [59]:
f1_bag = f1_score(y_test, pred)

In [60]:
print("g-mean = ",g_mean_bag)
print("F1-score = ",f1_bag)

g-mean =  0.6933752452815364
F1-score =  0.5925925925925926


## AdaBoost

In [61]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [62]:
pred = ada.predict(X_test)

In [63]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[73  5]
 [ 7  9]]


In [64]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ada = math.sqrt(true_positive_rate*true_negative_rate)

In [98]:
f1_ada = f1_score(y_test, pred)

In [99]:
print("g-mean = ",g_mean_ada)
print("F1-score = ",f1_ada)

g-mean =  0.725563441535134
F1-score =  0.5


## Easy Ensemble

In [100]:
easy_ensemble = EasyEnsembleClassifier(random_state=42)
easy_ensemble.fit(X_smote, y_smote)

EasyEnsembleClassifier(random_state=42)

In [101]:
pred = easy_ensemble.predict(X_test)

In [102]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[73  5]
 [ 6 10]]


In [103]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ez = math.sqrt(true_positive_rate*true_negative_rate)

In [104]:
f1_ez = f1_score(y_test, pred)

In [105]:
print("g-mean = ",g_mean_ez)
print("F1-score = ",f1_ez)

g-mean =  0.7648110207338134
F1-score =  0.6451612903225806


## RusBoost

In [106]:
rus = AdaBoostClassifier()
rus.fit(X_under, y_under)

AdaBoostClassifier()

In [107]:
pred = rus.predict(X_test)

In [108]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[66 12]
 [ 5 11]]


In [109]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_rus = math.sqrt(true_positive_rate*true_negative_rate)

In [110]:
f1_rus = f1_score(y_test, pred)

In [111]:
print("g-mean = ",g_mean_rus)
print("F1-score = ",f1_rus)

g-mean =  0.76271276980969
F1-score =  0.5641025641025642


## Smote Boost

In [112]:
smote_boost = AdaBoostClassifier()
smote_boost.fit(X_smote,y_smote)

AdaBoostClassifier()

In [113]:
pred = smote_boost.predict(X_test)

In [114]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[73  5]
 [ 6 10]]


In [115]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_smb = math.sqrt(true_positive_rate*true_negative_rate)

In [116]:
f1_smb = f1_score(y_test, pred)

In [117]:
print("g-mean = ",g_mean_smb)
print("F1-score = ",f1_smb)

g-mean =  0.7648110207338134
F1-score =  0.6451612903225806


## Under Bagging

In [118]:
under_bagging = BaggingClassifier()
under_bagging.fit(X_under, y_under)

BaggingClassifier()

In [119]:
pred = under_bagging.predict(X_test)

In [120]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[76  2]
 [ 7  9]]


In [121]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ubag = math.sqrt(true_positive_rate*true_negative_rate)

In [122]:
f1_ubag = f1_score(y_test, pred)

In [123]:
print("g-mean = ",g_mean_ubag)
print("F1-score = ",f1_ubag)

g-mean =  0.7403221751892368
F1-score =  0.6666666666666666


## Over Bagging

In [124]:
over_bagging = BaggingClassifier()
over_bagging.fit(X_over, y_over)

BaggingClassifier()

In [125]:
pred = over_bagging.predict(X_test)

In [126]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[77  1]
 [ 7  9]]


In [127]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_obag = math.sqrt(true_positive_rate*true_negative_rate)

In [128]:
f1_obag = f1_score(y_test, pred)

In [129]:
print("g-mean = ",g_mean_obag)
print("F1-score = ",f1_obag)

g-mean =  0.7451767988460601
F1-score =  0.6923076923076923


## Results

In [130]:
models = ["Decision Tree","Bagging", "AdaBoost", "Easy Ensemble","RusBoost","Smote Boost","Under Bagging", "Over Bagging"]
g_mean_score = [g_mean_dec,g_mean_bag,g_mean_ada,g_mean_ez,g_mean_rus,g_mean_smb,g_mean_ubag,g_mean_obag]
F1_score = [f1_dec,f1_bag,f1_ada,f1_ez,f1_rus,f1_smb,f1_ubag,f1_obag]

In [131]:
result_data = {
    'Models': models,
    'g-mean': g_mean_score,
    'F1-score': F1_score
}
result_df = pd.DataFrame(result_data)

In [132]:
result_df

Unnamed: 0,Models,g-mean,F1-score
0,Decision Tree,0.600481,0.48
1,Bagging,0.693375,0.592593
2,AdaBoost,0.725563,0.5
3,Easy Ensemble,0.764811,0.645161
4,RusBoost,0.762713,0.564103
5,Smote Boost,0.764811,0.645161
6,Under Bagging,0.740322,0.666667
7,Over Bagging,0.745177,0.692308
