# ECIP Models for Software Defect Prediction Dataset

## Import

In [2]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import pandas as pd
import numpy as np
from collections import Counter
import random
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

import math


In [3]:
df = pd.read_csv("./student-mat.csv")
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [5]:
le = LabelEncoder()
df["sex"] = le.fit_transform(df["sex"])
df["romantic"] = le.fit_transform(df["romantic"])

df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,0,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,0,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,0,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,0,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,0,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


## Data Preprocessing

In [6]:
df.corr()

Unnamed: 0,sex,age,Medu,Fedu,traveltime,studytime,failures,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
sex,1.0,-0.028606,0.078228,0.034878,0.059722,-0.306268,0.044436,-0.102023,0.058971,0.238744,0.075897,0.268171,0.274194,0.143588,-0.066962,0.091839,0.091099,0.103456
age,-0.028606,1.0,-0.163658,-0.163438,0.070641,-0.00414,0.243665,0.164669,0.05394,0.016434,0.126964,0.131125,0.117276,-0.062187,0.17523,-0.064081,-0.143474,-0.161579
Medu,0.078228,-0.163658,1.0,0.623455,-0.171639,0.064944,-0.23668,0.039681,-0.003914,0.030891,0.064094,0.019834,-0.047123,-0.046878,0.100285,0.205341,0.215527,0.217147
Fedu,0.034878,-0.163438,0.623455,1.0,-0.158194,-0.009175,-0.250408,0.015602,-0.00137,-0.012846,0.043105,0.002386,-0.012631,0.014742,0.024473,0.19027,0.164893,0.152457
traveltime,0.059722,0.070641,-0.171639,-0.158194,1.0,-0.100909,0.092239,0.021962,-0.016808,-0.017025,0.02854,0.138325,0.134116,0.007501,-0.012944,-0.09304,-0.153198,-0.117142
studytime,-0.306268,-0.00414,0.064944,-0.009175,-0.100909,1.0,-0.173563,0.053285,0.039731,-0.143198,-0.063904,-0.196019,-0.253785,-0.075616,-0.0627,0.160612,0.13588,0.09782
failures,0.044436,0.243665,-0.23668,-0.250408,0.092239,-0.173563,1.0,0.093137,-0.044337,0.091987,0.124561,0.136047,0.141962,0.065827,0.063726,-0.354718,-0.355896,-0.360415
romantic,-0.102023,0.164669,0.039681,0.015602,0.021962,0.053285,0.093137,1.0,-0.063816,-0.011182,0.00787,0.015121,-0.010141,0.026342,0.153384,-0.037188,-0.111774,-0.12997
famrel,0.058971,0.05394,-0.003914,-0.00137,-0.016808,0.039731,-0.044337,-0.063816,1.0,0.150701,0.064568,-0.077594,-0.113397,0.094056,-0.044354,0.022168,-0.018281,0.051363
freetime,0.238744,0.016434,0.030891,-0.012846,-0.017025,-0.143198,0.091987,-0.011182,0.150701,1.0,0.285019,0.209001,0.147822,0.075733,-0.058078,0.012613,-0.013777,0.011307


In [7]:
columns = []
for i, row in df.corr().iterrows():
    columns.append(i) if abs(row["romantic"]) > 0.02 else None

columns

['sex',
 'age',
 'Medu',
 'traveltime',
 'studytime',
 'failures',
 'romantic',
 'famrel',
 'health',
 'absences',
 'G1',
 'G2',
 'G3']

In [8]:
df = df[columns]

In [9]:
for x in df:
    df.drop(df.loc[df[x]=='?'].index, inplace=True)
            
df.isnull().sum()

sex           0
age           0
Medu          0
traveltime    0
studytime     0
failures      0
romantic      0
famrel        0
health        0
absences      0
G1            0
G2            0
G3            0
dtype: int64

In [10]:
for i, row in df.iterrows():
    if df.at[i,'romantic'] == True:
        a = random.choice([0,1])
        if a==0:
            df.drop(i, inplace=True)

print(Counter(df["romantic"]))

Counter({0: 263, 1: 64})


In [69]:
(Counter(df['romantic'])[1]/Counter(df['romantic'])[0])*100

24.334600760456272

In [11]:
X = df.drop(columns=["romantic"])
y = df.romantic 
print(type(y[0]))
print(y)

<class 'numpy.int32'>
0      0
1      0
2      0
4      0
5      0
      ..
390    0
391    0
392    0
393    0
394    0
Name: romantic, Length: 327, dtype: int32


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
print(y_test.value_counts())
print(y_train.value_counts())

0    57
1     9
Name: romantic, dtype: int64
0    206
1     55
Name: romantic, dtype: int64


## Data Resampling Techniques

Random Oversampling

In [14]:
oversampler = RandomOverSampler(sampling_strategy=.5)
X_over, y_over = oversampler.fit_resample(X_train, y_train)
print(Counter(y_over))

Counter({0: 206, 1: 103})


Random Undersampling

In [15]:
undersampler = RandomUnderSampler(sampling_strategy=.5)
X_under, y_under = undersampler.fit_resample(X_train, y_train)
print(Counter(y_under))

Counter({0: 110, 1: 55})


Synthetic Minority Over Sampling

In [16]:
X_train_smote = np.asarray(X_train)
y_train_smote = np.asarray(y_train)


In [17]:
smotesampler = SMOTE()
X_smote, y_smote = smotesampler.fit_resample(X_train_smote, y_train_smote)
print(Counter(y_smote))

Counter({0: 206, 1: 206})


## Decision Tree

In [18]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [19]:
pred = clf.predict(X_test)

In [20]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[43 14]
 [ 4  5]]


In [21]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_dec = math.sqrt(true_positive_rate*true_negative_rate)

In [22]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_dec = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_dec = f1_score(y_test, pred)

In [23]:
print("g-mean = ",g_mean_dec)
print("Balance = ",balance_dec)

g-mean =  0.6473818918074215
Balance =  0.33752729891516486


## Bagging 

In [24]:
bag = BaggingClassifier()
bag.fit(X_train, y_train)

BaggingClassifier()

In [25]:
pred = bag.predict(X_test)

In [26]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[52  5]
 [ 6  3]]


In [27]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_bag = math.sqrt(true_positive_rate*true_negative_rate)

In [28]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_bag = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_bag = f1_score(y_test, pred)

In [29]:
print("g-mean = ",g_mean_bag)
print("Balance = ",balance_bag)

g-mean =  0.5514467945790074
Balance =  0.31320774978724975


## AdaBoost

In [30]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [31]:
pred = ada.predict(X_test)

In [32]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[51  6]
 [ 7  2]]


In [33]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ada = math.sqrt(true_positive_rate*true_negative_rate)

In [34]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_ada = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_ada = f1_score(y_test, pred)

In [35]:
print("g-mean = ",g_mean_ada)
print("Balance = ",balance_ada)

g-mean =  0.4459040360399591
Balance =  0.34810402184478806


## Easy Ensemble

In [36]:
easy_ensemble = EasyEnsembleClassifier(random_state=42)
easy_ensemble.fit(X_smote, y_smote)

EasyEnsembleClassifier(random_state=42)

In [37]:
pred = easy_ensemble.predict(X_test)

In [38]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[42 15]
 [ 5  4]]


In [39]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ez = math.sqrt(true_positive_rate*true_negative_rate)

In [40]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_ez = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_ez = f1_score(y_test, pred)

In [41]:
print("g-mean = ",g_mean_ez)
print("Balance = ",balance_ez)

g-mean =  0.5722633835193014
Balance =  0.39153177885522505


## RusBoost

In [42]:
rus = AdaBoostClassifier()
rus.fit(X_under, y_under)

AdaBoostClassifier()

In [43]:
pred = rus.predict(X_test)

In [44]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[43 14]
 [ 3  6]]


In [45]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_rus = math.sqrt(true_positive_rate*true_negative_rate)

In [46]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_rus = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_rus = f1_score(y_test, pred)

In [47]:
print("g-mean = ",g_mean_rus)
print("Balance = ",balance_rus)

g-mean =  0.7091713309265872
Balance =  0.28812127841145685


## Smote Boost

In [48]:
smote_boost = AdaBoostClassifier()
smote_boost.fit(X_smote,y_smote)

AdaBoostClassifier()

In [49]:
pred = smote_boost.predict(X_test)

In [50]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[42 15]
 [ 5  4]]


In [51]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_smb = math.sqrt(true_positive_rate*true_negative_rate)

In [52]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_smb = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_smb = f1_score(y_test, pred)

In [53]:
print("g-mean = ",g_mean_smb)
print("Balance = ",balance_smb)

g-mean =  0.5722633835193014
Balance =  0.39153177885522505


## Under Bagging

In [54]:
under_bagging = BaggingClassifier()
under_bagging.fit(X_under, y_under)

BaggingClassifier()

In [55]:
pred = under_bagging.predict(X_test)

In [56]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[48  9]
 [ 4  5]]


In [57]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ubag = math.sqrt(true_positive_rate*true_negative_rate)

In [58]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_ubag = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_ubag = f1_score(y_test, pred)

In [59]:
print("g-mean = ",g_mean_ubag)
print("Balance = ",balance_ubag)

g-mean =  0.6839855680567694
Balance =  0.28663359710914726


## Over Bagging

In [60]:
over_bagging = BaggingClassifier()
over_bagging.fit(X_over, y_over)

BaggingClassifier()

In [61]:
pred = over_bagging.predict(X_test)

In [62]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[52  5]
 [ 7  2]]


In [63]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_obag = math.sqrt(true_positive_rate*true_negative_rate)

In [64]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_obag = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_obag = f1_score(y_test, pred)

In [65]:
print("g-mean = ",g_mean_obag)
print("Balance = ",balance_obag)

g-mean =  0.4502544223373136
Balance =  0.3360576851702938


## Results

In [66]:
models = ["Decision Tree","Bagging", "AdaBoost", "Easy Ensemble","RusBoost","Smote Boost","Under Bagging", "Over Bagging"]
g_mean_score = [g_mean_dec,g_mean_bag,g_mean_ada,g_mean_ez,g_mean_rus,g_mean_smb,g_mean_ubag,g_mean_obag]
f1_score = [f1_dec,f1_bag,f1_ada,f1_ez,f1_rus,f1_smb,f1_ubag,f1_obag]


In [67]:
result_data = {
    'Models': models,
    'g-mean': g_mean_score,
    'F1 Score': f1_score
}
result_df = pd.DataFrame(result_data)

In [68]:
result_df

Unnamed: 0,Models,g-mean,F1 Score
0,Decision Tree,0.647382,0.357143
1,Bagging,0.551447,0.352941
2,AdaBoost,0.445904,0.235294
3,Easy Ensemble,0.572263,0.285714
4,RusBoost,0.709171,0.413793
5,Smote Boost,0.572263,0.285714
6,Under Bagging,0.683986,0.434783
7,Over Bagging,0.450254,0.25
