# ECIP Models for Software Defect Prediction Dataset

## Import

In [72]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import pandas as pd
import numpy as np
from collections import Counter
import random
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

import math


In [73]:
df = pd.read_csv("./airline.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,...,5,5,5,5,2,5,5,50,44.0,satisfied
1,1,90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,satisfied
2,2,12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,...,2,4,1,3,2,2,2,0,0.0,neutral or dissatisfied
3,3,77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,satisfied
4,4,36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,satisfied


In [74]:
le = LabelEncoder()
df["Gender"] = le.fit_transform(df["Gender"])
df["Customer Type"] = le.fit_transform(df["Customer Type"])
df["Type of Travel"] = le.fit_transform(df["Type of Travel"])
df["Class"] = le.fit_transform(df["Class"])
df["satisfaction"] = le.fit_transform(df["satisfaction"])

df.head()

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,19556,0,0,52,0,1,160,5,4,...,5,5,5,5,2,5,5,50,44.0,1
1,1,90035,0,0,36,0,0,2863,1,1,...,4,4,4,4,3,4,5,0,0.0,1
2,2,12360,1,1,20,0,1,192,2,0,...,2,4,1,3,2,2,2,0,0.0,0
3,3,77959,1,0,44,0,0,3377,0,0,...,1,1,1,1,3,1,4,0,6.0,1
4,4,36875,0,0,49,0,1,1182,2,3,...,2,2,2,2,4,2,4,0,20.0,1


In [75]:
df.columns

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [76]:
df.drop("Unnamed: 0",axis=1,inplace=True)

## Data Preprocessing

In [77]:
df.corr()

Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
id,1.0,-0.004371,0.0067,0.010211,-0.006564,-0.105987,0.095335,-0.030303,-0.002502,0.010389,...,-0.001078,0.056544,0.041921,0.073085,0.079521,0.076587,0.020411,-0.009884,-0.02909,0.010965
Gender,-0.004371,1.0,-0.027878,0.009261,0.01993,-0.006411,-0.005222,-0.006323,0.008417,0.001703,...,-0.005217,-0.000423,0.028051,0.032202,7.9e-05,0.03489,-0.011605,0.005775,0.005088,0.007335
Customer Type,0.0067,-0.027878,1.0,-0.29421,-0.307827,0.044415,-0.229306,0.001333,-0.206864,-0.01234,...,-0.090418,-0.044628,-0.042757,0.02547,-0.027565,0.025729,-0.07215,0.0032,0.004709,-0.179632
Age,0.010211,0.009261,-0.29421,1.0,-0.029773,-0.115125,0.099409,0.009242,0.032449,0.013565,...,0.068998,0.054977,0.033299,-0.049863,0.025388,-0.059083,0.048418,-0.004334,-0.0074,0.121697
Type of Travel,-0.006564,0.01993,-0.307827,-0.029773,1.0,0.485258,-0.264559,-0.109021,0.246425,-0.13635,...,-0.171904,-0.072902,-0.142869,-0.039824,0.014292,-0.027235,-0.106232,-0.008003,-0.006531,-0.453268
Class,-0.105987,-0.006411,0.044415,-0.115125,0.485258,1.0,-0.425756,-0.032293,0.07617,-0.098329,...,-0.199558,-0.222258,-0.204803,-0.176942,-0.158999,-0.161973,-0.144021,0.006455,0.011941,-0.443715
Flight Distance,0.095335,-0.005222,-0.229306,0.099409,-0.264559,-0.425756,1.0,0.005007,-0.014401,0.062989,...,0.137538,0.11788,0.136995,0.071549,0.07572,0.066355,0.105578,0.003446,0.000131,0.295292
Inflight wifi service,-0.030303,-0.006323,0.001333,0.009242,-0.109021,-0.032293,0.005007,1.0,0.349137,0.710684,...,0.201782,0.113658,0.159699,0.118199,0.046046,0.108419,0.125768,-0.010078,-0.012277,0.280395
Departure/Arrival time convenient,-0.002502,0.008417,-0.206864,0.032449,0.246425,0.07617,-0.014401,0.349137,1.0,0.44023,...,-0.022326,0.060982,0.003373,0.065684,0.082461,0.067804,-0.00767,-0.000238,-0.001345,-0.064798
Ease of Online booking,0.010389,0.001703,-0.01234,0.013565,-0.13635,-0.098329,0.062989,0.710684,0.44023,1.0,...,0.044715,0.039988,0.116754,0.040685,-0.000108,0.035769,0.010974,-0.001062,-0.003161,0.157709


In [78]:
columns = []
for i, row in df.corr().iterrows():
    columns.append(i) if abs(row["satisfaction"]) > 0.2 else None

columns

['Type of Travel',
 'Class',
 'Flight Distance',
 'Inflight wifi service',
 'Food and drink',
 'Online boarding',
 'Seat comfort',
 'Inflight entertainment',
 'On-board service',
 'Leg room service',
 'Baggage handling',
 'Checkin service',
 'Inflight service',
 'Cleanliness',
 'satisfaction']

In [79]:
df = df[columns]

In [80]:
for x in df:
    df.drop(df.loc[df[x]=='?'].index, inplace=True)
            
df.isnull().sum()

Type of Travel            0
Class                     0
Flight Distance           0
Inflight wifi service     0
Food and drink            0
Online boarding           0
Seat comfort              0
Inflight entertainment    0
On-board service          0
Leg room service          0
Baggage handling          0
Checkin service           0
Inflight service          0
Cleanliness               0
satisfaction              0
dtype: int64

In [82]:
for i, row in df.iterrows():
    if df.at[i,'satisfaction'] == True:
        a = random.choice([0,1])
        if a==0:
            df.drop(i, inplace=True)

print(Counter(df["satisfaction"]))
df["satisfaction"].replace({False: 0, True: 1}, inplace=True)

Counter({0: 14573, 1: 2895})


In [141]:
(Counter(df['satisfaction'])[1]/Counter(df['satisfaction'])[0])*100

19.86550470047348

In [83]:
X = df.drop(columns=["satisfaction"])
y = df.satisfaction 
# print(type(y[0]))
print(y)

1        1
2        0
4        1
6        1
11       0
        ..
25969    1
25970    0
25971    0
25973    0
25975    0
Name: satisfaction, Length: 17468, dtype: int32


In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [85]:
print(y_test.value_counts())
print(y_train.value_counts())

0    2897
1     597
Name: satisfaction, dtype: int64
0    11676
1     2298
Name: satisfaction, dtype: int64


## Data Resampling Techniques

Random Oversampling

In [86]:
oversampler = RandomOverSampler(sampling_strategy=.5)
X_over, y_over = oversampler.fit_resample(X_train, y_train)
print(Counter(y_over))

Counter({0: 11676, 1: 5838})


Random Undersampling

In [87]:
undersampler = RandomUnderSampler(sampling_strategy=.5)
X_under, y_under = undersampler.fit_resample(X_train, y_train)
print(Counter(y_under))

Counter({0: 4596, 1: 2298})


Synthetic Minority Over Sampling

In [88]:
X_train_smote = np.asarray(X_train)
y_train_smote = np.asarray(y_train)


In [89]:
smotesampler = SMOTE()
X_smote, y_smote = smotesampler.fit_resample(X_train_smote, y_train_smote)
print(Counter(y_smote))

Counter({0: 11676, 1: 11676})


## Decision Tree

In [90]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [91]:
pred = clf.predict(X_test)

In [92]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2794  103]
 [  78  519]]


In [93]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_dec = math.sqrt(true_positive_rate*true_negative_rate)

In [94]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_dec = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_dec = f1_score(y_test, pred)

In [95]:
print("g-mean = ",g_mean_dec)
print("Balance = ",balance_dec)

g-mean =  0.915662580481597
Balance =  0.08187152615368432


## Bagging 

In [96]:
bag = BaggingClassifier()
bag.fit(X_train, y_train)

BaggingClassifier()

In [97]:
pred = bag.predict(X_test)

In [98]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2859   38]
 [  87  510]]


In [99]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_bag = math.sqrt(true_positive_rate*true_negative_rate)

In [100]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_bag = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_bag = f1_score(y_test, pred)

In [101]:
print("g-mean = ",g_mean_bag)
print("Balance = ",balance_bag)

g-mean =  0.9181861817507572
Balance =  0.07703803654662


## AdaBoost

In [102]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [103]:
pred = ada.predict(X_test)

In [104]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2837   60]
 [ 140  457]]


In [105]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ada = math.sqrt(true_positive_rate*true_negative_rate)

In [106]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_ada = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_ada = f1_score(y_test, pred)

In [107]:
print("g-mean = ",g_mean_ada)
print("Balance = ",balance_ada)

g-mean =  0.8658174904144188
Balance =  0.12108360399895945


## Easy Ensemble

In [108]:
easy_ensemble = EasyEnsembleClassifier(random_state=42)
easy_ensemble.fit(X_smote, y_smote)

EasyEnsembleClassifier(random_state=42)

In [109]:
pred = easy_ensemble.predict(X_test)

In [110]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2591  306]
 [  66  531]]


In [111]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ez = math.sqrt(true_positive_rate*true_negative_rate)

In [112]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_ez = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_ez = f1_score(y_test, pred)

In [113]:
print("g-mean = ",g_mean_ez)
print("Balance = ",balance_ez)

g-mean =  0.8919069618694807
Balance =  0.10808623588449373


## RusBoost

In [114]:
rus = AdaBoostClassifier()
rus.fit(X_under, y_under)

AdaBoostClassifier()

In [115]:
pred = rus.predict(X_test)

In [116]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2767  130]
 [  83  514]]


In [117]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_rus = math.sqrt(true_positive_rate*true_negative_rate)

In [118]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_rus = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_rus = f1_score(y_test, pred)

In [119]:
print("g-mean = ",g_mean_rus)
print("Balance = ",balance_rus)

g-mean =  0.906827591975967
Balance =  0.09073171533248425


## Smote Boost

In [120]:
smote_boost = AdaBoostClassifier()
smote_boost.fit(X_smote,y_smote)

AdaBoostClassifier()

In [121]:
pred = smote_boost.predict(X_test)

In [122]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2591  306]
 [  66  531]]


In [123]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_smb = math.sqrt(true_positive_rate*true_negative_rate)

In [124]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_smb = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_smb = f1_score(y_test, pred)

In [125]:
print("g-mean = ",g_mean_smb)
print("Balance = ",balance_smb)

g-mean =  0.8919069618694807
Balance =  0.10808623588449373


## Under Bagging

In [126]:
under_bagging = BaggingClassifier()
under_bagging.fit(X_under, y_under)

BaggingClassifier()

In [127]:
pred = under_bagging.predict(X_test)

In [128]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2804   93]
 [  65  532]]


In [129]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ubag = math.sqrt(true_positive_rate*true_negative_rate)

In [130]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_ubag = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_ubag = f1_score(y_test, pred)

In [131]:
print("g-mean = ",g_mean_ubag)
print("Balance = ",balance_ubag)

g-mean =  0.9287170263542498
Balance =  0.06969759896562588


## Over Bagging

In [132]:
over_bagging = BaggingClassifier()
over_bagging.fit(X_over, y_over)

BaggingClassifier()

In [133]:
pred = over_bagging.predict(X_test)

In [134]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[2844   53]
 [  81  516]]


In [135]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_obag = math.sqrt(true_positive_rate*true_negative_rate)

In [136]:
false_positive_rate = conf[0][1]/(conf[0][1]+conf[0][0])
balance_obag = 1 - math.sqrt((true_positive_rate)**2 + (1-false_positive_rate)**2)/math.sqrt(2)
f1_obag = f1_score(y_test, pred)

In [137]:
print("g-mean = ",g_mean_obag)
print("Balance = ",balance_obag)

g-mean =  0.9211454975771783
Balance =  0.07512244973733051


## Results

In [138]:
models = ["Decision Tree","Bagging", "AdaBoost", "Easy Ensemble","RusBoost","Smote Boost","Under Bagging", "Over Bagging"]
g_mean_score = [g_mean_dec,g_mean_bag,g_mean_ada,g_mean_ez,g_mean_rus,g_mean_smb,g_mean_ubag,g_mean_obag]
f1_score = [f1_dec,f1_bag,f1_ada,f1_ez,f1_rus,f1_smb,f1_ubag,f1_obag]

In [139]:
result_data = {
    'Models': models,
    'g-mean': g_mean_score,
    'F1_score': f1_score
}
result_df = pd.DataFrame(result_data)

In [140]:
result_df

Unnamed: 0,Models,g-mean,F1_score
0,Decision Tree,0.915663,0.851518
1,Bagging,0.918186,0.89083
2,AdaBoost,0.865817,0.820467
3,Easy Ensemble,0.891907,0.740586
4,RusBoost,0.906828,0.828364
5,Smote Boost,0.891907,0.740586
6,Under Bagging,0.928717,0.870704
7,Over Bagging,0.921145,0.885077
