# ECIP Models for Software Defect Prediction Dataset

## Import

In [1]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score, confusion_matrix
import pandas as pd
import numpy as np
from collections import Counter
import random
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier

import math


In [2]:
df = pd.read_csv("./water_potability.csv")
df.head()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


## Data Preprocessing

In [3]:
df.corr()

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
ph,1.0,0.082096,-0.089288,-0.03435,0.018203,0.018614,0.043503,0.003354,-0.039057,-0.003556
Hardness,0.082096,1.0,-0.046899,-0.030054,-0.106923,-0.023915,0.00361,-0.013013,-0.014449,-0.013837
Solids,-0.089288,-0.046899,1.0,-0.070148,-0.171804,0.013831,0.010242,-0.009143,0.019546,0.033743
Chloramines,-0.03435,-0.030054,-0.070148,1.0,0.027244,-0.020486,-0.012653,0.017084,0.002363,0.023779
Sulfate,0.018203,-0.106923,-0.171804,0.027244,1.0,-0.016121,0.030831,-0.030274,-0.011187,-0.023577
Conductivity,0.018614,-0.023915,0.013831,-0.020486,-0.016121,1.0,0.020966,0.001285,0.005798,-0.008128
Organic_carbon,0.043503,0.00361,0.010242,-0.012653,0.030831,0.020966,1.0,-0.013274,-0.027308,-0.030001
Trihalomethanes,0.003354,-0.013013,-0.009143,0.017084,-0.030274,0.001285,-0.013274,1.0,-0.022145,0.00713
Turbidity,-0.039057,-0.014449,0.019546,0.002363,-0.011187,0.005798,-0.027308,-0.022145,1.0,0.001581
Potability,-0.003556,-0.013837,0.033743,0.023779,-0.023577,-0.008128,-0.030001,0.00713,0.001581,1.0


In [4]:
for x in df:
    df.drop(df.loc[df[x]=='?'].index, inplace=True)
            
print(df.isna().sum())      

ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64


In [5]:
df["Sulfate"].fillna(df["Sulfate"].mean(), inplace=True)
df["ph"].fillna(df["ph"].mean(), inplace=True)
df["Trihalomethanes"].fillna(df["Trihalomethanes"].mean(), inplace=True)

In [6]:
print(df.isna().sum())      

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64


In [66]:
for i, row in df.iterrows():
    if df.at[i,'Potability'] == True:
        a = random.choice([0,1])
        if a==0:
            df.drop(i, inplace=True)

print(Counter(df["Potability"]))
df["Potability"].replace({False: 0, True: 1}, inplace=True)

Counter({0: 1998, 1: 324})


In [125]:
(Counter(df['Potability'])[1]/Counter(df['Potability'])[0])*100

16.216216216216218

In [67]:
X = df.drop(columns=["Potability"])
y = df.Potability
print(type(y[0]))
print(y[0])

<class 'numpy.int64'>
0


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [69]:
print(y_test.value_counts())
print(y_train.value_counts())

0    398
1     67
Name: Potability, dtype: int64
0    1600
1     257
Name: Potability, dtype: int64


## Data Resampling Techniques

Random Oversampling

In [70]:
oversampler = RandomOverSampler(sampling_strategy=.5)
X_over, y_over = oversampler.fit_resample(X_train, y_train)
print(Counter(y_over))

Counter({0: 1600, 1: 800})


Random Undersampling

In [71]:
undersampler = RandomUnderSampler(sampling_strategy=.5)
X_under, y_under = undersampler.fit_resample(X_train, y_train)
print(Counter(y_under))

Counter({0: 514, 1: 257})


Synthetic Minority Over Sampling

In [72]:
X_train_smote = np.asarray(X_train)
y_train_smote = np.asarray(y_train)


In [73]:
smotesampler = SMOTE()
X_smote, y_smote = smotesampler.fit_resample(X_train_smote, y_train_smote)
print(Counter(y_smote))

Counter({0: 1600, 1: 1600})


## Decision Tree

In [74]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [75]:
pred = clf.predict(X_test)

In [76]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[344  54]
 [ 52  15]]


In [77]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_dec = math.sqrt(true_positive_rate*true_negative_rate)

In [78]:
f1_dec = f1_score(y_test, pred)

In [79]:
print("g-mean = ",g_mean_dec)
print("F1-score = ",f1_dec)

g-mean =  0.43989184764091793
F1-score =  0.22058823529411764


## Bagging 

In [80]:
bag = BaggingClassifier()
bag.fit(X_train, y_train)

BaggingClassifier()

In [81]:
pred = bag.predict(X_test)

In [82]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[389   9]
 [ 62   5]]


In [83]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_bag = math.sqrt(true_positive_rate*true_negative_rate)

In [84]:
f1_bag = f1_score(y_test, pred)

In [85]:
print("g-mean = ",g_mean_bag)
print("F1-score = ",f1_bag)

g-mean =  0.2700728114473707
F1-score =  0.12345679012345678


## AdaBoost

In [86]:
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)

AdaBoostClassifier()

In [87]:
pred = ada.predict(X_test)

In [88]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[392   6]
 [ 66   1]]


In [89]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ada = math.sqrt(true_positive_rate*true_negative_rate)

In [90]:
f1_ada = f1_score(y_test, pred)

In [91]:
print("g-mean = ",g_mean_ada)
print("F1-score = ",f1_ada)

g-mean =  0.12124507210269508
F1-score =  0.02702702702702703


## Easy Ensemble

In [92]:
easy_ensemble = EasyEnsembleClassifier(random_state=42)
easy_ensemble.fit(X_smote, y_smote)

EasyEnsembleClassifier(random_state=42)

In [93]:
pred = easy_ensemble.predict(X_test)

In [94]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[246 152]
 [ 39  28]]


In [95]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ez = math.sqrt(true_positive_rate*true_negative_rate)

In [96]:
f1_ez = f1_score(y_test, pred)

In [97]:
print("g-mean = ",g_mean_ez)
print("F1-score = ",f1_ez)

g-mean =  0.5082385834049217
F1-score =  0.22672064777327933


## RusBoost

In [98]:
rus = AdaBoostClassifier()
rus.fit(X_under, y_under)

AdaBoostClassifier()

In [99]:
pred = rus.predict(X_test)

In [100]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[324  74]
 [ 51  16]]


In [101]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_rus = math.sqrt(true_positive_rate*true_negative_rate)

In [102]:
f1_rus = f1_score(y_test, pred)

In [103]:
print("g-mean = ",g_mean_rus)
print("F1-score = ",f1_rus)

g-mean =  0.4409136651562333
F1-score =  0.20382165605095542


## Smote Boost

In [104]:
smote_boost = AdaBoostClassifier()
smote_boost.fit(X_smote,y_smote)

AdaBoostClassifier()

In [105]:
pred = smote_boost.predict(X_test)

In [106]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[246 152]
 [ 39  28]]


In [107]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_smb = math.sqrt(true_positive_rate*true_negative_rate)

In [108]:
f1_smb = f1_score(y_test, pred)

In [109]:
print("g-mean = ",g_mean_smb)
print("F1-score = ",f1_smb)

g-mean =  0.5082385834049217
F1-score =  0.22672064777327933


## Under Bagging

In [110]:
under_bagging = BaggingClassifier()
under_bagging.fit(X_under, y_under)

BaggingClassifier()

In [111]:
pred = under_bagging.predict(X_test)

In [112]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[351  47]
 [ 57  10]]


In [113]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_ubag = math.sqrt(true_positive_rate*true_negative_rate)

In [114]:
f1_ubag = f1_score(y_test, pred)

In [115]:
print("g-mean = ",g_mean_ubag)
print("F1-score = ",f1_ubag)

g-mean =  0.36280613377845156
F1-score =  0.16129032258064516


## Over Bagging

In [116]:
over_bagging = BaggingClassifier()
over_bagging.fit(X_over, y_over)

BaggingClassifier()

In [117]:
pred = over_bagging.predict(X_test)

In [118]:
conf = confusion_matrix(y_test, pred)
print(conf)

[[385  13]
 [ 61   6]]


In [119]:
true_negative_rate = conf[0][0]/(conf[0][0]+conf[0][1])
true_positive_rate = conf[1][1]/(conf[1][1]+conf[1][0])
g_mean_obag = math.sqrt(true_positive_rate*true_negative_rate)

In [120]:
f1_obag = f1_score(y_test, pred)

In [121]:
print("g-mean = ",g_mean_obag)
print("F1-score = ",f1_obag)

g-mean =  0.29432493213987493
F1-score =  0.13953488372093023


## Results

In [122]:
models = ["Decision Tree","Bagging", "AdaBoost", "Easy Ensemble","RusBoost","Smote Boost","Under Bagging", "Over Bagging"]
g_mean_score = [g_mean_dec,g_mean_bag,g_mean_ada,g_mean_ez,g_mean_rus,g_mean_smb,g_mean_ubag,g_mean_obag]
F1_score = [f1_dec,f1_bag,f1_ada,f1_ez,f1_rus,f1_smb,f1_ubag,f1_obag]

In [123]:
result_data = {
    'Models': models,
    'g-mean': g_mean_score,
    'F1-score': F1_score
}
result_df = pd.DataFrame(result_data)

In [124]:
result_df

Unnamed: 0,Models,g-mean,F1-score
0,Decision Tree,0.439892,0.220588
1,Bagging,0.270073,0.123457
2,AdaBoost,0.121245,0.027027
3,Easy Ensemble,0.508239,0.226721
4,RusBoost,0.440914,0.203822
5,Smote Boost,0.508239,0.226721
6,Under Bagging,0.362806,0.16129
7,Over Bagging,0.294325,0.139535
