In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv('water_quality_categorized.csv')
df2 = pd.read_csv('water_quality_normalized.csv')

print(df1.info())
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2538 entries, 0 to 2537
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2538 non-null   float64
 1   Hardness         2538 non-null   float64
 2   Solids           2538 non-null   float64
 3   Chloramines      2538 non-null   float64
 4   Sulfate          2538 non-null   float64
 5   Conductivity     2538 non-null   float64
 6   Organic_carbon   2538 non-null   float64
 7   Trihalomethanes  2538 non-null   float64
 8   Turbidity        2538 non-null   float64
 9   Potability       2538 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 198.4 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2538 entries, 0 to 2537
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2538 non-null   float64
 1   Hardness         2538 non-null   f

In [3]:
df1_features = df1.loc[:, df1.columns != 'Potability']
df1_label = df1.Potability

df2_features = df2.loc[:, df2.columns != 'Potability']
df2_label = df2.Potability

In [4]:
from sklearn.model_selection import train_test_split

df1_train_X, df1_test_X, df1_train_Y, df1_test_Y = train_test_split(df1_features, df1_label, test_size=0.2, random_state=1234)
df2_train_X, df2_test_X, df2_train_Y, df2_test_Y = train_test_split(df2_features, df2_label, test_size=0.2, random_state=1234)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

DCT_1 = DecisionTreeClassifier() # categorized
DCT_1.fit(df1_train_X, df1_train_Y)

DCT_2 = DecisionTreeClassifier() # uncategorized
DCT_2.fit(df2_train_X, df2_train_Y)

DCT1_results = DCT_1.predict(df1_test_X)
DCT2_results = DCT_2.predict(df2_test_X)

cfm_1 = metrics.confusion_matrix(df1_test_Y, DCT1_results)
cfm_2 = metrics.confusion_matrix(df2_test_Y, DCT2_results)

accuracy_1 = metrics.accuracy_score(df1_test_Y, DCT1_results)
accuracy_2 = metrics.accuracy_score(df2_test_Y, DCT2_results)

precision_1 = metrics.precision_score(df1_test_Y, DCT1_results)
precision_2  = metrics.precision_score(df1_test_Y, DCT1_results)

recall_1 = metrics.recall_score(df1_test_Y, DCT1_results)
recall_2 = metrics.recall_score(df1_test_Y, DCT1_results)

print("Confusion matrix for categorized: {}".format(cfm_1))
print("Accuracy : {}, precision : {}, recall : {}".format(accuracy_1, precision_1, recall_1))
print()
print("Confusion matrix for uncategorized: {}".format(cfm_2))
print("Accuracy : {}, precision : {}, recall : {}".format(accuracy_2, precision_2, recall_2))

Confusion matrix for categorized: [[161 105]
 [ 92 150]]
Accuracy : 0.6122047244094488, precision : 0.5882352941176471, recall : 0.6198347107438017

Confusion matrix for uncategorized: [[165 101]
 [ 87 155]]
Accuracy : 0.6299212598425197, precision : 0.5882352941176471, recall : 0.6198347107438017


In [7]:
from sklearn.ensemble import RandomForestClassifier

RFC_1 = RandomForestClassifier() # categorized
RFC_1.fit(df1_train_X, df1_train_Y)

RFC_2 = RandomForestClassifier() # uncategorized
RFC_2.fit(df2_train_X, df2_train_Y)

RFC1_results = RFC_1.predict(df1_test_X)
RFC2_results = RFC_2.predict(df2_test_X)

cfm_1_RFC = metrics.confusion_matrix(df1_test_Y, RFC1_results)
cfm_2_RFC = metrics.confusion_matrix(df2_test_Y, RFC2_results)

accuracy1_RFC = metrics.accuracy_score(df1_test_Y, RFC1_results)
accuracy2_RFC = metrics.accuracy_score(df2_test_Y, RFC2_results)

precision1_RFC = metrics.precision_score(df1_test_Y, RFC1_results)
precision2_RFC = metrics.precision_score(df2_test_Y, RFC2_results)

recall1_RFC = metrics.recall_score(df1_test_Y, RFC1_results)
recall2_RFC = metrics.recall_score(df2_test_Y, RFC2_results)

print("Confusion matrix for categorized: {}".format(cfm_1_RFC))
print("Accuracy : {}, precision : {}, recall : {}".format(accuracy1_RFC, precision1_RFC, recall1_RFC))
print()
print("Confusion matrix for uncategorized: {}".format(cfm_2_RFC))
print("Accuracy : {}, precision : {}, recall : {}".format(accuracy2_RFC, precision2_RFC, recall2_RFC))

Confusion matrix for categorized: [[169  97]
 [ 86 156]]
Accuracy : 0.639763779527559, precision : 0.616600790513834, recall : 0.6446280991735537

Confusion matrix for uncategorized: [[185  81]
 [ 67 175]]
Accuracy : 0.7086614173228346, precision : 0.68359375, recall : 0.7231404958677686


In [9]:
from sklearn.ensemble import GradientBoostingClassifier

GBC1 = GradientBoostingClassifier() # categorized
GBC1.fit(df1_train_X, df1_train_Y)

GBC2 = GradientBoostingClassifier()  # uncategorized
GBC2.fit(df2_train_X, df2_train_Y)

GBC1_results = GBC1.predict(df1_test_X)
GBC2_results = GBC2.predict(df2_test_X)

cfm1_GBC = metrics.confusion_matrix(df1_test_Y, GBC1_results)
cfm2_GBC = metrics.confusion_matrix(df1_test_Y, GBC1_results)

accuracy1_GBC = metrics.accuracy_score(df1_test_Y, GBC1_results) 
accuracy2_GBC = metrics.accuracy_score(df2_test_Y, GBC2_results)

precision1_GBC = metrics.precision_score(df1_test_Y, GBC1_results)
precision2_GBC = metrics.precision_score(df2_test_Y, GBC2_results)

recall1_GBC = metrics.recall_score(df1_test_Y, GBC1_results)
recall2_GBC = metrics.recall_score(df2_test_Y, GBC2_results)

print("Confusion matrix for categorized: {}".format(cfm1_GBC))
print("Accuracy : {}, precision : {}, recall : {}".format(accuracy1_GBC, precision1_GBC, recall1_GBC))
print()
print("Confusion matrix for uncategorized: {}".format(cfm2_GBC))
print("Accuracy : {}, precision : {}, recall : {}".format(accuracy2_GBC, precision2_GBC, recall2_GBC))

Confusion matrix for categorized: [[163 103]
 [ 84 158]]
Accuracy : 0.6318897637795275, precision : 0.6053639846743295, recall : 0.6528925619834711

Confusion matrix for uncategorized: [[163 103]
 [ 84 158]]
Accuracy : 0.6496062992125984, precision : 0.6203007518796992, recall : 0.6818181818181818


In [10]:
import pickle

model_name = 'water_quality_best_model.pkl'
pickle.dump(RFC_2, open(model_name, 'wb'))

In [12]:
load_model = pickle.load(open(model_name, 'rb'))
test_result = load_model.score(df2_test_X, df2_test_Y)
print(test_result)

0.7086614173228346
