In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

df = pd.read_csv('../Dataset/water_potability.csv')

df.dropna(inplace=True)

Y = df['Potability']
X = df.drop('Potability', axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 90)

In [204]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('forest', RandomForestClassifier(n_estimators = 112, criterion = 'gini', max_depth = 78, min_samples_split = 13, bootstrap = True))
])

pipe.fit(X_train, Y_train)

dump(pipe, 'WaterPotModel.pkl')

predictTrainGrid = pipe.predict(X_train)
predictTestGrid = pipe.predict(X_test)

print(f"Precision train: {precision_score(Y_train, predictTrainGrid)}")
print(f"Precision test: {precision_score(Y_test, predictTestGrid)}\n")

print(f"Accuracy train: {accuracy_score(Y_train, predictTrainGrid)}")
print(f"Accuracy test: {accuracy_score(Y_test, predictTestGrid)}\n")

print(f"Recall train: {recall_score(Y_train, predictTrainGrid, average = 'micro')}")
print(f"Recall test: {recall_score(Y_test, predictTestGrid, average = 'micro')}\n")

print(f"Matrix train: \n{confusion_matrix(Y_train, predictTrainGrid)}")
print(f"Matrix test: \n{confusion_matrix(Y_test, predictTestGrid)}\n")

print("\nRelatório de Classificação (Treino)")
print(classification_report(Y_train, predictTrainGrid))

print("\nRelatório de Classificação (Teste)")
print(classification_report(Y_test, predictTestGrid))

Precision train: 0.9893778452200304
Precision test: 0.7

Accuracy train: 0.9736688121708601
Accuracy test: 0.6920529801324503

Recall train: 0.9736688121708601
Recall test: 0.6920529801324503

Matrix train: 
[[1012    7]
 [  38  652]]
Matrix test: 
[[160  21]
 [ 72  49]]


Relatório de Classificação (Treino)
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1019
           1       0.99      0.94      0.97       690

    accuracy                           0.97      1709
   macro avg       0.98      0.97      0.97      1709
weighted avg       0.97      0.97      0.97      1709


Relatório de Classificação (Teste)
              precision    recall  f1-score   support

           0       0.69      0.88      0.77       181
           1       0.70      0.40      0.51       121

    accuracy                           0.69       302
   macro avg       0.69      0.64      0.64       302
weighted avg       0.69      0.69      0.67       302

