In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

df = pd.read_csv('../Dataset/waterQuality1.csv')

df = df.replace("#NUM!", np.nan)
df.dropna(inplace = True)

Y = df['is_safe']
X = df.drop('is_safe', axis = 1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)


In [None]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('forest', RandomForestClassifier())
])

parameters = {
    'forest__n_estimators': [70, 75, 80, 85],
    'forest__criterion': ['entropy'],
    'forest__max_depth': [2200, 2250, 2300, 2350, 2400],
    'forest__min_samples_split': [2, 3, 4, 8, 10, 12]
}

gridSearch = GridSearchCV(
    pipe, 
    parameters, 
    cv = 5, 
    n_jobs = 8
)

gridSearch.fit(X_train, Y_train)

predictTrainGrid = gridSearch.predict(X_train)
predictTestGrid = gridSearch.predict(X_test)

print(f"Accuracy train: {accuracy_score(Y_train, predictTrainGrid)}")
print(f"Accuracy test: {accuracy_score(Y_test, predictTestGrid)}\n")

print(f"Recall train: {recall_score(Y_train, predictTrainGrid, average = 'micro')}")
print(f"Recall test: {recall_score(Y_test, predictTestGrid, average = 'micro')}\n")

print(f"Matrix train: \n{confusion_matrix(Y_train, predictTrainGrid)}")
print(f"Matrix test: \n{confusion_matrix(Y_test, predictTestGrid)}\n")

print("\nRelatório de Classificação (Treino)")
print(classification_report(Y_train, predictTrainGrid))

print("\nRelatório de Classificação (Teste)")
print(classification_report(Y_test, predictTestGrid))

print(f"Best estimator grid: {gridSearch.best_estimator_}")
print(f"Best params grid: {gridSearch.best_params_}")

Accuracy train: 1.0
Accuracy test: 0.9666666666666667

Recall train: 1.0
Recall test: 0.9666666666666667

Matrix train: 
[[6037    0]
 [   0  759]]
Matrix test: 
[[1043    4]
 [  36  117]]


Relatório de Classificação (Treino)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6037
           1       1.00      1.00      1.00       759

    accuracy                           1.00      6796
   macro avg       1.00      1.00      1.00      6796
weighted avg       1.00      1.00      1.00      6796


Relatório de Classificação (Teste)
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1047
           1       0.97      0.76      0.85       153

    accuracy                           0.97      1200
   macro avg       0.97      0.88      0.92      1200
weighted avg       0.97      0.97      0.96      1200

Best estimator grid: Pipeline(steps=[('scaler', StandardScaler()),
                