In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the CSV
df = pd.read_csv("datasets/water_quality.csv")

# Select features and label
X = df[["Ph","Turbidity","Temperature"]]  # or use raw R, G, B
y = df["Label"]

# Encode label
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Train classifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Detailed classification report
print("\nClassification Report:")
# print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.99

Classification Report:

Confusion Matrix:
[[1305   26]
 [   0 1219]]


In [3]:
import joblib
joblib.dump(model, "water_quality_model.pkl")
joblib.dump(le, "water_quality_label_encoder.pkl")


['water_quality_label_encoder.pkl']

In [None]:
model = joblib.load("color_tag_model.pkl")
le = joblib.load("label_encoder.pkl")