In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Loading data
data_path = '/content/brisbane_water_quality.csv'
data = pd.read_csv(data_path)

# The contamination label is derived from this column (see below).
target = 'Turbidity'
# Readings above this value are labelled as contaminated (example threshold).
turbidity_threshold = 2

# Rows without a turbidity measurement cannot be labelled. Drop them *before*
# imputation; otherwise the missing target would be replaced by the column
# mean and the row would receive a fabricated label.
# reset_index keeps a gap-free 0..n-1 index so that row labels and row
# positions remain interchangeable for any later lookup via X_test.index.
labelled_data = data.dropna(subset=[target]).reset_index(drop=True)

# Separate numeric and non-numeric columns; remaining gaps in the numeric
# feature columns are filled with the respective column mean.
numeric_data = labelled_data.select_dtypes(include=[np.number])
non_numeric_data = labelled_data.select_dtypes(exclude=[np.number])

# Reassign instead of inplace=True: numeric_data is derived from another
# frame, and in-place mutation of derived frames is what triggers pandas'
# chained-assignment warnings.
numeric_data = numeric_data.fillna(numeric_data.mean())
data_cleaned = pd.concat([numeric_data, non_numeric_data], axis=1)

# Drop columns that are not predictive: the timestamp, the running record
# number and every per-sensor '[quality]' column.
quality_columns = [col for col in data_cleaned.columns if '[quality]' in col]
data_cleaned = data_cleaned.drop(columns=['Timestamp', 'Record number'] + quality_columns)

# Create a binary target for contamination based on the turbidity threshold
# (1 = above threshold, 0 = at or below it).
data_cleaned['contaminant_level'] = (data_cleaned[target] > turbidity_threshold).astype(int)

# Split into features and target. The raw turbidity column is removed from the
# features because the label is a direct function of it.
X = data_cleaned.drop(columns=[target, 'contaminant_level'])
y = data_cleaned['contaminant_level']

# Train-test split (70 % train / 30 % test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train a RandomForestClassifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation on the held-out test set
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

# Function to interpret and provide early warnings
def provide_warning(turbidity_value, is_contaminated):
    """Build the human-readable message for a single prediction.

    Args:
        turbidity_value: Value embedded verbatim in the message text.
        is_contaminated: Flag selecting the message; any truthy value yields
            the warning text, any falsy value yields the "safe" text.

    Returns:
        The formatted warning or all-clear string.
    """
    # Guard clause: handle the all-clear case first.
    if not is_contaminated:
        return f"Safe: Turbidity within acceptable range ({turbidity_value}). No immediate treatment necessary."

    return f"Warning: High Turbidity detected ({turbidity_value}). Potential contaminants present. Consider treatment options (e.g., filtration, boiling) before use."

# Applying the warning function to the test set
X_test_with_turbidity = X_test.copy()
# X_test.index holds row *labels*, so the turbidity values are looked up with
# the label-based .loc indexer. A positional .iloc lookup would only be
# correct while the index happens to be an untouched 0..n-1 range.
X_test_with_turbidity['Turbidity'] = data_cleaned.loc[X_test.index, target]
X_test_with_turbidity['Prediction'] = y_pred

# Generate warnings for each prediction. Only two columns are needed, so they
# are walked in lockstep instead of materialising every row with iterrows().
# NOTE: the name `warnings` would shadow the stdlib module of the same name if
# that module were imported in this session.
warnings = [
    provide_warning(turbidity_value, predicted_label)
    for turbidity_value, predicted_label in zip(
        X_test_with_turbidity['Turbidity'],
        X_test_with_turbidity['Prediction'],
    )
]

# Displaying results
print("Model Evaluation Results:\n")
print(f"Accuracy: {accuracy:.2f}\n")

print("Classification Report:")
print(classification_rep)

print("Confusion Matrix:")
print(confusion_mat)

print("\nAll Warnings:")
# Number the messages starting at 1 for readability.
for position, warning in enumerate(warnings, start=1):
    print(f"{position}. {warning}")




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4284. Safe: Turbidity within acceptable range (0.754). No immediate treatment necessary.
4289. Safe: Turbidity within acceptable range (2.475). No immediate treatment necessary.
4290. Safe: Turbidity within acceptable range (1.883). No immediate treatment necessary.
4292. Safe: Turbidity within acceptable range (3.941687547587734). No immediate treatment necessary.
4297. Safe: Turbidity within acceptable range (1.821). No immediate treatment necessary.
4303. Safe: Turbidity within acceptable range (1.868). No immediate treatment necessary.
4308. Safe: Turbidity within acceptable range (0.964). No immediate treatment necessary.
4309. Safe: Turbidity within acceptable range (2.01). No immediate treatment necessary.
4316. Safe: Turbidity within acceptable range (1.562). No immediate treatment necessary.
4323. Safe: Turbidity within acceptable range (2.051). No immediate treatment necessary.
4337. Safe: Turbidity within accep