In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import pickle

# Load dataset
data = pd.read_csv('../data/water_data.csv')

# Handle missing values: replace each NaN with the mean of its column.
# numeric_only=True keeps the mean well-defined if the CSV ever contains a
# non-numeric column (pandas >= 2.0 raises TypeError otherwise); such columns
# are simply left untouched.
# NOTE(review): this also fills gaps in the label column if it has any --
# confirm 'Potability' is complete, since a mean-filled label is not a class.
data = data.fillna(data.mean(numeric_only=True))

# Separate the target column from the predictor columns
target_name = 'Potability'
y = data[target_name]
X = data.drop(columns=[target_name])

# Train/test split
# Split the raw features first so the held-out rows are never seen while the
# scaling statistics are being estimated (avoids train/test leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale data
# Fit the scaler on the training rows only, then apply those same statistics
# to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics; kept so that any
# later code that reads X_scaled continues to work.
X_scaled = scaler.transform(X)

# Build a 100-tree random forest with a fixed seed and fit it on the training split
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)

# Save model & scaler
# Context managers make sure each file is flushed and closed as soon as the
# pickle has been written, instead of leaving the handles open.
with open('../models/water_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)
with open('../models/water_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("✅ Water Pollution Model Trained and Saved!")


✅ Water Pollution Model Trained and Saved!
