# 🌊 Water Quality Prediction Jupyter Notebook

In [None]:

# Install required packages (uncomment if needed)
# !pip install pandas numpy matplotlib seaborn scikit-learn


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

# Read the source CSV into the working DataFrame used by the later cells.
# NOTE(review): parsed with pandas' default comma delimiter — confirm this
# matches the file's actual separator.
csv_path = '/mnt/data/afa2e701598d20110228.csv'
df = pd.read_csv(csv_path)

# Preview the first rows (rendered as the cell output in a notebook).
df.head()


In [None]:

# When a 'date' column exists, parse it to datetime once and derive the
# calendar year and month as separate columns; otherwise leave df untouched.
if 'date' in df.columns:
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    df['year'] = parsed_dates.dt.year
    df['month'] = parsed_dates.dt.month


In [None]:

# Show the available column names, then the number of missing values
# found in each column.
print(df.columns)
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:

# Plot the pairwise correlations between the numeric columns as an
# annotated heatmap.
plt.figure(figsize=(10, 6))
numeric_columns = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


In [None]:

# Train a multi-output random-forest regressor that predicts the target
# columns from the feature columns, then evaluate it on a held-out split.

# Feature and target selection
features = ['NH4', 'BSK5', 'Suspended', 'O2', 'NO3', 'NO2']
targets = ['SO4', 'PO4', 'CL']

# Drop rows with NA in important columns (only the modelling columns are
# considered, so gaps in unrelated columns do not discard rows)
df_model = df[features + targets].dropna()

X = df_model[features]
y = df_model[targets]

# Train/test split: 20% held out; fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training: MultiOutputRegressor fits one forest per target column
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X_train, y_train)

# Prediction and evaluation
y_pred = model.predict(X_test)

# Aggregate scores: with several targets these are uniform averages over
# the target columns.
print("R2 Score:", r2_score(y_test, y_pred))
print("MSE:", mean_squared_error(y_test, y_pred))

# Per-target scores: the averages above can hide a poorly predicted target,
# and the averaged MSE can be dominated by whichever target has the largest
# scale. 'raw_values' returns one score per output column, in the same
# order as `targets` (y was built as df_model[targets]).
r2_per_target = r2_score(y_test, y_pred, multioutput='raw_values')
mse_per_target = mean_squared_error(y_test, y_pred, multioutput='raw_values')
for target_name, target_r2, target_mse in zip(targets, r2_per_target, mse_per_target):
    print(f"{target_name}: R2 = {target_r2:.4f}, MSE = {target_mse:.4f}")
