In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the water-quality readings from CSV into the raw frame `df`.
# NOTE(review): absolute "/content/..." path — presumably a Colab upload location; confirm the
# file is present there before a fresh Restart & Run All.
df = pd.read_csv("/content/BKB_WaterQualityData_2020084.csv")

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Working copy of the raw frame without the Secchi depth, air-temperature and Unit_Id columns.
df1 = df.drop(columns=["Secchi Depth (m)", "Air Temp-Celsius", "Air Temp (?F)", "AirTemp (C)", "Unit_Id"])

In [None]:
# List the columns that remain after the drop.
df1.columns

In [None]:
# Summary statistics of the working frame.
df1.describe()

In [None]:
# Preview the first rows of the working frame.
df1.head()

In [None]:
def classify_pollution(row):
    """Attach per-parameter pollution categories to one row of readings.

    Reads 'Dissolved Oxygen (mg/L)', 'pH (standard units)', 'Water Temp (?C)'
    and 'Salinity (ppt)' and adds 'DO_Category', 'pH_Category',
    'Temp_Category' and 'Salinity_Category'.

    A missing reading (NaN/None) is labelled 'Unknown'. NaN compares False
    against every threshold, so without this check a missing value would fall
    through to the final ``else`` and be reported as 'Low Pollution'.

    Args:
        row: Mapping-like record (pandas Series or dict) holding the four
            measurement keys listed above.

    Returns:
        A copy of ``row`` with the four category keys added. The input is not
        modified, so the function is safe to use with ``DataFrame.apply``.
    """
    labelled = row.copy()

    def reading(column):
        """Return the measurement in `column`, or None when it is missing."""
        value = labelled[column]
        return None if pd.isna(value) else value

    # Dissolved oxygen: lower values are worse.
    dissolved_oxygen = reading('Dissolved Oxygen (mg/L)')
    if dissolved_oxygen is None:
        labelled['DO_Category'] = 'Unknown'
    elif dissolved_oxygen < 4.0:
        labelled['DO_Category'] = 'High Pollution'
    elif dissolved_oxygen <= 6.0:
        labelled['DO_Category'] = 'Moderate Pollution'
    else:
        labelled['DO_Category'] = 'Low Pollution'

    # pH: only the 6.0-8.5 band (inclusive) counts as low pollution.
    ph = reading('pH (standard units)')
    if ph is None:
        labelled['pH_Category'] = 'Unknown'
    elif 6.0 <= ph <= 8.5:
        labelled['pH_Category'] = 'Low Pollution'
    else:
        labelled['pH_Category'] = 'High Pollution'

    # Water temperature: only readings above 30 are flagged.
    # NOTE(review): the original had separate ">= 10" and "< 10" branches that both
    # produced 'Low Pollution'; confirm whether cold water was meant to get its own label.
    water_temp = reading('Water Temp (?C)')
    if water_temp is None:
        labelled['Temp_Category'] = 'Unknown'
    elif water_temp > 30:
        labelled['Temp_Category'] = 'High Pollution'
    else:
        labelled['Temp_Category'] = 'Low Pollution'

    # Salinity: higher values are worse.
    salinity = reading('Salinity (ppt)')
    if salinity is None:
        labelled['Salinity_Category'] = 'Unknown'
    elif salinity > 10:
        labelled['Salinity_Category'] = 'High Pollution'
    elif salinity > 1:
        labelled['Salinity_Category'] = 'Moderate Pollution'
    else:
        labelled['Salinity_Category'] = 'Low Pollution'

    return labelled

In [None]:
# Label every row with its per-parameter pollution categories (row-wise apply).
df1 = df1.apply(classify_pollution, axis='columns')

# Show each measurement next to the category derived from it.
df1.loc[:, [
    'Dissolved Oxygen (mg/L)', 'DO_Category',
    'pH (standard units)', 'pH_Category',
    'Water Temp (?C)', 'Temp_Category',
    'Salinity (ppt)', 'Salinity_Category',
]].head()

In [None]:
# Define the assign_overall_pollution function
def assign_overall_pollution(row):
    """Derive 'Overall_Pollution' from the four per-parameter category labels.

    Counts how many of 'DO_Category', 'pH_Category', 'Temp_Category' and
    'Salinity_Category' equal 'High Pollution': three or more gives 'High',
    one or two gives 'Moderate', none gives 'Low'.

    Args:
        row: Mapping-like record holding the four category keys.

    Returns:
        The same ``row`` object, with 'Overall_Pollution' set on it.
    """
    category_keys = ('DO_Category', 'pH_Category', 'Temp_Category', 'Salinity_Category')

    n_high = 0
    for key in category_keys:
        if row[key] == 'High Pollution':
            n_high += 1

    if n_high >= 3:
        overall = 'High'
    elif n_high >= 1:
        overall = 'Moderate'
    else:
        overall = 'Low'

    row['Overall_Pollution'] = overall
    return row

# Derive the overall pollution label from the per-parameter categories. Those categories
# were already attached to df1 when classify_pollution was applied in the earlier cell, so
# that (slow, row-wise) pass is not repeated here.
df1 = df1.apply(assign_overall_pollution, axis=1)

# Check the final result with overall pollution column
df1[['DO_Category', 'pH_Category', 'Temp_Category', 'Salinity_Category', 'Overall_Pollution']].head()


In [None]:
# Preview the frame with the category columns added.
df1.head()

In [None]:
# Count missing values per column.
df1.isnull().sum()

In [None]:
# Rebuild df1 from the raw frame `df`. Because this starts again from `df`, the category and
# 'Overall_Pollution' columns that were added to the previous df1 are not carried over.
df1 = df.drop(columns=["Secchi Depth (m)", "Air Temp-Celsius", "Air Temp (?F)", "AirTemp (C)", "Unit_Id"])

In [None]:
# Preview the rebuilt working frame.
df1.head()

In [None]:
# Rebuild the working frame from the raw data, this time also dropping the Field_Tech,
# DateVerified and WhoVerified columns ("Field_Tech" was previously listed twice).
df1 = df.drop(columns=[
    "Field_Tech", "DateVerified", "WhoVerified",
    "Secchi Depth (m)", "Air Temp-Celsius", "Air Temp (?F)", "AirTemp (C)", "Unit_Id",
])

In [None]:
# Preview the frame after the wider column drop.
df1.head()

In [None]:
# Missing values per column, before any imputation below.
df1.isnull().sum()

In [None]:
# Average salinity over the non-missing readings (NaN values are skipped).
mean_salinity = df1.loc[:, 'Salinity (ppt)'].mean(skipna=True)

# Report it
print(f"Mean Salinity: {mean_salinity}")

In [None]:
# Mean-impute the gaps in the salinity column.
df1['Salinity (ppt)'] = df1['Salinity (ppt)'].pipe(lambda col: col.fillna(col.mean()))

# Number of missing salinity values left
df1['Salinity (ppt)'].isna().sum()


In [None]:
# Preview the salinity column after imputation.
df1[["Salinity (ppt)"]].head()

In [None]:
# Mean-impute the gaps in the dissolved-oxygen column (no column is renamed here).
df1['Dissolved Oxygen (mg/L)'] = df1['Dissolved Oxygen (mg/L)'].pipe(
    lambda col: col.fillna(col.mean())
)

# Number of missing dissolved-oxygen values left
df1['Dissolved Oxygen (mg/L)'].isna().sum()

In [None]:
# Mean-impute the gaps in the pH column.
df1['pH (standard units)'] = df1['pH (standard units)'].pipe(lambda col: col.fillna(col.mean()))

# Number of missing pH values left
df1['pH (standard units)'].isna().sum()

In [None]:
# Mean-impute the gaps in the water-depth column.
df1['Water Depth (m)'] = df1['Water Depth (m)'].pipe(lambda col: col.fillna(col.mean()))

# Number of missing water-depth values left
df1['Water Depth (m)'].isna().sum()

In [None]:
# Mean-impute the gaps in the water-temperature column.
df1['Water Temp (?C)'] = df1['Water Temp (?C)'].pipe(lambda col: col.fillna(col.mean()))

# Number of missing water-temperature values left
df1['Water Temp (?C)'].isna().sum()

In [None]:
# Parse 'Time (24:00)' and keep only the time-of-day part; entries that cannot be parsed
# are coerced to missing values instead of raising.
df1['Time (24:00)'] = pd.to_datetime(df1['Time (24:00)'], errors='coerce').dt.time

# Fill missing times with the most frequent time (a mean is not meaningful for clock times).
# mode() is empty when no entry could be parsed, so guard before indexing it; in that case
# the column is left as-is and the count below stays non-zero.
time_modes = df1['Time (24:00)'].mode()
if not time_modes.empty:
    mode_time = time_modes[0]
    df1['Time (24:00)'] = df1['Time (24:00)'].fillna(mode_time)

# Check if missing values are filled
print(df1['Time (24:00)'].isnull().sum())

In [None]:
# Parse 'Read_Date' into datetimes; entries that cannot be parsed are coerced to NaT.
df1['Read_Date'] = pd.to_datetime(df1['Read_Date'], errors='coerce')

# Fill missing dates with the most frequent date. mode() is empty when every entry is NaT,
# so guard before indexing it; in that case the column is left as-is and the count below
# stays non-zero.
date_modes = df1['Read_Date'].mode()
if not date_modes.empty:
    mode_date = date_modes[0]
    df1['Read_Date'] = df1['Read_Date'].fillna(mode_date)

# Check if missing values are filled
df1['Read_Date'].isnull().sum()

In [None]:
# Coerce 'Site_Id' to numbers; anything non-numeric turns into NaN.
# NOTE(review): if Site_Id holds non-numeric labels, every value is coerced to NaN and then
# overwritten by the fill below — confirm this is intended.
site_id_numeric = pd.to_numeric(df1['Site_Id'], errors='coerce')

# Impute with the most common id, or with -1 when no numeric id exists at all (empty mode).
site_id_modes = site_id_numeric.mode()
site_id_fill = -1 if site_id_modes.empty else site_id_modes[0]
df1['Site_Id'] = site_id_numeric.fillna(site_id_fill)

# Number of missing Site_Id values left
df1['Site_Id'].isnull().sum()

In [None]:
# Missing values per column after the imputation cells above.
df1.isnull().sum()

In [None]:
# A reading is flagged as risky when any single parameter crosses its threshold:
# pH below 6.5 or above 8.5, dissolved oxygen below 5.0, salinity above 35,
# or water temperature above 30.
ph_values = df1['pH (standard units)']
risk_mask = (
    ph_values.lt(6.5)
    | ph_values.gt(8.5)
    | df1['Dissolved Oxygen (mg/L)'].lt(5.0)
    | df1['Salinity (ppt)'].gt(35)
    | df1['Water Temp (?C)'].gt(30)
)
df1['Contamination_Risk'] = risk_mask.astype(int)  # 1 = High risk, 0 = Low risk

# Class balance of the new label
print(df1['Contamination_Risk'].value_counts())

# Persist the labelled frame next to the notebook
df1.to_csv('labeled_water_data.csv', index=False)
print("✅ 'Contamination_Risk' column created and saved successfully.")


In [None]:
# Preview the frame with the 'Contamination_Risk' label.
df1.head()

In [None]:
# Modelling frame: drop the identifier and date/time columns so only the measurements and
# the label remain. `columns=` already names the axis, so no separate `axis` argument is passed.
df2 = df1.drop(columns=["Site_Id", "Read_Date", "Time (24:00)", "Year"])

In [None]:
# Preview the modelling frame.
df2.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Target is the 'Contamination_Risk' label; every other column of df2 is a feature.
# NOTE(review): the label was computed directly from pH, dissolved oxygen, salinity and
# water temperature, and those columns are still in X — the model is learning that rule
# back, so high scores are expected rather than informative.
y = df2['Contamination_Risk']
X = df2.drop(columns='Contamination_Risk')

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Inspect the target labels.
y

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep its class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit a 100-tree random forest on the training split (fit() returns the estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_pred = model.predict(X_test)

In [None]:
# Compute both summaries first, then print them.
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("🔍 Confusion Matrix:\n", conf_matrix)
print("\n📊 Classification Report:\n", class_report)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"✅ Model Accuracy Score: {accuracy:.4f}")

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score # Import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np # Import numpy for np.mean and np.std

# Rebind `model` to a fresh, unfitted forest with the same settings as before.
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 5-fold cross-validated accuracy over the full feature matrix
cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

# Per-fold scores plus their mean and spread
print(f"Cross-Validation Scores (5-fold): {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Accuracy: {cv_scores.std():.4f}")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

# Split data. Stratify on y, exactly like the split used for the evaluation above, so the
# model that gets saved is trained on the same rows as the model whose metrics were reported.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Save trained model to the working directory for the app cells below to load
joblib.dump(model, 'water_risk_model.pkl')


In [None]:
!pip install gradio

In [None]:
import gradio as gr
import numpy as np
import joblib

# Load the classifier from 'water_risk_model.pkl' in the working directory (the file written
# by the joblib.dump call in the training cell). This rebinds the global `model`.
model = joblib.load("water_risk_model.pkl")

def predict_water_risk(salinity, do, ph, depth, temp):
    """Classify one set of water readings with the loaded model.

    Args:
        salinity, do, ph, depth, temp: Numeric readings from the form. Any of
            them may be None when the corresponding number field was left empty.

    Returns:
        A user-facing message: the pollution-risk verdict, or an error message
        when a reading is missing.
    """
    readings = [salinity, do, ph, depth, temp]

    # An empty number field arrives as None; passing that to the model would raise inside
    # predict(), so report it to the user instead.
    if any(value is None for value in readings):
        return "❌ Error: please fill in all five fields."

    # One row, five columns.
    # NOTE(review): assumes the model was trained on columns in exactly this order — confirm
    # against X.columns.
    features = np.array([readings])
    prediction = model.predict(features)[0]
    return "⚠️ Water Pollution Risk" if prediction == 1 else "✅ Water is Safe"

# Single-page Gradio UI: five numeric inputs feed predict_water_risk and the verdict is
# shown in a textbox.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## 💧 Water Safety Prediction App")
    gr.Markdown("Enter water parameters below to check if the water is safe.")

    with gr.Row():
        # Labelled in ppt to match the 'Salinity (ppt)' training column (was "mg/L").
        salinity = gr.Number(label="Salinity (ppt)", value=1.0)
        do = gr.Number(label="Dissolved Oxygen (mg/L)", value=8.0)
        ph = gr.Number(label="pH", value=7.0)
        depth = gr.Number(label="Water Depth (m)", value=0.5)
        temp = gr.Number(label="Temperature (°C)", value=25.0)

    submit = gr.Button("Predict")
    result = gr.Textbox(label="Prediction Result")

    # Input order must match the parameter order of predict_water_risk.
    submit.click(
        predict_water_risk,
        inputs=[salinity, do, ph, depth, temp],
        outputs=result
    )

app.launch()


In [46]:
import gradio as gr
import numpy as np
import pandas as pd
import os
import joblib
from datetime import datetime

# Load your trained model.
# This rebinds the global `model`; the predict_water_risk defined below likewise replaces
# the five-argument version from the previous app cell.
model = joblib.load("water_risk_model.pkl")  # Make sure this file is in the same folder

# Prediction function
def predict_water_risk(username, salinity, do, ph, depth, temp):
    """Predict pollution risk for one reading and append it to the user's history CSV.

    Args:
        username: Free-text name typed by the user; used only to pick the
            history file. It is reduced to letters, digits, '-' and '_' before
            it becomes part of a file name; an empty name maps to "anonymous".
        salinity, do, ph, depth, temp: Numeric readings from the form. Any of
            them may be None when the corresponding number field was left empty.

    Returns:
        ``(message, history)``: the verdict and the user's full history as a
        DataFrame. On any failure, an error message and an empty DataFrame.
    """
    try:
        readings = [salinity, do, ph, depth, temp]

        # An empty number field arrives as None; report it instead of letting predict() fail.
        if any(value is None for value in readings):
            return "❌ Error: please fill in all five measurements.", pd.DataFrame()

        features = np.array([readings])
        prediction = model.predict(features)[0]
        result = "⚠️ Water Pollution Risk" if prediction == 1 else "✅ Water is Safe"

        # The username is untrusted input (the app is launched with a public share link) and
        # ends up in a file path. Replace every character that is not a letter, digit, '-' or
        # '_' so that values such as "../x" or "/tmp/x" cannot point outside the working
        # directory.
        cleaned = "".join(
            ch if ch.isalnum() or ch in "-_" else "_"
            for ch in str(username or "").strip()
        )
        safe_name = cleaned or "anonymous"
        history_path = f"{safe_name}_history.csv"

        # Save history
        new_entry = pd.DataFrame([{
            "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "Salinity": salinity,
            "DO": do,
            "pH": ph,
            "Depth": depth,
            "Temp": temp,
            "Prediction": result
        }])

        if os.path.exists(history_path):
            old_data = pd.read_csv(history_path)
            updated_data = pd.concat([old_data, new_entry], ignore_index=True)
        else:
            updated_data = new_entry

        updated_data.to_csv(history_path, index=False)

        return result, updated_data

    except Exception as e:
        # Best-effort UI handler: surface the failure in the result box instead of crashing.
        return f"❌ Error: {str(e)}", pd.DataFrame()

# Define interface: a username box, five numeric inputs, and outputs for the verdict and
# the user's accumulated prediction history.
with gr.Blocks() as demo:
    gr.Markdown("# 🌊 Water Safety Predictor App")
    gr.Markdown("Enter your details to check water safety and track your prediction history.")

    with gr.Row():
        username = gr.Textbox(label="Username", placeholder="Enter your name or ID")

    with gr.Row():
        # Labels carry the units of the training columns so users enter values on the
        # scale the model was fitted on.
        salinity = gr.Number(label="Salinity (ppt)")
        do = gr.Number(label="Dissolved Oxygen (mg/L)")
        ph = gr.Number(label="pH (standard units)")
        depth = gr.Number(label="Water Depth (m)")
        temp = gr.Number(label="Water Temperature (°C)")

    predict_btn = gr.Button("Predict")
    output_label = gr.Textbox(label="Prediction")
    history_table = gr.Dataframe(label="Your Prediction History")

    # Input order must match the parameter order of predict_water_risk; its two return
    # values map onto the two outputs.
    predict_btn.click(
        fn=predict_water_risk,
        inputs=[username, salinity, do, ph, depth, temp],
        outputs=[output_label, history_table]
    )

# Launch the app (share=True exposes it through a public link)
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6900435e9d16f2c978.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


