In [None]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from fpdf import FPDF


In [None]:

# Load dataset
file_path = 'water_potability-updated.csv'  # Replace with your file path if needed
dataset = pd.read_csv(file_path)

# Coerce every column to a numeric dtype; values that cannot be parsed become NaN.
dataset = dataset.apply(pd.to_numeric, errors='coerce')

# A missing/unparsable label must not be imputed: filling 'Potability' with the
# column mean would yield a fractional value that is not a valid class label.
# Such rows carry no supervision signal, so drop them instead.
dataset = dataset.dropna(subset=['Potability'])

# Impute the remaining gaps (feature columns only, the label is complete now)
# with the per-column mean. Reassign instead of mutating in place so the cell
# stays idempotent when re-run.
# NOTE(review): the means are computed over the full dataset, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# imputed values — consider fitting the imputation on the training part only.
dataset = dataset.fillna(dataset.mean())


In [None]:

# Derive two ratio features from the raw measurements (element-wise division).
# NOTE(review): a zero in 'Conductivity' or 'Turbidity' would produce inf here — confirm the data excludes zeros.
dataset['Water_Hardness_Ratio'] = dataset['Hardness'].div(dataset['Conductivity'])
dataset['Chloramine_Turbidity_Ratio'] = dataset['Chloramines'].div(dataset['Turbidity'])

# Split the frame into the label column and the feature matrix (everything else).
y = dataset['Potability']
X = dataset.drop(columns=['Potability'])


In [None]:

# Hold out 30% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Build a 100-tree random forest (seeded) and fit it on the training part.
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)


In [None]:

# Predict on the held-out rows and compute the three evaluation artefacts.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print each heading followed by its value (print's default single-space separator).
for heading, metric in (
    ("Confusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
    ("\nAccuracy Score:", accuracy),
):
    print(heading, metric)


In [None]:

def predict_potability():
    """Prompt for water-quality measurements and print the model's verdict.

    Reads nine measurements from standard input, derives the two ratio
    features (Hardness / Conductivity and Chloramines / Turbidity), builds a
    single-row DataFrame and passes it to the module-level ``model``.

    Input handling:
        * Text that is not a number, or a non-finite value (NaN/inf), is
          rejected with a short message and the same prompt is shown again.
        * Conductivity and Turbidity are divisors of the ratio features, so a
          value of zero is rejected the same way instead of raising
          ``ZeroDivisionError``.

    Returns:
        None. The result ("Potable" / "Not Potable") is printed.
    """
    import math  # local import: only needed for the finiteness check below

    def _ask(label, nonzero=False):
        """Keep prompting with ``"<label>: "`` until a usable float is entered."""
        while True:
            raw = input(f"{label}: ")
            try:
                value = float(raw)
            except ValueError:
                print(f"  {raw!r} is not a number, please try again.")
                continue
            if not math.isfinite(value):
                print("  The value must be finite, please try again.")
                continue
            if nonzero and value == 0:
                print(f"  {label} is used as a divisor and cannot be zero, please try again.")
                continue
            return value

    print("Enter the following water quality parameters:")
    user_data = {
        "ph": _ask("pH"),
        "Hardness": _ask("Hardness"),
        "Solids": _ask("Solids"),
        "Chloramines": _ask("Chloramines"),
        "Sulfate": _ask("Sulfate"),
        "Conductivity": _ask("Conductivity", nonzero=True),
        "Organic_carbon": _ask("Organic Carbon"),
        "Trihalomethanes": _ask("Trihalomethanes"),
        "Turbidity": _ask("Turbidity", nonzero=True),
    }
    # Same derived features as the ones added to the training data.
    user_data["Water_Hardness_Ratio"] = user_data["Hardness"] / user_data["Conductivity"]
    user_data["Chloramine_Turbidity_Ratio"] = user_data["Chloramines"] / user_data["Turbidity"]

    # Prepare input for prediction
    user_df = pd.DataFrame([user_data])

    # If the estimator recorded its training column names, present the columns
    # in that exact order rather than relying on the dict's insertion order.
    # When the name sets differ the frame is left untouched so the estimator's
    # own validation reports the mismatch.
    expected = getattr(model, "feature_names_in_", None)
    if expected is not None and set(expected) == set(user_df.columns):
        user_df = user_df[list(expected)]

    prediction = model.predict(user_df)[0]

    print("\nPrediction: Water is", "Potable" if prediction == 1 else "Not Potable")


In [None]:

class PDFReport(FPDF):
    """FPDF document with a fixed title header, a page-number footer and a
    helper for writing titled text sections."""

    # Shared presentation constants (private to this class).
    _REPORT_FONT = 'Arial'
    _REPORT_TITLE = 'Water Potability Prediction Model Report'
    _ROW_HEIGHT = 10

    def header(self):
        """Write the centred, bold report title, then leave a vertical gap.

        Overrides ``FPDF.header``.
        """
        self.set_font(self._REPORT_FONT, 'B', 12)
        # Full-width cell (w=0), no border, line break afterwards, centred.
        self.cell(0, self._ROW_HEIGHT, self._REPORT_TITLE, 0, 1, 'C')
        self.ln(10)

    def footer(self):
        """Write the centred, italic page number near the bottom of the page.

        Overrides ``FPDF.footer``.
        """
        # NOTE(review): a negative y is presumably measured from the page bottom — confirm in the FPDF docs.
        self.set_y(-15)
        self.set_font(self._REPORT_FONT, 'I', 8)
        page_label = f'Page {self.page_no()}'
        self.cell(0, self._ROW_HEIGHT, page_label, 0, 0, 'C')

    def add_content(self, title, content):
        """Append one section: a bold heading followed by wrapped body text.

        Args:
            title: Heading text, written on its own line.
            content: Body text, written with ``multi_cell``.
        """
        # Heading
        self.set_font(self._REPORT_FONT, 'B', 12)
        self.cell(0, self._ROW_HEIGHT, title, 0, 1)
        self.ln(5)
        # Body
        self.set_font(self._REPORT_FONT, '', 10)
        self.multi_cell(0, self._ROW_HEIGHT, content)
        self.ln(10)


# Start a new report document with its first page.
pdf = PDFReport()
pdf.add_page()

# (heading, body) pairs, listed in the order they are written to the PDF.
# NOTE(review): the "high accuracy" wording in the last section is hard-coded
# and not derived from `accuracy` — confirm it matches the measured score.
report_sections = [
    (
        "Model Evaluation Results",
        f"Confusion Matrix:\n{conf_matrix}\n\n"
        f"Classification Report:\n{class_report}\n\n"
        f"Accuracy Score: {accuracy:.2f}",
    ),
    (
        "Added Features",
        "- Water Hardness Ratio: Hardness / Conductivity\n"
        "- Chloramine Turbidity Ratio: Chloramines / Turbidity",
    ),
    (
        "How to Use the Model",
        "You can input water quality parameters to predict whether the water is potable or not. "
        "The model has been trained with high accuracy and considers multiple features influencing water quality.",
    ),
]
for section_title, section_text in report_sections:
    pdf.add_content(section_title, section_text)

# Write the document to disk and report where it went.
output_path = "Water_Potability_Model_Report.pdf"
pdf.output(output_path)
print(f"PDF report generated and saved as {output_path}")
