In [None]:
# ====================================================================
# 0. Import Libraries and Load Data
# ====================================================================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# NOTE: this silences *every* warning for the rest of the session (including
# scikit-learn convergence warnings and seaborn/pandas deprecation notices).
warnings.filterwarnings("ignore")

# Load the dataset
# The path is resolved relative to the working directory (the same base the
# `reports/figures/...` outputs further down are written to) instead of a
# machine-specific absolute path. Set the WATER_QUALITY_CSV environment
# variable to load the file from another location.
DATA_PATH = os.environ.get("WATER_QUALITY_CSV", "data/raw/water_quality.csv")

try:
    df = pd.read_csv(DATA_PATH)
    print("✅ Dataset loaded successfully.")
except FileNotFoundError:
    # Report the path that was actually tried, then fall back to an empty frame
    # so the `if not df.empty` guards in the following cells skip their work.
    print(f"❌ Error: '{DATA_PATH}' not found.")
    df = pd.DataFrame()

❌ Error: 'data/raw/water_quality.csv' not found.


## Phase 2: Data Preparation & Modeling
---
This notebook covers the data preprocessing and baseline model training steps. We will:
1. Select the features identified in the EDA phase.
2. Encode the target variable.
3. Split the data into stratified training and testing sets.
4. Apply Min-Max scaling to the features.
5. Train and evaluate several baseline classification models.

In [None]:
# ====================================================================
# 1. Feature and Target Selection
# ====================================================================
# Skipped entirely when the dataset failed to load (empty frame).
if not df.empty:
    # Predictor columns picked from the Phase 1 correlation analysis
    features = ['EC', 'Cl', 'TDS', 'Na']
    target = 'Water Quality Classification'

    # X holds the predictor columns, y the label column
    X, y = df[features], df[target]

    # Preview the first rows of each selection
    for heading, preview in (
        ("Selected Features (X):", X.head()),
        ("\nTarget Variable (y):", y.head()),
    ):
        print(heading)
        print(preview)

In [None]:
# ====================================================================
# 2. Encode the Categorical Target Variable
# ====================================================================
if not df.empty:
    # Fit the encoder on the label column, then map every label to its integer code
    le = LabelEncoder().fit(y)
    y_encoded = le.transform(y)

    # Keep the original label names; later cells use them as display names
    # in the classification reports and confusion-matrix plots
    class_names = le.classes_

    print("\nOriginal Target Labels:", class_names)
    print("Encoded Target Labels:", np.unique(y_encoded))

In [None]:
# ====================================================================
# 3. Stratified Train-Test Split
# ====================================================================
# 80% of the rows go to training and 20% to testing. Stratifying on
# `y_encoded` keeps the class proportions the same in both subsets, and the
# fixed random_state makes the split repeatable.
if not df.empty:
    split_options = dict(test_size=0.2, random_state=42, stratify=y_encoded)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

    print("Shape of training data (X_train):", X_train.shape)
    print("Shape of testing data (X_test):", X_test.shape)

In [None]:
# ====================================================================
# 4. Feature Scaling (Min-Max Scaler)
# ====================================================================
if not df.empty:
    # Learn the per-feature min/max from the training split only; the test
    # split is transformed with those same statistics.
    scaler = MinMaxScaler().fit(X_train)

    # Wrap the scaled arrays in DataFrames so the columns keep their feature
    # names (the rows get a fresh 0..n-1 index).
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns=features)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=features)

    print("\nScaled Training Data Head:")
    print(X_train_scaled.head())

In [None]:
# ====================================================================
# 5. Model Training and Baseline Evaluation
# ====================================================================

# Dictionary to hold the models we want to train (all with default
# hyperparameters; random_state is fixed where the estimator accepts one)
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Dictionary to store results: model name -> test-set accuracy
results = {}

if not df.empty:
    # Make sure the output folder exists; savefig does not create missing
    # directories and would otherwise fail on a fresh checkout.
    figures_dir = 'reports/figures'
    os.makedirs(figures_dir, exist_ok=True)

    # Encoded labels are 0..n_classes-1 (LabelEncoder). Passing them explicitly
    # keeps the report rows and confusion-matrix axes the same length as
    # `class_names` even if a rare class is absent from the test split.
    class_labels = np.arange(len(class_names))

    for name, model in models.items():
        print(f"--- Training {name} ---")

        # Train the model on the scaled training features
        model.fit(X_train_scaled, y_train)

        # Make predictions on the test set
        y_pred = model.predict(X_test_scaled)

        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(
            y_test, y_pred, labels=class_labels, target_names=class_names
        )

        # Store results
        results[name] = accuracy

        print(f"\n✅ Results for {name}:")
        print(f"Accuracy: {accuracy:.4f}")
        print("Classification Report:")
        print(report)

        # Visualize the Confusion Matrix
        cm = confusion_matrix(y_test, y_pred, labels=class_labels)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

        fig, ax = plt.subplots(figsize=(8, 6))
        disp.plot(ax=ax, cmap='Blues', xticks_rotation='vertical')
        ax.set_title(f'Confusion Matrix for {name}')

        # Save the figure (one PNG per model, spaces in the name become underscores)
        cm_path = f'{figures_dir}/confusion_matrix_{name.replace(" ", "_")}.png'
        fig.savefig(cm_path, bbox_inches='tight')
        print(f"✅ Confusion matrix saved to {cm_path}")

        plt.show()
        # Release the figure so they do not accumulate across loop iterations
        plt.close(fig)

In [None]:
# ====================================================================
# 6. Compare Model Performance
# ====================================================================
# `results` is only populated when the training loop ran, so an empty dict
# means there is nothing to compare.
if results:
    model_names = list(results)
    accuracies = [results[model_name] for model_name in model_names]

    # One bar per model; the y-axis is pinned to the full 0-1 accuracy range
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=model_names, y=accuracies, palette='mako', ax=ax)
    ax.set_title('Baseline Model Comparison by Accuracy')
    ax.set_xlabel('Model')
    ax.set_ylabel('Accuracy Score')
    ax.set_ylim(0, 1.0)
    plt.show()

    # Highest accuracy wins; on a tie the model that was trained first is kept
    best_model_name = max(results, key=lambda model_name: results[model_name])
    print(f"\n🏆 Best Performing Model (by Accuracy): {best_model_name} with an accuracy of {results[best_model_name]:.4f}")


In [None]:
# ====================================================================
# 7. Feature Importance Analysis (for best tree-based models)
# ====================================================================
# Let's analyze feature importance for Random Forest and Gradient Boosting

tree_models = {
    "Random Forest": models.get("Random Forest"),
    "Gradient Boosting": models.get("Gradient Boosting")
}

# The estimator objects exist in `models` whether or not training ran, so
# their mere presence says nothing about being fitted. Keep only the ones
# that carry `estimators_`, which scikit-learn ensembles expose after fit().
# When the dataset failed to load this leaves nothing to plot (and `features`
# would not be defined either), so the cell becomes a no-op instead of raising.
fitted_tree_models = {
    name: model
    for name, model in tree_models.items()
    if hasattr(model, "estimators_")
}

if fitted_tree_models:
    # savefig does not create missing directories
    os.makedirs('reports/figures', exist_ok=True)

for name, model in fitted_tree_models.items():
    # Pair each feature name with its importance, most important first
    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    fig = plt.figure(figsize=(10, 5))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')
    plt.title(f'Feature Importance for {name}')

    # Save the figure (one PNG per model, spaces in the name become underscores)
    fi_path = f'reports/figures/feature_importance_{name.replace(" ", "_")}.png'
    plt.savefig(fi_path, bbox_inches='tight')
    print(f"✅ Feature importance plot saved to {fi_path}")

    plt.show()
    # Release the figure before drawing the next model's plot
    plt.close(fig)

---
### End of Phase 2
*Summary & Next Steps:*
- We have successfully preprocessed the data and trained four baseline models.
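- Based on the accuracy comparison in Section 6, Random Forest and Gradient Boosting appeared to be the strongest candidates in our initial runs; confirm this against the outputs after re-running the notebook with the dataset available.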
- The next step (Phase 3) is to take the best-performing model and use **hyperparameter tuning** (`RandomizedSearchCV`) to optimize it for even better performance.
