In [None]:
# ======================
# 0. Import Libraries
# ======================
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import os

# Install a global "ignore" filter with no category/message restriction, so every
# warning raised after this point is suppressed (presumably to keep cell output tidy).
# NOTE(review): this also hides deprecation warnings from the plotting and data
# libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

## Phase 1: Foundation & Data Acquisition
---
This notebook covers the initial Exploratory Data Analysis (EDA) for the Water Quality dataset. The key objectives are to understand the data's structure, identify key features for modeling, and check for any data quality issues.


In [None]:
# ======================
# 1. Create Directories and Load Dataset
# ======================

# Ensure the output directory for reports and figures exists.
# exist_ok=True makes the call idempotent and removes the check-then-create race
# of a separate os.path.exists() test; it also fails loudly here if the path
# exists but is not a directory, instead of failing later when a figure is saved.
os.makedirs('reports/figures', exist_ok=True)

# Load the dataset using a relative path based on the project structure
try:
    df = pd.read_csv("data/raw/water_quality.csv")
    print("✅ Dataset loaded successfully.")
except FileNotFoundError:
    print("❌ Error: 'data/raw/water_quality.csv' not found.")
    print("Please ensure the dataset is in the correct directory.")
    # Fall back to an empty DataFrame so that `df` is always defined: the later
    # cells guard on `df.empty` or on column membership and skip their work
    # instead of raising NameError.
    df = pd.DataFrame()


In [None]:
# ======================
# 2. Basic Dataset Overview
# ======================
# Print a quick structural summary of `df`; nothing is printed when `df` is empty.
if not df.empty:
    print("📊 Shape of Dataset:", df.shape)
    print("\n🧾 Dataset Info:")
    df.info()  # writes its report directly to stdout

    # The remaining sections all follow the same "heading, then value" layout.
    # Each value is produced lazily so computation and printing stay interleaved
    # in heading order.
    overview_sections = (
        ("\n📈 Summary Statistics:", df.describe),
        ("\n📋 Columns in Dataset:", df.columns.tolist),
        # Per-column count of missing values
        ("\n❓ Missing Values Check:", lambda: df.isnull().sum()),
    )
    for heading, produce in overview_sections:
        print(heading)
        print(produce())

*Observation*: The `df.info()` and `df.isnull().sum()` outputs confirm that there are **no missing values** in the dataset, which simplifies the preprocessing stage.


In [None]:
# ======================
# 3. Identify Feature Types
# ======================
# Split the column names by dtype family. `numerical` is reused by the
# distribution-plot cell further down.
if not df.empty:
    categorical, numerical = (
        df.select_dtypes(include=dtype_family).columns.tolist()
        for dtype_family in ('object', 'number')
    )

    print("\n🔢 Numerical Columns:", numerical)
    print("🔠 Categorical Columns:", categorical)

In [None]:
# ======================
# 4. Identifier Check
# ======================
# Report every column whose values are all distinct as a likely identifier.
if not df.empty:
    print("\n🆔 Checking for Identifier Columns...")

    # Columns with no repeated value, in their original order.
    unique_valued = [name for name in df.columns if df[name].is_unique]
    for name in unique_valued:
        print(f"'{name}' is likely an identifier column as all its values are unique.")

    is_identifier_found = bool(unique_valued)
    if not is_identifier_found:
        print("No identifier columns found.")

## 5. Correlation and Feature Selection Insight
---
We analyze the correlation of numerical features with the `Water Quality Index (WQI)`. This helps in identifying the most influential features for predicting water quality, which is crucial for our feature selection process.


In [None]:
# ======================
# 5.1. WQI Correlation Analysis
# ======================
# Rank the numeric features by their correlation with WQI, then draw and save
# the full correlation matrix as an annotated heatmap.
if 'WQI' not in df.columns:
    # Nothing to correlate against; tell the reader and skip the analysis.
    print("⚠️ WQI column not found. Please verify dataset.")
else:
    # Pairwise correlations between numeric columns only.
    correlation = df.corr(numeric_only=True)
    wqi_column = correlation['WQI']
    wqi_corr = wqi_column.sort_values(ascending=False)
    print("\n🔗 Correlation with WQI:")
    print(wqi_corr)

    # Heatmap of the whole matrix, each cell annotated to two decimals.
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation, annot=True, cmap="coolwarm", fmt='.2f')
    plt.title("Correlation Heatmap of Water Quality Features")

    # Write the file first, then display the figure.
    plt.savefig('reports/figures/correlation_heatmap.png', bbox_inches='tight')
    print("\n✅ Correlation heatmap saved to 'reports/figures/correlation_heatmap.png'")

    plt.show()

*Finding*: The correlation analysis reveals that **Electrical Conductivity (EC), Total Dissolved Solids (TDS), Chloride (Cl), and Sodium (Na)** have the strongest correlations with the WQI. Based on this insight, these four features will be selected for building our classification model.


 ## 6. Target Variable Analysis
 ---
 Here, we examine the distribution of our target variable, `Water Quality Classification`, to understand the class balance. This is critical because a significant imbalance might require special handling techniques like stratified sampling or resampling.


In [None]:
# ======================
# 6.1. Target Variable Distribution
# ======================
# Plot and print how many rows fall into each water-quality class.
target_col = 'Water Quality Classification'
if target_col not in df.columns:
    print("⚠️ 'Water Quality Classification' column not found.")
else:
    # Counts per class, most frequent first; drives both the bar order and the printout.
    class_counts = df[target_col].value_counts()

    plt.figure(figsize=(10, 6))
    # NOTE(review): recent seaborn releases deprecate `palette` without `hue` —
    # confirm against the installed version before changing the call.
    sns.countplot(x=target_col, data=df, palette='viridis', order=class_counts.index)
    plt.title('Distribution of Water Quality Classification')
    plt.xlabel('Water Quality Classification')
    plt.ylabel('Count')
    plt.xticks(rotation=45)

    # Write the file first, then display the figure.
    plt.savefig('reports/figures/target_variable_distribution.png', bbox_inches='tight')
    print("\n✅ Target variable distribution plot saved to 'reports/figures/target_variable_distribution.png'")

    plt.show()

    print("\n🎯 Class Balance:")
    print(class_counts)

*Observation*: The classes are imbalanced. "Good" and "Excellent" have the highest counts, while other categories are less represented. This justifies the use of **stratified sampling** during the train-test split to ensure proportional representation of each class in both sets.


 ## 7. Numerical Feature Distribution Analysis
 ---
 We will now visualize the distributions of all numerical features to check for skewness and potential outliers.


In [None]:
# ======================
# 7.1. Visualize Numerical Feature Distributions
# ======================
# For every column listed in `numerical`, draw a histogram (with KDE) next to a
# box plot, save the pair under reports/figures/, and display it.
if not df.empty:
    print("\n📊 Visualizing Numerical Feature Distributions...")
    for col in numerical:
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

        # Histogram
        sns.histplot(df[col], kde=True, color='skyblue', ax=hist_ax)
        hist_ax.set_title(f'Histogram of {col}')

        # Box Plot
        sns.boxplot(y=df[col], color='lightgreen', ax=box_ax)
        box_ax.set_title(f'Box Plot of {col}')

        # Apply the layout BEFORE saving so the file on disk matches what is shown.
        fig.tight_layout()

        # A column name containing a path separator (e.g. a unit like "mg/L")
        # would be interpreted as a sub-directory and make savefig fail, so
        # separators are replaced. Names without separators are left unchanged.
        safe_name = str(col)
        for sep in (os.sep, os.altsep):
            if sep:
                safe_name = safe_name.replace(sep, '_')

        # Save the combined plot
        figure_path = f'reports/figures/distribution_{safe_name}.png'
        fig.savefig(figure_path, bbox_inches='tight')

        plt.show()
        # Release the figure explicitly so one figure per column does not
        # accumulate in memory on backends that keep figures open after show().
        plt.close(fig)

    print("\n✅ All numerical distribution plots saved to 'reports/figures/'.")

 ---
 ### **End of Phase 1 EDA**
 **Summary of Findings:**
 - The dataset is complete with no missing values.
 - The features most correlated with WQI are `EC`, `TDS`, `Cl`, and `Na`. These will be our predictors.
 - The target variable, `Water Quality Classification`, is imbalanced, necessitating the use of stratified sampling.
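 - The distribution plots show that some features are skewed and contain outliers, so feature scaling (e.g., Min-Max or Standard scaling) will be an essential step in Phase 2. Because Min-Max scaling is sensitive to extreme values, the choice of scaler should take these outliers into account.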
 ---
