In [None]:
## Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import io
import base64
from IPython.display import display, Image
from pathlib import Path

In [None]:
# Create output directory
# All artifacts written by the cells in this notebook (scaler, model, figures)
# are placed under this folder, relative to the current working directory.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists
resolved_output_dir = os.path.abspath(output_dir)
print(f'Output directory created/verified at: {resolved_output_dir}')

In [None]:
# Load and prepare data
# Reads Data.csv from the working directory, drops the 'Unnamed: 0' column when
# present (presumably an index written by an earlier export - confirm), removes
# rows with missing values, and exposes the feature matrix `X` and target `y`.
# Load problems are reported with a message rather than raised, in which case
# `X` and `y` stay undefined.
try:
    data = pd.read_csv('Data.csv')
    if 'Unnamed: 0' in data.columns:
        data = data.drop(columns=['Unnamed: 0'])

    # Count missing values BEFORE dropping incomplete rows. Counting afterwards
    # always yields zeros and hides how incomplete the raw file actually was.
    missing_counts = data.isnull().sum()
    rows_before_drop = len(data)
    data = data.dropna()

    print('Data loaded successfully.')
    print('Missing Values:\n', missing_counts)
    print(f'Rows dropped due to missing values: {rows_before_drop - len(data)}')
    # Class balance is reported on the cleaned rows, i.e. the data the models see.
    print('\nClass Distribution:\n', data['Label'].value_counts(normalize=True))
    X = data[['Chloride', 'Organic_Carbon', 'Solids', 'Sulphate', 'Turbidity', 'ph']]
    y = data['Label']
except FileNotFoundError:
    print('Error: Data.csv not found. Please ensure the file exists in the working directory.')
except KeyError as e:
    print(f'Error: Column {e} not found in Data.csv. Required columns: Chloride, Organic_Carbon, Solids, Sulphate, Turbidity, ph, Label')

In [None]:
# Split and scale data
# Holds out 20% of the rows as a test set (stratified on the label, fixed seed),
# fits the scaler on the training rows only, applies it to both splits, and
# persists the fitted scaler next to the other artifacts.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )

    scaler = StandardScaler()
    scaler.fit(X_train)  # statistics come from the training split only
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    scaler_path = os.path.join(output_dir, 'scaler.pkl')
    joblib.dump(scaler, scaler_path)

    print('Data split and scaled successfully.')
    print(f"Scaler saved as '{scaler_path}'")
except NameError:
    print('Error: X or y not defined. Ensure the previous cell ran successfully.')

In [None]:
# Define and train models
# Candidate classifiers keyed by display name; the same names are used as keys
# in the evaluation results and as subplot titles further down.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('SVM', SVC(random_state=42, probability=True)),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
])

# Fit each candidate on the scaled training split. A failure is reported and
# the loop moves on, so one bad model does not stop the others from training.
for name, model in models.items():
    try:
        model.fit(X_train_scaled, y_train)
    except Exception as exc:
        print(f'Error training {name}: {exc}')
    else:
        print(f'{name} trained successfully.')

In [None]:
# Evaluate models
# Scores every entry of `models` on the scaled test split and fills `results`,
# keyed by model name, with the metrics, the confusion matrix (as nested lists)
# and the model object itself.
# The confusion matrix is unpacked as 2x2 and the precision/recall/F1 scorers are
# called with their defaults, so a binary target is assumed; a model whose
# predictions do not fit that shape is reported and skipped.
results = {}
try:
    for name, model in models.items():
        # Evaluate each model in its own try block, mirroring the training cell:
        # a model that failed to train (or cannot be scored) is reported and
        # skipped instead of discarding the evaluation of all remaining models.
        try:
            y_pred = model.predict(X_test_scaled)
            cm = confusion_matrix(y_test, y_pred)
            tn, fp, fn, tp = cm.ravel()
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            sensitivity = recall  # sensitivity is another name for recall
            # Guard against a test split with no actual negatives.
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        except Exception as e:
            print(f'Error evaluating {name}: {e}')
            continue

        results[name] = {
            'Precision': precision,
            'Recall': recall,
            'Accuracy': accuracy,
            'F1-Score': f1,
            'Sensitivity': sensitivity,
            'Specificity': specificity,
            'Confusion Matrix': cm.tolist(),
            'Model': model
        }

        print(f'\n{name}:')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'F1-Score: {f1:.4f}')
        print(f'Sensitivity: {sensitivity:.4f}')
        print(f'Specificity: {specificity:.4f}')
        print(f'Confusion Matrix:\n{cm}')
    print('\nResults dictionary:', results)
except Exception as e:
    # Reached only for failures outside a single model's evaluation,
    # e.g. `models` not being defined because the training cell was skipped.
    print(f'Error evaluating models: {e}')

In [None]:
# Save best model
# The winner is the entry of `results` with the highest F1-Score (the first one
# encountered wins a tie); its fitted estimator is written to the output folder.
try:
    best_model_name, best_entry = max(
        results.items(),
        key=lambda item: item[1]['F1-Score'],
    )
    best_model = best_entry['Model']
    best_model_path = os.path.join(output_dir, 'best_water_quality_model.pkl')
    joblib.dump(best_model, best_model_path)
    print(f'Best model ({best_model_name}) saved as {best_model_path}')
except ValueError:
    # max() raises ValueError when `results` is empty.
    print('Error: No models evaluated. Ensure the previous cell ran successfully.')

In [None]:
# Visualize confusion matrices
# Draws one heatmap per evaluated model on a two-column grid, writes the figure
# to <output_dir>/confusion_matrices.png and shows the same PNG inline.
# `cm_img_base64` keeps the base64 text of the PNG for later use.
fig = None  # lets the finally clause know whether a figure needs closing
try:
    if not results:
        raise ValueError('Results dictionary is empty. Ensure models were evaluated successfully.')

    # Size the grid from the number of evaluated models instead of assuming
    # exactly four; with four models this is the same 2x2, 12x10 inch layout.
    n_cols = 2
    n_rows = (len(results) + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
    fig.suptitle('Confusion Matrices for Water Quality Classification', fontsize=16)
    axes = axes.ravel()
    for ax, (name, metrics) in zip(axes, results.items()):
        sns.heatmap(metrics['Confusion Matrix'], annot=True, fmt='d', cmap='Blues', ax=ax,
                    cbar=False, annot_kws={'size': 12})
        ax.set_title(name)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
    # Hide grid cells that have no model (odd number of models).
    for ax in axes[len(results):]:
        ax.set_visible(False)
    # Leave the top 5% of the canvas for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.95])

    # Render the PNG once and reuse the bytes for the file, the base64 text and
    # the inline display, rather than rasterizing the figure twice at 300 dpi.
    with io.BytesIO() as buffer:
        fig.savefig(buffer, format='png', bbox_inches='tight', dpi=300)
        cm_png_bytes = buffer.getvalue()

    cm_output_path = os.path.join(output_dir, 'confusion_matrices.png')
    Path(cm_output_path).write_bytes(cm_png_bytes)
    cm_img_base64 = base64.b64encode(cm_png_bytes).decode('utf-8')

    display(Image(data=cm_png_bytes))
    print(f'Confusion matrices plot saved as {cm_output_path} and embedded in notebook output.')
except Exception as e:
    print(f'Error generating confusion matrices plot: {e}')
finally:
    # Close the figure on the error path too, so a failed run neither leaks it
    # nor lets the notebook auto-display a half-drawn figure.
    if fig is not None:
        plt.close(fig)

In [None]:
# Visualize model comparison
# Grouped bar chart: one group per metric, one bar per model. The figure is
# saved as a PNG in the output folder and also shown inline.
try:
    if not results:
        raise ValueError('Results dictionary is empty. Ensure models were evaluated successfully.')

    metric_names = ['Precision', 'Recall', 'Accuracy', 'F1-Score', 'Sensitivity', 'Specificity']

    # Build the long-format table row by row: models in `results` order, and
    # for each model the metrics in `metric_names` order.
    model_column, metric_column, value_column = [], [], []
    for model_name, scores in results.items():
        for metric in metric_names:
            model_column.append(model_name)
            metric_column.append(metric)
            value_column.append(scores[metric])
    metrics_df = pd.DataFrame({
        'Model': model_column,
        'Metric': metric_column,
        'Value': value_column
    })

    mc_fig, mc_ax = plt.subplots(figsize=(12, 6))
    sns.barplot(x='Metric', y='Value', hue='Model', data=metrics_df, ax=mc_ax)
    mc_ax.set_title('Model Performance Comparison', fontsize=16)
    mc_ax.set_ylim(0, 1)
    # Park the legend outside the axes so it never covers the bars.
    mc_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    mc_fig.tight_layout()

    # Save to PNG file
    mc_output_path = os.path.join(output_dir, 'model_comparison.png')
    mc_fig.savefig(mc_output_path, bbox_inches='tight', dpi=300)

    # Render a second copy in memory and keep it as base64 text
    with io.BytesIO() as buffer:
        mc_fig.savefig(buffer, format='png', bbox_inches='tight', dpi=300)
        mc_img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    plt.close(mc_fig)

    display(Image(data=base64.b64decode(mc_img_base64)))
    print(f'Model comparison plot saved as {mc_output_path} and embedded in notebook output.')
except Exception as e:
    print(f'Error generating model comparison plot: {e}')

In [None]:
# Visualize feature importance for Random Forest
# Bar chart of the fitted Random Forest's feature importances, most important
# first. The figure is saved as a PNG in the output folder and shown inline.
try:
    if 'Random Forest' not in results:
        raise ValueError('Random Forest model not found in results.')

    fi_fig, fi_ax = plt.subplots(figsize=(8, 5))
    fi_fig.suptitle('Feature Importance for Water Quality Classification', fontsize=16)

    rf_model = results['Random Forest']['Model']
    # Label each importance with its feature name via the columns of X.
    rf_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
    ranked_importance = rf_importance.sort_values(ascending=False)
    ranked_importance.plot(kind='bar', color='skyblue', ax=fi_ax)
    fi_ax.set_title('Random Forest Feature Importance')
    fi_ax.set_ylabel('Importance')
    # Leave the top 5% of the canvas for the suptitle.
    fi_fig.tight_layout(rect=[0, 0, 1, 0.95])

    # Save to PNG file
    fi_output_path = os.path.join(output_dir, 'feature_importance.png')
    fi_fig.savefig(fi_output_path, bbox_inches='tight', dpi=300)

    # Render a second copy in memory and keep it as base64 text
    with io.BytesIO() as buffer:
        fi_fig.savefig(buffer, format='png', bbox_inches='tight', dpi=300)
        fi_img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
    plt.close(fi_fig)

    display(Image(data=base64.b64decode(fi_img_base64)))
    print(f'Feature importance plot saved as {fi_output_path} and embedded in notebook output.')
except Exception as e:
    print(f'Error generating feature importance plot: {e}')