In [None]:
## Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from pathlib import Path
# Optional: For Chart.js PNG export
from playwright.sync_api import sync_playwright
import json

In [None]:
# Ensure the folder that receives every saved artifact (scaler, model, plots) exists.
# `exist_ok=True` makes re-running this cell harmless.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)
output_dir_abs = os.path.abspath(output_dir)
print(f'Output directory created/verified at: {output_dir_abs}')

In [None]:
# Load and prepare data
FEATURE_COLUMNS = ['Chloride', 'Organic_Carbon', 'Solids', 'Sulphate', 'Turbidity', 'ph']
TARGET_COLUMN = 'Label'


def clean_dataset(raw):
    """Validate and clean the raw frame read from Data.csv.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame exactly as read from the CSV file.

    Returns
    -------
    tuple
        ``(clean, missing_counts)`` where ``clean`` is ``raw`` without the
        ``'Unnamed: 0'`` column (if present) and without any row containing a
        missing value, and ``missing_counts`` is the per-column count of
        missing values measured BEFORE those rows were dropped.

    Raises
    ------
    KeyError
        If any feature column or the label column is absent; the message
        lists the absent column names.
    """
    required = FEATURE_COLUMNS + [TARGET_COLUMN]
    absent = [col for col in required if col not in raw.columns]
    if absent:
        raise KeyError(', '.join(absent))
    trimmed = raw.drop(columns=['Unnamed: 0'], errors='ignore')
    # Count the gaps before dropping rows; counting afterwards always yields zeros.
    missing_counts = trimmed.isnull().sum()
    return trimmed.dropna(), missing_counts


try:
    raw_data = pd.read_csv('Data.csv')
    data, missing_counts = clean_dataset(raw_data)
    print('Data loaded successfully.')
    print('Missing Values:\n', missing_counts)
    print(f'Rows dropped because of missing values: {len(raw_data) - len(data)}')
    print('\nClass Distribution:\n', data[TARGET_COLUMN].value_counts(normalize=True))
    X = data[FEATURE_COLUMNS]
    y = data[TARGET_COLUMN]
except FileNotFoundError:
    print('Error: Data.csv not found. Please ensure the file exists in the working directory.')
except KeyError as e:
    print(f'Error: Column {e} not found in Data.csv. Required columns: Chloride, Organic_Carbon, Solids, Sulphate, Turbidity, ph, Label')

In [None]:
# Split and scale data
try:
    # Hold out 20% of the rows; `stratify=y` keeps the label proportions the same in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        test_size=0.2,
        random_state=42,
    )
    # Fit the scaler on the training rows only, then apply the same statistics to the test rows.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    # Persist the fitted scaler next to the other artifacts.
    scaler_path = os.path.join(output_dir, 'scaler.pkl')
    joblib.dump(scaler, scaler_path)
except NameError:
    print('Error: X or y not defined. Ensure the previous cell ran successfully.')
else:
    print('Data split and scaled successfully.')

In [None]:
# Define and train models
# Candidate classifiers keyed by display name; insertion order is the order
# in which they are trained, evaluated and plotted.
models = dict([
    ('Logistic Regression', LogisticRegression(class_weight='balanced', random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)),
    ('SVM', SVC(probability=True, class_weight='balanced', random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
])

# Fit each candidate on the scaled training split. A failure is reported and
# the loop moves on, so one bad model does not stop the others from training.
for name, model in models.items():
    try:
        model.fit(X_train_scaled, y_train)
    except Exception as e:
        print(f'Error training {name}: {e}')
    else:
        print(f'{name} trained successfully.')

In [None]:
# Evaluate models
# `results` maps each model name to its test-set metrics plus the confusion
# matrix (stored as nested lists so the dictionary stays JSON-friendly).
results = {}
for name, model in models.items():
    try:
        y_pred = model.predict(X_test_scaled)
        # The matrix is stored whole; it is deliberately not unpacked into
        # tn/fp/fn/tp because those values were never used and the unpack
        # raised an unrelated error whenever the matrix was not 2x2.
        cm = confusion_matrix(y_test, y_pred)

        results[name] = {
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1-Score': f1_score(y_test, y_pred),
            'Confusion Matrix': cm.tolist()
        }

        print(f'\n{name}:')
        for metric, value in results[name].items():
            # The confusion matrix is a nested list and cannot be formatted as a float.
            if metric != 'Confusion Matrix':
                print(f'{metric}: {value:.4f}')
    except Exception as e:
        # A model that failed to train or predict is reported and left out of `results`.
        print(f'Error evaluating {name}: {e}')

# Print results for debugging
print('\nResults dictionary:', results)

In [None]:
# Save best model
try:
    # Rank the evaluated models by F1-Score; `max` raises ValueError when
    # nothing was evaluated, which is reported below.
    f1_by_model = {model_name: scores['F1-Score'] for model_name, scores in results.items()}
    best_model_name = max(f1_by_model, key=f1_by_model.get)
    # Spaces in the display name are replaced so the file name has none.
    model_filename = 'best_model_' + best_model_name.replace(' ', '_') + '.pkl'
    joblib.dump(models[best_model_name], os.path.join(output_dir, model_filename))
    print(f'\nBest model ({best_model_name}) saved')
except ValueError:
    print('Error: No models evaluated. Ensure the previous cell ran successfully.')

In [None]:
# Visualization (Matplotlib/Seaborn)
try:
    if not results:
        raise ValueError('Results dictionary is empty. Ensure models were evaluated successfully.')
    fig, ax = plt.subplots(figsize=(12, 6))
    plotted_metrics = ['Precision', 'Recall', 'Accuracy', 'F1-Score']
    # Long-format table: one row per (model, metric) pair, as seaborn expects
    # for a grouped bar chart.
    metrics_df = pd.DataFrame([
        {'Model': model_name, 'Metric': metric_name, 'Value': scores[metric_name]}
        for model_name, scores in results.items()
        for metric_name in plotted_metrics
    ])
    sns.barplot(data=metrics_df, x='Metric', y='Value', hue='Model', ax=ax)
    ax.set_title('Model Performance Comparison')
    ax.set_ylim(0, 1)
    output_path = os.path.join(output_dir, 'model_comparison.png')
    fig.savefig(output_path, bbox_inches='tight', dpi=300)
    # The figure is written to disk only; close it so it does not stay open in the kernel.
    plt.close(fig)
    print(f'Plot saved successfully as {output_path}')
except Exception as e:
    print(f'Error generating Matplotlib plot: {e}')

# Optional: Chart.js Visualization and PNG Export
def build_chart_config(results):
    """Build the Chart.js bar-chart configuration for the evaluated models.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict holding at least the keys 'Precision',
        'Recall', 'Accuracy' and 'F1-Score'.

    Returns
    -------
    dict
        A JSON-serialisable Chart.js config with one dataset per model, in
        the iteration order of ``results``.
    """
    metric_labels = ['Precision', 'Recall', 'Accuracy', 'F1-Score']
    datasets = []
    for i, (name, scores) in enumerate(results.items()):
        # Same colour for fill and border; only the alpha channel differs.
        rgb = f'{(i*50 % 255)}, {(i*100 % 255)}, {(i*150 % 255)}'
        datasets.append({
            'label': name,
            # float() keeps the values serialisable whatever numeric type the metrics use.
            'data': [float(scores[metric]) for metric in metric_labels],
            'backgroundColor': f'rgba({rgb}, 0.6)',
            'borderColor': f'rgba({rgb}, 1)',
            'borderWidth': 1
        })
    return {
        'type': 'bar',
        'data': {'labels': metric_labels, 'datasets': datasets},
        'options': {
            # Draw the final frame immediately so the screenshot never catches
            # the bars in the middle of the entry animation.
            'animation': False,
            'plugins': {
                'title': {'display': True, 'text': 'Model Performance Comparison', 'font': {'size': 18}},
                'legend': {'position': 'top'}
            },
            'scales': {
                'y': {'beginAtZero': True, 'max': 1, 'title': {'display': True, 'text': 'Value'}},
                'x': {'title': {'display': True, 'text': 'Metric'}}
            }
        }
    }


def render_chart_html(chart_config):
    """Return a standalone HTML page that draws ``chart_config`` with Chart.js.

    The config is embedded directly in the page instead of being fetched from
    a separate file: a page opened through ``file://`` cannot ``fetch`` local
    files in Chromium, and a relative path would be resolved against the HTML
    file's folder rather than the working directory. The page sets
    ``window.chartReady`` once the chart object has been created.
    """
    # '</' is escaped so a label can never terminate the surrounding <script> element.
    config_json = json.dumps(chart_config).replace('</', '<\\/')
    return f"""<!DOCTYPE html>
<html>
<head>
    <meta charset='utf-8'>
    <script src='https://cdn.jsdelivr.net/npm/chart.js'></script>
</head>
<body>
    <canvas id='myChart' width='800' height='400'></canvas>
    <script>
        const config = {config_json};
        new Chart(document.getElementById('myChart'), config);
        window.chartReady = true;
    </script>
</body>
</html>
"""


try:
    if not results:
        raise ValueError('Results dictionary is empty. Ensure models were evaluated successfully.')
    chart_config = build_chart_config(results)

    # Save Chart.js config (kept as a standalone artifact; the page below embeds its own copy)
    chart_config_path = os.path.join(output_dir, 'chart_config.json')
    with open(chart_config_path, 'w', encoding='utf-8') as f:
        json.dump(chart_config, f)

    # HTML to render Chart.js chart
    html_content = render_chart_html(chart_config)
    html_path = os.path.join(output_dir, 'chart.html')
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    # Use Playwright to save PNG
    # NOTE(review): the page loads Chart.js from a CDN, so this step needs network access.
    # NOTE(review): Playwright's sync API may refuse to start inside an already
    # running asyncio loop (as in a Jupyter kernel) - confirm in the target
    # environment; the error is reported by the handler below.
    chart_png_path = os.path.join(output_dir, 'model_comparison_chartjs.png')
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            # as_uri() produces a correctly escaped file:// URL on every platform.
            page.goto(Path(html_path).resolve().as_uri())
            # Wait for the page's own readiness flag instead of sleeping a fixed time;
            # this times out (and is reported) if Chart.js could not be loaded.
            page.wait_for_function('window.chartReady === true', timeout=10000)
            page.screenshot(path=chart_png_path)
        finally:
            browser.close()
    print(f'Chart.js chart saved as {chart_png_path}')
except Exception as e:
    print(f'Error generating Chart.js chart: {e}')