In [None]:
## Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

In [None]:
# Make sure the folder that receives saved artifacts exists.
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load and prepare data
# Read the raw table. 'Unnamed: 0' is dropped when present
# (presumably a leftover index column from an earlier CSV export -- confirm).
raw_data = pd.read_csv('Data.csv')
if 'Unnamed: 0' in raw_data.columns:
    raw_data = raw_data.drop(columns=['Unnamed: 0'])

# Report missing values BEFORE dropping incomplete rows; after dropna() the
# counts are zero by construction and the report would carry no information.
print("Missing Values:\n", raw_data.isnull().sum())

# Keep only fully populated rows for modelling.
data = raw_data.dropna()

# Relative class frequencies of the rows that are actually used for training.
print("\nClass Distribution:\n", data['Label'].value_counts(normalize=True))

# Feature matrix (six named columns) and target column.
X = data[['Chloride', 'Organic_Carbon', 'Solids', 'Sulphate', 'Turbidity', 'ph']]
y = data['Label']

In [None]:
# Split and scale data
# Hold out 20% of the rows for testing; the split is seeded and stratified
# on the label so both partitions keep the same class proportions.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training partition only, then apply
# the same fitted transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler next to the other saved artifacts.
joblib.dump(scaler, 'output/scaler.pkl')

In [None]:
# Define and train models
# Candidate classifiers keyed by display name. The first three use
# class_weight='balanced'; the KNN entry is configured with n_neighbors only.
models = {
    'Logistic Regression': LogisticRegression(
        random_state=42, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(
        random_state=42, n_estimators=100, class_weight='balanced'),
    'SVM': SVC(
        random_state=42, probability=True, class_weight='balanced'),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
}

# Fit each candidate on the scaled training partition, in dict order.
for model_name in models:
    models[model_name].fit(X_train_scaled, y_train)
    print(f"{model_name} trained successfully.")

In [None]:
# Evaluate models
# Score every fitted model on the scaled test partition and collect the
# metrics in `results`, keyed by model name.
results = {}
for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    # The matrix is kept whole: unpacking cm.ravel() into four cells was
    # unused and raises ValueError whenever the matrix is not exactly 2x2.

    results[name] = {
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        # Stored as a plain nested list rather than an ndarray
        # (presumably so the results can be serialized later -- confirm).
        'Confusion Matrix': cm.tolist()
    }

    # Print the scalar metrics only; the confusion matrix is not a number
    # and would not accept the ':.4f' format.
    print(f"\n{name}:")
    for metric, value in results[name].items():
        if metric != 'Confusion Matrix':
            print(f"{metric}: {value:.4f}")

In [None]:
# Save best model
# Rank the candidates by their test-set F1 score; on a tie, max() keeps the
# first name in dict order.
f1_by_model = {model_name: scores['F1-Score'] for model_name, scores in results.items()}
best_model_name = max(f1_by_model, key=f1_by_model.get)

# Spaces in the display name are replaced with underscores for the file name.
best_model_path = f'output/best_model_{best_model_name.replace(" ", "_")}.pkl'
joblib.dump(models[best_model_name], best_model_path)
print(f"\nBest model ({best_model_name}) saved")

In [None]:
# Visualization
# Build a long-form table with one row per (model, metric) pair, models in
# `results` order and metrics in the order listed below.
metric_names = ['Precision', 'Recall', 'Accuracy', 'F1-Score']
metric_rows = [
    {'Model': model_name, 'Metric': metric_name, 'Value': scores[metric_name]}
    for model_name, scores in results.items()
    for metric_name in metric_names
]
metrics_df = pd.DataFrame(metric_rows, columns=['Model', 'Metric', 'Value'])

# Grouped bar chart: metrics on the x-axis, one bar per model within each group.
fig = plt.figure(figsize=(12, 6))
ax = sns.barplot(x='Metric', y='Value', hue='Model', data=metrics_df)
ax.set_title('Model Performance Comparison')
ax.set_ylim(0, 1)

# Write the chart to disk, then close the figure (it is saved, not shown).
fig.savefig('output/model_comparison.png', bbox_inches='tight')
plt.close(fig)