In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import logging
import joblib
import shap
import optuna

# Configure logging: every record at INFO or above is written both to
# "model_training.log" and to the console stream.
file_handler = logging.FileHandler("model_training.log")
console_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[file_handler, console_handler],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Load processed data from CSV files
def load_processed_data():
    """Read the three processed CSV files from the current working directory.

    Returns:
        A 3-tuple of DataFrames in the order (cctv, access, intercom).

    Raises:
        Exception: whatever ``pd.read_csv`` raises (e.g. a missing file);
            the error is logged and then re-raised unchanged.
    """
    csv_names = (
        'processed_cctv_data.csv',
        'processed_access_data.csv',
        'processed_intercom_data.csv',
    )
    try:
        # Files are read one after another, in the order listed above.
        frames = tuple(pd.read_csv(csv_name) for csv_name in csv_names)
        logging.info("Successfully loaded processed data.")
        return frames
    except Exception as e:
        logging.error(f"Error loading data: {e}")
        raise

# Data split into features and labels
def split_data(cctv_data):
    """Separate the CCTV frame into a feature matrix and a label vector.

    Args:
        cctv_data: DataFrame containing the columns ``motion_detected``,
            ``is_online``, ``hour_of_day`` and ``label_failure``.

    Returns:
        ``(X, y)`` where ``X`` holds the three feature columns and ``y`` is
        the ``label_failure`` column.

    Raises:
        Exception: any lookup error (e.g. ``KeyError`` for a missing
            column) is logged and re-raised.
    """
    feature_columns = ['motion_detected', 'is_online', 'hour_of_day']
    label_column = 'label_failure'
    try:
        features = cctv_data[feature_columns]
        labels = cctv_data[label_column]
        logging.info("Data successfully split into features and labels.")
        return features, labels
    except Exception as e:
        logging.error(f"Error splitting data: {e}")
        raise

# Function to handle class imbalance with SMOTE
def handle_imbalance(X_train, y_train):
    """Oversample the training set with SMOTE (``random_state=42``).

    Args:
        X_train: training features.
        y_train: training labels; the post-resampling class counts are
            logged via ``np.bincount``.

    Returns:
        ``(X_res, y_res)`` as produced by ``SMOTE.fit_resample``.

    Raises:
        Exception: any failure during resampling or logging is logged and
            re-raised.
    """
    try:
        logging.info("Handling class imbalance using SMOTE...")
        X_res, y_res = SMOTE(random_state=42).fit_resample(X_train, y_train)
        class_counts = np.bincount(y_res)
        logging.info(f"Class distribution after SMOTE: {class_counts}")
        return X_res, y_res
    except Exception as e:
        logging.error(f"Error during imbalance handling: {e}")
        raise

# Function to visualize data distribution
def plot_data_distribution(y):
    """Show a count plot of the class labels in ``y``.

    Args:
        y: label vector; one bar is drawn per distinct value.

    Side effects:
        Draws on the current matplotlib figure and calls ``plt.show()``.
    """
    # Pass the labels by keyword. In seaborn >= 0.12 the only positional
    # parameter of countplot is `data`, so a bare positional Series is no
    # longer interpreted as the x variable.
    sns.countplot(x=y)
    plt.title("Data Distribution (Failures vs Non-Failures)")
    plt.xlabel("Failure (1 = Yes, 0 = No)")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

# Optuna objective function for hyperparameter tuning
def optuna_rf_objective(trial, X_train, y_train):
    """Score one Optuna trial of Random Forest hyperparameters.

    The trial's suggested hyperparameters configure a single
    ``RandomForestClassifier`` (``random_state=42``) that is refit on each of
    five stratified folds; the objective value is the mean validation
    accuracy over those folds.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: training features; indexed with ``.iloc``.
        y_train: training labels; indexed with ``.iloc``.

    Returns:
        Mean fold accuracy (higher is better).
    """
    hyperparams = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 10, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 4),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )
    forest = RandomForestClassifier(random_state=42, **hyperparams)
    folds = StratifiedKFold(n_splits=5)

    def _fold_accuracy(fit_idx, holdout_idx):
        # Refit the shared estimator on this fold's training part, then
        # score it on the held-out part.
        forest.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
        predicted = forest.predict(X_train.iloc[holdout_idx])
        return accuracy_score(y_train.iloc[holdout_idx], predicted)

    fold_scores = [
        _fold_accuracy(fit_idx, holdout_idx)
        for fit_idx, holdout_idx in folds.split(X_train, y_train)
    ]
    return np.mean(fold_scores)

# Optuna hyperparameter tuning
def tune_random_forest_optuna(X_train, y_train, n_trials=50, seed=42):
    """Tune a Random Forest with Optuna and return the refit best model.

    Args:
        X_train: training features passed to every trial and to the final fit.
        y_train: training labels.
        n_trials: number of Optuna trials to run (default 50, the value
            previously hard-coded).
        seed: seed for the TPE sampler so repeated runs explore the same
            hyperparameter sequence; pass ``None`` for an unseeded search.

    Returns:
        A ``RandomForestClassifier`` (``random_state=42``) built from the
        study's best parameters and fitted on the full training data.
    """
    logging.info("Tuning Random Forest with Optuna...")
    # TPE is Optuna's default single-objective sampler; constructing it
    # explicitly only adds the seed, making the search reproducible like the
    # rest of the pipeline (which pins random_state=42).
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(
        lambda trial: optuna_rf_objective(trial, X_train, y_train),
        n_trials=n_trials,
    )

    best_params = study.best_params
    logging.info(f"Best Random Forest parameters: {best_params}")
    rf_best = RandomForestClassifier(random_state=42, **best_params)
    rf_best.fit(X_train, y_train)

    return rf_best

# Train Logistic Regression with class weighting for imbalance
def train_logistic_regression(X_train, y_train):
    """Fit a class-weighted Logistic Regression model.

    Args:
        X_train: training features.
        y_train: training labels (any label values, not only 0..k-1).

    Returns:
        The fitted ``LogisticRegression`` (``max_iter=1000``,
        ``random_state=42``) using 'balanced' class weights.
    """
    logging.info("Training Logistic Regression with class weighting...")
    classes = np.unique(y_train)
    weights = compute_class_weight('balanced', classes=classes, y=y_train)
    # Key each weight by its actual class label. Keying by position
    # (0, 1, ...) is only correct when the labels happen to be 0..k-1.
    class_weight = dict(zip(classes.tolist(), weights))
    lr = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weight)
    lr.fit(X_train, y_train)
    return lr

# Model evaluation metrics
def evaluate_model(y_true, y_pred, y_prob):
    """Compute and log the standard classification metrics.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        y_prob: predicted scores passed to ``roc_auc_score``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, roc_auc)``.
    """
    # The four label-based metrics share the same (y_true, y_pred) call
    # shape; they are evaluated in this order.
    label_scorers = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy, precision, recall, f1 = [scorer(y_true, y_pred) for scorer in label_scorers]
    roc_auc = roc_auc_score(y_true, y_prob)

    logging.info(f"Model Evaluation:\nAccuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")
    return accuracy, precision, recall, f1, roc_auc

# Confusion matrix visualization
def plot_confusion_matrix(y_true, y_pred):
    """Render the confusion matrix of ``y_pred`` vs ``y_true`` as a heatmap.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Side effects:
        Draws an annotated heatmap and calls ``plt.show()``.
    """
    class_names = ['Non-Failure', 'Failure']
    matrix = confusion_matrix(y_true, y_pred)
    sns.heatmap(
        matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.tight_layout()
    plt.show()

# ROC curve visualization
def plot_roc_curve(y_true, y_prob):
    """Plot the ROC curve for ``y_prob`` with the AUC shown in the legend.

    Args:
        y_true: ground-truth labels.
        y_prob: predicted scores for the positive class.

    Side effects:
        Opens a new figure, draws the curve plus the diagonal reference
        line, and calls ``plt.show()``.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_prob)
    plt.figure()
    auc_value = roc_auc_score(y_true, y_prob)
    curve_label = 'ROC curve (area = %0.2f)' % auc_value
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=curve_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.show()

# SHAP-based model interpretability
def explain_model_with_shap(model, X_train):
    """Show a SHAP summary plot for the positive class of a tree model.

    Args:
        model: fitted tree-based model accepted by ``shap.TreeExplainer``.
        X_train: DataFrame of samples to explain; its columns supply the
            feature names.

    Side effects:
        Renders the SHAP summary plot.
    """
    logging.info("Generating SHAP values for model interpretability...")
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_train)

    # The return layout depends on the SHAP version: older releases give a
    # list with one (n_samples, n_features) array per class, newer ones a
    # single (n_samples, n_features, n_classes) array. Indexing the latter
    # with [1] would pick the second *sample*, not the positive class.
    if isinstance(shap_values, list):
        positive_class_values = shap_values[1]
    else:
        shap_values = np.asarray(shap_values)
        if shap_values.ndim == 3:
            positive_class_values = shap_values[:, :, 1]
        else:
            # Already a single (n_samples, n_features) matrix.
            positive_class_values = shap_values

    shap.summary_plot(positive_class_values, X_train, feature_names=X_train.columns)

# Train and evaluate models
def train_and_evaluate(X_train, X_test, y_train, y_test):
    """Train, evaluate, explain and persist both classifiers.

    Steps, in order: SMOTE-resample the training data, tune/fit the Random
    Forest, fit the Logistic Regression, predict on the test set with both
    models, evaluate and plot the Random Forest, produce its SHAP summary,
    evaluate and plot the Logistic Regression, then dump both models with
    joblib.

    Args:
        X_train: training features (pre-resampling).
        X_test: test features.
        y_train: training labels (pre-resampling).
        y_test: test labels.

    Returns:
        ``(rf_metrics, lr_metrics)``, each the tuple returned by
        ``evaluate_model``.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    def _predict(model):
        # Hard labels plus the probability of class index 1 on the test set.
        return model.predict(X_test), model.predict_proba(X_test)[:, 1]

    def _assess(predicted_labels, predicted_scores):
        # Metrics first, then the two diagnostic plots.
        metrics = evaluate_model(y_test, predicted_labels, predicted_scores)
        plot_confusion_matrix(y_test, predicted_labels)
        plot_roc_curve(y_test, predicted_scores)
        return metrics

    try:
        # Balance the classes before fitting either model
        X_resampled, y_resampled = handle_imbalance(X_train, y_train)

        # Fit both models on the resampled data
        rf_best = tune_random_forest_optuna(X_resampled, y_resampled)
        lr_model = train_logistic_regression(X_resampled, y_resampled)

        # Score the held-out test set with each model
        y_pred_rf, y_prob_rf = _predict(rf_best)
        y_pred_lr, y_prob_lr = _predict(lr_model)

        # Random Forest: metrics, plots, then SHAP explanations
        logging.info("Evaluating Random Forest...")
        rf_metrics = _assess(y_pred_rf, y_prob_rf)
        explain_model_with_shap(rf_best, X_train)

        # Logistic Regression: metrics and plots
        logging.info("Evaluating Logistic Regression...")
        lr_metrics = _assess(y_pred_lr, y_prob_lr)

        # Persist both fitted models
        joblib.dump(rf_best, 'best_random_forest_model.pkl')
        joblib.dump(lr_model, 'logistic_regression_model.pkl')
        logging.info("Models successfully saved.")

        return rf_metrics, lr_metrics
    except Exception as e:
        logging.error(f"Error during model training and evaluation: {e}")
        raise

# Main function
def main():
    """Run the full pipeline: load data, split, plot, train and report.

    Only the CCTV frame is used for modelling; the access and intercom
    frames are loaded but not consumed here.

    Raises:
        Exception: any failure is logged and re-raised.
    """
    try:
        # Load everything, then build features/labels from the CCTV data
        cctv_data, _access_data, _intercom_data = load_processed_data()
        features, labels = split_data(cctv_data)
        X_train, X_test, y_train, y_test = train_test_split(
            features,
            labels,
            test_size=0.2,
            random_state=42,
            stratify=labels,
        )

        # Class balance of the full label set
        plot_data_distribution(labels)

        # Fit and score both models
        rf_metrics, lr_metrics = train_and_evaluate(X_train, X_test, y_train, y_test)

        logging.info(f"Random Forest Metrics: {rf_metrics}")
        logging.info(f"Logistic Regression Metrics: {lr_metrics}")

    except Exception as e:
        logging.error(f"Error in main function: {e}")
        raise

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
