# Model Training Pipeline - Factory Guard AI

This notebook demonstrates how to build baseline models with scikit-learn and more advanced gradient-boosting models with XGBoost and LightGBM.

## 1. Setup and Data Preparation

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

from src.data.loader import DataLoader
from src.data.preprocessing import Preprocessor
from src.models.trainer import ModelTrainer
from src.utils.config import Logger

# Shared logger used by every cell below
logger = Logger()

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so the notebook also runs on older installs
# instead of failing on an unknown style.
_PLOT_STYLE = 'seaborn-v0_8-darkgrid'
plt.style.use(_PLOT_STYLE if _PLOT_STYLE in plt.style.available else 'seaborn-darkgrid')

## 2. Create and Prepare Dataset

In [None]:
# Build a synthetic sensor dataset; the global NumPy RNG is seeded so the
# draws are repeatable.
np.random.seed(42)
n_samples = 1000

# Column order matters: each call advances the global RNG, so the columns
# are drawn in exactly the order they appear in the frame.
sensor_readings = {}
sensor_readings['temperature'] = np.random.normal(98.6, 5, n_samples)
sensor_readings['pressure'] = np.random.normal(1013.25, 10, n_samples)
sensor_readings['vibration'] = np.random.exponential(2, n_samples)
sensor_readings['humidity'] = np.random.uniform(30, 80, n_samples)
sensor_readings['power_consumption'] = np.random.normal(500, 100, n_samples)
df = pd.DataFrame(sensor_readings)

# Target: a row is an anomaly when it exceeds either the temperature
# threshold or the vibration threshold.
high_temperature = df['temperature'] > 110
high_vibration = df['vibration'] > 8
df['is_anomaly'] = (high_temperature | high_vibration).astype(int)

# Features are every column except the label
y = df['is_anomaly']
X = df.drop(columns='is_anomaly')

logger.info(f"Dataset shape: {X.shape}")
logger.info(f"Target distribution: {y.value_counts().to_dict()}")

## 3. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap the scaled values back into DataFrames so the column names and the
# original row index are preserved.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

print(f"Train set size: {X_train_scaled.shape}")
print(f"Test set size: {X_test_scaled.shape}")

## 4. Baseline Models with Scikit-Learn

In [None]:
# Baseline trainers, keyed by the display name used in logs and results
models = {}
models['Logistic Regression'] = ModelTrainer(
    model_type='logistic_regression', max_iter=1000, random_state=42
)
models['SVM'] = ModelTrainer(model_type='svm', kernel='rbf', random_state=42)
models['Random Forest'] = ModelTrainer(
    model_type='random_forest', n_estimators=100, random_state=42
)

# Metrics for each model, keyed by display name; later cells add to this dict
baseline_results = {}

for name in models:
    trainer = models[name]
    logger.info(f"Training {name}...")

    # Fit on the scaled training split, score on the scaled test split
    trainer.train(X_train_scaled, y_train)
    metrics = trainer.evaluate(X_test_scaled, y_test)
    baseline_results[name] = metrics

    # Print one line per metric
    print(f"\n{name} Results:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

## 5. Gradient Boosting Models (XGBoost/LightGBM)

In [None]:
# Best-effort cell: any exception raised while building, training, evaluating
# or printing is logged as a warning so the rest of the notebook still runs.
try:
    logger.info("Training XGBoost...")

    # Hyperparameters forwarded to ModelTrainer as keyword arguments
    xgb_params = dict(
        learning_rate=0.1,
        max_depth=6,
        n_estimators=100,
        random_state=42,
    )
    xgb_trainer = ModelTrainer(model_type='xgboost', **xgb_params)

    # Fit on the scaled training split, score on the scaled test split
    xgb_trainer.train(X_train_scaled, y_train)
    xgb_metrics = xgb_trainer.evaluate(X_test_scaled, y_test)

    # Store the metrics alongside the baseline models for the comparison table
    baseline_results['XGBoost'] = xgb_metrics

    print("\nXGBoost Results:")
    for metric, value in xgb_metrics.items():
        print(f"  {metric}: {value:.4f}")
except Exception as exc:
    logger.warning(f"XGBoost training failed: {exc}")

In [None]:
# Best-effort cell: any exception raised while building, training, evaluating
# or printing is logged as a warning so the rest of the notebook still runs.
try:
    logger.info("Training LightGBM...")

    # Hyperparameters forwarded to ModelTrainer as keyword arguments
    lgb_params = dict(
        learning_rate=0.1,
        num_leaves=31,
        n_estimators=100,
        random_state=42,
    )
    lgb_trainer = ModelTrainer(model_type='lightgbm', **lgb_params)

    # Fit on the scaled training split, score on the scaled test split
    lgb_trainer.train(X_train_scaled, y_train)
    lgb_metrics = lgb_trainer.evaluate(X_test_scaled, y_test)

    # Store the metrics alongside the baseline models for the comparison table
    baseline_results['LightGBM'] = lgb_metrics

    print("\nLightGBM Results:")
    for metric, value in lgb_metrics.items():
        print(f"  {metric}: {value:.4f}")
except Exception as exc:
    logger.warning(f"LightGBM training failed: {exc}")

## 6. Model Comparison

In [None]:
# Compare all models: after the transpose there is one row per model and one
# column per metric.
comparison_df = pd.DataFrame(baseline_results).T
print("\n=== Model Comparison ===")
print(comparison_df.round(4))

# Visualize comparison: accuracy on the left, F1 on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Accuracy comparison
comparison_df['accuracy'].plot(kind='barh', ax=axes[0], color='steelblue')
axes[0].set_title('Accuracy Comparison')
axes[0].set_xlabel('Accuracy Score')

# F1 Score comparison
comparison_df['f1'].plot(kind='barh', ax=axes[1], color='coral')
axes[1].set_title('F1 Score Comparison')
axes[1].set_xlabel('F1 Score')

plt.tight_layout()
plt.show()

# Pick the winner by F1 rather than accuracy. The anomaly label is rare
# (it needs temperature > 110 or vibration > 8), so a model that never flags
# an anomaly would still score a high accuracy. Accuracy is still logged
# for reference.
best_model = comparison_df['f1'].idxmax()
logger.info(
    f"Best model: {best_model} "
    f"(F1: {comparison_df.loc[best_model, 'f1']:.4f}, "
    f"Accuracy: {comparison_df.loc[best_model, 'accuracy']:.4f})"
)

## 7. Cross-Validation Analysis