In [1]:
import pandas as pd
import numpy as np

def generate_enhanced_dataset(n_samples=10000, save_path="enhanced_predictive_maintenance_dataset.csv", seed=42):
    """
    Generate an enhanced synthetic dataset for predictive maintenance with additional metrics.

    Failure labels are assigned from the clean sensor values; measurement noise
    is added to the sensor columns afterwards, so the stored readings are a
    noisy view of the values that determined the label.

    Parameters:
    n_samples (int): Number of data samples to generate
    save_path (str or None): Path to save the CSV file. Pass None to skip
        writing a file and only return the DataFrame.
    seed (int or None): Value passed to np.random.seed. The default (42)
        reproduces the historical output; None reseeds NumPy non-deterministically.

    Returns:
    pandas.DataFrame: Generated dataset with columns equipment_id, timestamp,
        the eight sensor metrics, and the binary failure label
    """
    # NOTE: this seeds NumPy's global generator, which affects any later
    # np.random.* call made in the same process/kernel.
    np.random.seed(seed)

    # Generate synthetic data with expanded metrics.
    # The order of the np.random calls below fixes the random stream; do not reorder.
    data = {
        'equipment_id': [f'EQ{str(i).zfill(5)}' for i in range(n_samples)],
        'timestamp': pd.date_range(start='2024-01-01', periods=n_samples, freq='30min'),
        'temperature': np.random.normal(70, 10, n_samples),      # °F, normal operation ~70
        'vibration': np.random.normal(0.5, 0.1, n_samples),     # mm/s
        'pressure': np.random.normal(100, 20, n_samples),       # psi
        'runtime_hours': np.random.exponential(1000, n_samples),# hours
        'current': np.random.normal(15, 3, n_samples),          # Amps, motor current
        'noise_level': np.random.normal(60, 10, n_samples),     # dB, equipment noise
        'oil_quality': np.random.beta(2, 5, n_samples) * 100,   # % cleanliness, 0-100
        'wear_rate': np.random.exponential(0.01, n_samples),    # mm/month
        'failure': np.zeros(n_samples, dtype=int)
    }

    # Simulate failure conditions based on expanded metrics: a row is "at risk"
    # when any single metric crosses its threshold.
    at_risk = (
        (data['temperature'] > 85)
        | (data['vibration'] > 0.7)
        | (data['pressure'] > 130)
        | (data['runtime_hours'] > 1500)
        | (data['current'] > 20)
        | (data['noise_level'] > 80)
        | (data['oil_quality'] < 30)
        | (data['wear_rate'] > 0.05)
    )
    # Each at-risk row fails with probability 0.75. One uniform draw is taken
    # per at-risk row, in row order, which consumes the random stream exactly
    # like a row-by-row loop calling np.random.rand() would.
    draws = np.random.rand(int(at_risk.sum()))
    data['failure'][at_risk] = (draws > 0.25).astype(int)

    # Create DataFrame
    df = pd.DataFrame(data)

    # Add realistic noise to simulate data imperfections
    # (column, standard deviation) - iterated in this order to keep the stream stable.
    measurement_noise = [
        ('temperature', 2),
        ('vibration', 0.02),
        ('pressure', 5),
        ('current', 0.5),
        ('noise_level', 2),
        ('oil_quality', 5),
        ('wear_rate', 0.002),
    ]
    for column, sigma in measurement_noise:
        df[column] += np.random.normal(0, sigma, n_samples)

    # Clip values to ensure physical realism
    non_negative_columns = ['temperature', 'vibration', 'pressure', 'runtime_hours',
                            'current', 'noise_level', 'wear_rate']
    for column in non_negative_columns:
        df[column] = df[column].clip(lower=0)
    df['oil_quality'] = df['oil_quality'].clip(lower=0, upper=100)

    # Save to CSV (skipped when no path is given, so nothing is reported as saved)
    if save_path is not None:
        df.to_csv(save_path, index=False)
        print(f"Dataset saved to {save_path}")
    print(f"Dataset shape: {df.shape}")
    print(f"Failure rate: {df['failure'].mean():.2%}")
    print("\nDataset columns:", list(df.columns))

    return df

if __name__ == "__main__":
    # Build the dataset with the default settings, then show a preview and
    # summary statistics; each heading is followed by its table on a new line.
    dataset = generate_enhanced_dataset()
    print("\nFirst few rows of the dataset:", dataset.head(), sep="\n")
    print("\nBasic statistics:", dataset.describe(), sep="\n")

Dataset saved to enhanced_predictive_maintenance_dataset.csv
Dataset shape: (10000, 11)
Failure rate: 55.80%

Dataset columns: ['equipment_id', 'timestamp', 'temperature', 'vibration', 'pressure', 'runtime_hours', 'current', 'noise_level', 'oil_quality', 'wear_rate', 'failure']

First few rows of the dataset:
  equipment_id           timestamp  temperature  vibration    pressure  \
0      EQ00000 2024-01-01 00:00:00    72.611415   0.417893  107.486261   
1      EQ00001 2024-01-01 00:30:00    68.635069   0.478238  106.457127   
2      EQ00002 2024-01-01 01:00:00    76.190049   0.407369   83.809437   
3      EQ00003 2024-01-01 01:30:00    82.785237   0.505103  106.017007   
4      EQ00004 2024-01-01 02:00:00    66.507858   0.650506   67.944531   

   runtime_hours    current  noise_level  oil_quality  wear_rate  failure  
0    1783.009551  16.447449    45.750111    24.851429   0.025821        0  
1    2134.056179  15.114928    58.441357     9.652947   0.008965        0  
2     567.837737

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data(file_path="enhanced_predictive_maintenance_dataset.csv"):
    """
    Load the enhanced dataset and prepare train/test matrices for XGBoost.

    Parameters:
    file_path (str): Path to the dataset CSV

    Returns:
    tuple: (X_train_scaled, X_test_scaled, y_train, y_test, scaler, features)
        X_train_scaled, X_test_scaled: standardized feature arrays (80/20 split)
        y_train, y_test: matching 'failure' label Series
        scaler: StandardScaler fitted on the training features only
        features: list of the feature column names, in column order
    If file_path does not exist, an error message is printed and a tuple of
    six None values is returned instead.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please generate the dataset using generate_enhanced_dataset.py.")
        return None, None, None, None, None, None

    # Select features and target
    features = ['temperature', 'vibration', 'pressure', 'runtime_hours',
                'current', 'noise_level', 'oil_quality', 'wear_rate']
    X = df[features]
    y = df['failure']

    # Split data first so that no statistic used for preprocessing is
    # computed from rows that end up in the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handle missing values: impute both partitions with the TRAINING means.
    # (Imputing with the mean of the full dataset would leak test information.)
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale features (fit on train only, then apply to test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, features

def train_xgboost_model(X_train, y_train):
    """
    Fit an XGBoost classifier on the training data with a fixed set of hyperparameters.

    Parameters:
    X_train: Training features
    y_train: Training labels

    Returns:
    The fitted XGBClassifier
    """
    hyperparameters = {
        'n_estimators': 200,
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'eval_metric': 'logloss',
    }
    classifier = XGBClassifier(**hyperparameters)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_test, y_test):
    """
    Print evaluation metrics for the model and save a confusion-matrix heatmap.

    Prints the classification report and the ROC-AUC score computed from the
    positive-class probabilities, then writes the heatmap to
    'confusion_matrix.png' in the current working directory.

    Parameters:
    model: Trained XGBoost model
    X_test: Test features
    y_test: Test labels
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the probability of the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]

    print("\nXGBoost Model Evaluation:")
    print("Classification Report:")
    print(classification_report(y_test, predicted_labels))
    auc = roc_auc_score(y_test, positive_scores)
    print(f"ROC-AUC Score: {auc:.4f}")

    # Confusion matrix rendered as an annotated heatmap
    matrix = confusion_matrix(y_test, predicted_labels)
    fig = plt.figure(figsize=(6, 4))
    ax = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig('confusion_matrix.png')
    plt.close(fig)
    print("Confusion matrix saved as 'confusion_matrix.png'")

def plot_feature_importance(model, features):
    """
    Save a horizontal bar chart of the model's feature importances.

    Features are ordered from most to least important and the chart is
    written to 'feature_importance.png' in the current working directory.

    Parameters:
    model: Trained XGBoost model
    features: List of feature names
    """
    ranked = pd.DataFrame(
        {'Feature': features, 'Importance': model.feature_importances_}
    ).sort_values(by='Importance', ascending=False)

    fig = plt.figure(figsize=(10, 6))
    ax = sns.barplot(x='Importance', y='Feature', data=ranked)
    ax.set_title('XGBoost Feature Importance')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')
    fig.tight_layout()
    fig.savefig('feature_importance.png')
    plt.close(fig)
    print("Feature importance plot saved as 'feature_importance.png'")

def predict_new_data(model, scaler, new_data, features):
    """
    Print the failure probability and a maintenance recommendation for one reading.

    Parameters:
    model: Trained XGBoost model
    scaler: Fitted scaler
    new_data: New sensor data as a list or array, ordered like `features`
    features: List of feature names
    """
    # Wrap the single reading in a one-row frame with named columns.
    reading = pd.DataFrame([new_data], columns=features)
    failure_prob = model.predict_proba(scaler.transform(reading))[0][1]

    # Map the probability onto one of three advice tiers (>0.7, >0.3, otherwise).
    if failure_prob > 0.7:
        recommendation = "Recommendation: Schedule maintenance immediately."
    elif failure_prob > 0.3:
        recommendation = "Recommendation: Monitor closely and consider maintenance soon."
    else:
        recommendation = "Recommendation: Equipment appears stable."

    print("\nPrediction for new equipment data:")
    print(f"Failure Probability: {failure_prob:.2%}")
    print(recommendation)

def main():
    """Run the pipeline end to end: load data, train, evaluate, plot, and score one example."""
    train_X, test_X, train_y, test_y, fitted_scaler, feature_names = load_and_preprocess_data()
    if train_X is None:
        # The dataset file was missing; the loader has already printed the error.
        return

    print("Training XGBoost model...")
    classifier = train_xgboost_model(train_X, train_y)

    evaluate_model(classifier, test_X, test_y)
    plot_feature_importance(classifier, feature_names)

    # Example reading, in feature order:
    # [temp, vib, press, runtime, curr, noise, oil, wear]
    example_reading = [78, 0.65, 115, 1300, 17, 70, 65, 0.03]
    predict_new_data(classifier, fitted_scaler, example_reading, feature_names)


if __name__ == "__main__":
    main()

Training XGBoost model...

XGBoost Model Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.53      0.64       892
           1       0.71      0.91      0.79      1108

    accuracy                           0.74      2000
   macro avg       0.76      0.72      0.72      2000
weighted avg       0.76      0.74      0.73      2000

ROC-AUC Score: 0.7567
Confusion matrix saved as 'confusion_matrix.png'
Feature importance plot saved as 'feature_importance.png'

Prediction for new equipment data:
Failure Probability: 4.19%
Recommendation: Equipment appears stable.


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pickle
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data(file_path="enhanced_predictive_maintenance_dataset.csv"):
    """
    Load the enhanced dataset and prepare train/test matrices for XGBoost.

    Parameters:
    file_path (str): Path to the dataset CSV

    Returns:
    tuple: (X_train_scaled, X_test_scaled, y_train, y_test, scaler, features)
        X_train_scaled, X_test_scaled: standardized feature arrays (80/20 split)
        y_train, y_test: matching 'failure' label Series
        scaler: StandardScaler fitted on the training features only
        features: list of the feature column names, in column order
    If file_path does not exist, an error message is printed and a tuple of
    six None values is returned instead.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please generate the dataset using generate_enhanced_dataset.py.")
        return None, None, None, None, None, None

    # Select features and target
    features = ['temperature', 'vibration', 'pressure', 'runtime_hours',
                'current', 'noise_level', 'oil_quality', 'wear_rate']
    X = df[features]
    y = df['failure']

    # Split data first so that no statistic used for preprocessing is
    # computed from rows that end up in the test set.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Handle missing values: impute both partitions with the TRAINING means.
    # (Imputing with the mean of the full dataset would leak test information.)
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale features (fit on train only, then apply to test)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler, features

def train_xgboost_model(X_train, y_train):
    """
    Fit an XGBoost classifier on the training data with a fixed set of hyperparameters.

    Parameters:
    X_train: Training features
    y_train: Training labels

    Returns:
    The fitted XGBClassifier
    """
    hyperparameters = {
        'n_estimators': 200,
        'learning_rate': 0.05,
        'max_depth': 6,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'eval_metric': 'logloss',
    }
    classifier = XGBClassifier(**hyperparameters)
    classifier.fit(X_train, y_train)
    return classifier

def evaluate_model(model, X_test, y_test):
    """
    Print the classification report and ROC-AUC score for the model on the test set.

    Parameters:
    model: Trained XGBoost model
    X_test: Test features
    y_test: Test labels
    """
    predicted_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the probability of the positive class.
    positive_scores = model.predict_proba(X_test)[:, 1]

    print("\nXGBoost Model Evaluation:")
    print("Classification Report:")
    print(classification_report(y_test, predicted_labels))
    auc = roc_auc_score(y_test, positive_scores)
    print(f"ROC-AUC Score: {auc:.4f}")

def save_model_and_scaler(model, scaler, model_path="xgboost_model.pkl", scaler_path="scaler.pkl"):
    """
    Pickle the trained model and the fitted scaler to two separate files.

    Parameters:
    model: Trained XGBoost model
    scaler: Fitted scaler
    model_path (str): Path to save the model pickle file
    scaler_path (str): Path to save the scaler pickle file
    """
    # Model is written first, then the scaler; each file is closed before the next opens.
    for artifact, destination in ((model, model_path), (scaler, scaler_path)):
        with open(destination, 'wb') as handle:
            pickle.dump(artifact, handle)

    print(f"\nModel saved to {model_path}")
    print(f"Scaler saved to {scaler_path}")

def main():
    """Train and evaluate the model, pickle it with its scaler, then reload both and score one example."""
    train_X, test_X, train_y, test_y, fitted_scaler, feature_names = load_and_preprocess_data()
    if train_X is None:
        # The dataset file was missing; the loader has already printed the error.
        return

    print("Training XGBoost model...")
    classifier = train_xgboost_model(train_X, train_y)
    evaluate_model(classifier, test_X, test_y)

    # Persist both artifacts to their default paths.
    save_model_and_scaler(classifier, fitted_scaler)

    # Read the artifacts back from disk to demonstrate the round trip.
    with open("xgboost_model.pkl", 'rb') as model_file:
        restored_model = pickle.load(model_file)
    with open("scaler.pkl", 'rb') as scaler_file:
        restored_scaler = pickle.load(scaler_file)

    # Example reading, in feature order: [temp, vib, press, runtime, curr, noise, oil, wear]
    example_reading = [78, 0.65, 115, 1300, 17, 70, 65, 0.03]
    example_frame = pd.DataFrame([example_reading], columns=feature_names)
    failure_prob = restored_model.predict_proba(restored_scaler.transform(example_frame))[0][1]

    # Map the probability onto one of three advice tiers (>0.7, >0.3, otherwise).
    if failure_prob > 0.7:
        recommendation = "Recommendation: Schedule maintenance immediately."
    elif failure_prob > 0.3:
        recommendation = "Recommendation: Monitor closely and consider maintenance soon."
    else:
        recommendation = "Recommendation: Equipment appears stable."

    print("\nPrediction for new equipment data using loaded model:")
    print(f"Failure Probability: {failure_prob:.2%}")
    print(recommendation)


if __name__ == "__main__":
    main()

Training XGBoost model...

XGBoost Model Evaluation:
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.53      0.64       892
           1       0.71      0.91      0.79      1108

    accuracy                           0.74      2000
   macro avg       0.76      0.72      0.72      2000
weighted avg       0.76      0.74      0.73      2000

ROC-AUC Score: 0.7567

Model saved to xgboost_model.pkl
Scaler saved to scaler.pkl

Prediction for new equipment data using loaded model:
Failure Probability: 4.19%
Recommendation: Equipment appears stable.
