In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report
)
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set(color_codes=True)
plt.rcParams['figure.figsize'] = (15, 10)

# ============================================================================
# 1. DATA PREPARATION
# ============================================================================

print("="*70)
print("HYDRAULIC SYSTEM CONDITION MONITORING - RANDOM FOREST APPROACH")
print("="*70)

# Download and extract the data archive on first run only.
# urllib is used instead of wget so this also works outside Jupyter.
import urllib.request
import zipfile
import os

if not os.path.exists('data.zip'):
    print("\nDownloading data...")
    # Download under a temporary name and rename only after the transfer
    # finished: an interrupted download must never leave a truncated
    # 'data.zip' behind, because the existence check above would then skip
    # the download on every later run.
    partial_archive = 'data.zip.part'
    urllib.request.urlretrieve(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/00447/data.zip',
        partial_archive
    )
    os.replace(partial_archive, 'data.zip')
    print("✓ Download complete!")

# 'profile.txt' is used as the marker that the archive was already unpacked.
if not os.path.exists('profile.txt'):
    print("Extracting files...")
    with zipfile.ZipFile('data.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    print("✓ Extraction complete!")

# Import sensor values as features
sensor_names = ["PS1", "PS2", "PS3", "PS4", "PS5", "PS6",
                "EPS1", "FS1", "FS2", "TS1", "TS2", "TS3",
                "TS4", "VS1", "CE", "CP", "SE"]

print("\nLoading sensor data...")
# One array per sensor file, in the same order as sensor_names.
feature_list = [np.genfromtxt(f"{sensor}.txt") for sensor in sensor_names]

# Collapse each sensor's array to one value per row (mean over axis 1);
# zip keeps every name paired with its own array without index bookkeeping.
feature_means = {
    name: readings.mean(axis=1)
    for name, readings in zip(sensor_names, feature_list)
}

# Create features dataframe (one column per sensor)
df_features = pd.DataFrame(feature_means)

# Import target labels
# The column names are listed once and reused below, so the label frame, the
# feature/target split and the per-target dict cannot drift apart.
target_columns = [
    "Cooler_Condition",
    "Valve_Condition",
    "Internal_Pump_Leakage",
    "Hydraulic_Accumulator",
    "Stable_Flag"
]
target = np.genfromtxt(r"profile.txt")
df_targets = pd.DataFrame(target, columns=target_columns)

# A row-count mismatch would otherwise only surface later as NaN rows after
# the concat and a confusing integer-cast error; fail here with a clear message.
if len(df_targets) != len(df_features):
    raise ValueError(
        f"profile.txt has {len(df_targets)} rows but the sensor features "
        f"have {len(df_features)} rows"
    )

# Combine features and targets
df_final = pd.concat([df_features, df_targets], axis=1)

print("✓ Data loaded successfully!")
print(f"\nDataset shape: {df_final.shape}")
print(f"Number of samples: {len(df_final)}")
print(f"Number of features: {len(sensor_names)}")

# Prepare features and targets
# X is every column that is not a target, instead of a hard-coded "last 5".
X = df_final.drop(columns=target_columns)
# One integer-typed label Series per target, keyed by column name.
targets = {column: df_final[column].astype(int) for column in target_columns}

# ============================================================================
# 2. HELPER FUNCTIONS
# ============================================================================

def evaluate_model(model, X_train, X_test, y_train, y_test, target_name):
    """Print and return train/test classification metrics for a fitted model.

    Args:
        model: Already-fitted estimator exposing ``predict``.
        X_train: Features the model is scored on as the training split.
        X_test: Features the model is scored on as the test split.
        y_train: True labels for ``X_train``.
        y_test: True labels for ``X_test``.
        target_name: Label printed in the report header.

    Returns:
        dict with ``train_accuracy``, ``test_accuracy``, ``train_precision``,
        ``test_precision``, ``train_recall``, ``test_recall``, ``train_f1``,
        ``test_f1``, ``predictions`` (the test-split predictions) and
        ``overfit_gap`` (train accuracy minus test accuracy).
    """

    def _weighted_scores(y_true, y_pred):
        # Same averaging options for every metric: support-weighted across
        # classes, and 0 (rather than a warning) on a zero division.
        averaging = {'average': 'weighted', 'zero_division': 0}
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, **averaging),
            'recall': recall_score(y_true, y_pred, **averaging),
            'f1': f1_score(y_true, y_pred, **averaging),
        }

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Calculate metrics
    train_scores = _weighted_scores(y_train, y_pred_train)
    test_scores = _weighted_scores(y_test, y_pred_test)

    print(f"\n{'='*70}")
    print(f"RESULTS FOR: {target_name}")
    print(f"{'='*70}")
    for heading, split_scores in (("Training Set Metrics:", train_scores),
                                  ("Test Set Metrics:", test_scores)):
        print(f"\n{heading}")
        print(f"  Accuracy:  {split_scores['accuracy']:.4f}")
        print(f"  Precision: {split_scores['precision']:.4f}")
        print(f"  Recall:    {split_scores['recall']:.4f}")
        print(f"  F1-Score:  {split_scores['f1']:.4f}")

    # Check for overfitting
    overfit_gap = train_scores['accuracy'] - test_scores['accuracy']
    if overfit_gap > 0.1:
        print(f"\n⚠️  Warning: Possible overfitting detected (gap: {overfit_gap:.4f})")
    else:
        print(f"\n✓ Excellent generalization (gap: {overfit_gap:.4f})")

    # Recall and F1 are returned as well so callers can use every metric that
    # was computed; the keys that were returned before are unchanged.
    return {
        'train_accuracy': train_scores['accuracy'],
        'test_accuracy': test_scores['accuracy'],
        'train_precision': train_scores['precision'],
        'test_precision': test_scores['precision'],
        'train_recall': train_scores['recall'],
        'test_recall': test_scores['recall'],
        'train_f1': train_scores['f1'],
        'test_f1': test_scores['f1'],
        'predictions': y_pred_test,
        'overfit_gap': overfit_gap
    }

def plot_confusion_matrix(y_true, y_pred, target_name):
    """Save a confusion-matrix heatmap as a PNG in the working directory.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        target_name: Used in the plot title and (spaces replaced by
            underscores) in the output filename.
    """
    # Sorted union of the labels seen in either vector. Passing it explicitly
    # fixes the matrix row/column order so the same values can be used as tick
    # labels; otherwise the axes only show positional indices 0..k-1.
    labels = np.unique(np.concatenate([np.asarray(y_true), np.asarray(y_pred)]))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig = plt.figure(figsize=(8, 6))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
                    square=True, linewidths=1, linecolor='white',
                    xticklabels=labels, yticklabels=labels)
        plt.title(f'Confusion Matrix - {target_name}', fontsize=14, fontweight='bold')
        plt.ylabel('True Label', fontsize=12)
        plt.xlabel('Predicted Label', fontsize=12)
        plt.tight_layout()
        filename = f'rf_confusion_matrix_{target_name.replace(" ", "_")}.png'
        plt.savefig(filename, dpi=300, bbox_inches='tight')
    finally:
        # Close the figure even when drawing or saving fails.
        plt.close(fig)
    print(f"  ✓ Confusion matrix saved as '{filename}'")

def plot_feature_importance(model, feature_names, target_name, top_n=15):
    """Save a bar chart of the most important features and print the top ones.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        feature_names: Names indexed in the same order as
            ``model.feature_importances_``.
        target_name: Used in the plot title and (spaces replaced by
            underscores) in the output filename.
        top_n: Maximum number of features to plot. Values larger than the
            number of features are reduced to it; negative values plot none.
    """
    importances = model.feature_importances_
    # Never ask for more bars than there are features: the bar positions and
    # the importance values must have the same length or barh raises.
    n_shown = max(0, min(top_n, len(importances)))
    indices = np.argsort(importances)[::-1][:n_shown]

    fig = plt.figure(figsize=(12, 7))
    try:
        colors = plt.cm.viridis(np.linspace(0.3, 0.9, n_shown))
        bars = plt.barh(range(n_shown), importances[indices], color=colors)
        plt.yticks(range(n_shown), [feature_names[i] for i in indices])
        plt.xlabel('Importance Score', fontsize=12)
        plt.ylabel('Features', fontsize=12)
        plt.title(f'Top {n_shown} Feature Importances - {target_name}',
                  fontsize=14, fontweight='bold')
        # Most important feature at the top of the chart.
        plt.gca().invert_yaxis()

        # Add value labels on bars
        for bar in bars:
            width = bar.get_width()
            plt.text(width, bar.get_y() + bar.get_height()/2,
                    f'{width:.4f}', ha='left', va='center', fontsize=9)

        plt.tight_layout()
        filename = f'rf_feature_importance_{target_name.replace(" ", "_")}.png'
        plt.savefig(filename, dpi=300, bbox_inches='tight')
    finally:
        # Close the figure even when drawing or saving fails.
        plt.close(fig)
    print(f"  ✓ Feature importance saved as '{filename}'")

    # The console listing is capped at 10 entries regardless of top_n.
    n_listed = min(n_shown, 10)
    print(f"\n  Top {n_listed} Most Important Features:")
    for rank, idx in enumerate(indices[:n_listed], 1):
        print(f"    {rank:2d}. {feature_names[idx]:5s}: {importances[idx]:.4f}")

def plot_model_comparison(results_summary):
    """Save a 2x2 chart comparing the per-target models.

    Panels: train vs test accuracy, train vs test precision, targets ranked by
    test accuracy, and the train-minus-test accuracy gap per target. The chart
    is written to 'rf_model_comparison.png'.

    Args:
        results_summary: Mapping of target name to a dict holding at least
            ``train_accuracy``, ``test_accuracy``, ``train_precision``,
            ``test_precision`` and ``overfit_gap``.
    """
    target_names = list(results_summary)

    def _column(key):
        # One metric across all targets, in target_names order.
        return [results_summary[name][key] for name in target_names]

    train_acc = _column('train_accuracy')
    test_acc = _column('test_accuracy')
    train_prec = _column('train_precision')
    test_prec = _column('test_precision')

    # The score axes start at 0.8 to magnify small differences, but a bar at
    # or below the axis floor is invisible. Lower the floor only when some
    # score would otherwise be hidden.
    lowest = min(train_acc + test_acc + train_prec + test_prec, default=1.0)
    score_floor = 0.8 if lowest > 0.8 else max(0.0, lowest - 0.05)
    score_limits = [score_floor, 1.05]

    x = np.arange(len(target_names))
    width = 0.35
    stacked_labels = [name.replace('_', '\n') for name in target_names]

    def _train_test_bars(ax, train_vals, test_vals, metric, train_color, test_color):
        # Side-by-side train/test bars for one metric.
        ax.bar(x - width/2, train_vals, width, label='Train', color=train_color)
        ax.bar(x + width/2, test_vals, width, label='Test', color=test_color)
        ax.set_ylabel(metric)
        ax.set_title(f'{metric}: Train vs Test')
        ax.set_xticks(x)
        ax.set_xticklabels(stacked_labels, fontsize=8)
        ax.legend()
        ax.set_ylim(score_limits)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    try:
        fig.suptitle('Random Forest Model Performance Comparison',
                     fontsize=16, fontweight='bold')

        # Accuracy and precision comparison
        _train_test_bars(axes[0, 0], train_acc, test_acc, 'Accuracy', 'skyblue', 'coral')
        _train_test_bars(axes[0, 1], train_prec, test_prec, 'Precision', 'lightgreen', 'salmon')

        # Test accuracy ranking
        ranked = sorted(target_names,
                        key=lambda name: results_summary[name]['test_accuracy'],
                        reverse=True)
        ranked_acc = [results_summary[name]['test_accuracy'] for name in ranked]
        rank_ax = axes[1, 0]
        rank_ax.barh(range(len(ranked)), ranked_acc, color='mediumpurple')
        rank_ax.set_yticks(range(len(ranked)))
        rank_ax.set_yticklabels([name.replace('_', ' ') for name in ranked], fontsize=9)
        rank_ax.set_xlabel('Test Accuracy')
        rank_ax.set_title('Models Ranked by Test Accuracy')
        rank_ax.set_xlim(score_limits)

        # Overfitting gap, colored by the same thresholds as the dashed lines
        gaps = _column('overfit_gap')
        colors = ['green' if g < 0.05 else 'orange' if g < 0.1 else 'red' for g in gaps]
        gap_ax = axes[1, 1]
        gap_ax.bar(x, gaps, color=colors, alpha=0.7)
        gap_ax.set_ylabel('Overfitting Gap (Train - Test)')
        gap_ax.set_title('Overfitting Analysis')
        gap_ax.set_xticks(x)
        gap_ax.set_xticklabels(stacked_labels, fontsize=8)
        gap_ax.axhline(y=0.05, color='orange', linestyle='--', label='Warning threshold')
        gap_ax.axhline(y=0.1, color='red', linestyle='--', label='Critical threshold')
        gap_ax.legend(fontsize=8)

        plt.tight_layout()
        plt.savefig('rf_model_comparison.png', dpi=300, bbox_inches='tight')
    finally:
        # Close the figure even when drawing or saving fails.
        plt.close(fig)
    print("\n✓ Model comparison chart saved as 'rf_model_comparison.png'")

# ============================================================================
# 3. TRAIN RANDOM FOREST MODELS FOR ALL TARGETS
# ============================================================================

# Hyperparameters shared by every per-target forest. Defined once so the
# model that is evaluated and the model that is saved cannot drift apart.
rf_params = dict(
    n_estimators=100,          # Number of trees in the forest
    max_depth=15,              # Maximum depth of each tree
    min_samples_split=10,      # Minimum samples to split a node
    min_samples_leaf=5,        # Minimum samples at leaf node
    max_features='sqrt',       # Number of features considered at each split
    bootstrap=True,            # Use bootstrap samples
    oob_score=True,            # Out-of-bag score estimation
    n_jobs=-1,                 # Use all CPU cores
    random_state=1,
    verbose=0
)

section_rule = '─' * 70

results_summary = {}
# target name -> fitted forest; pickled in section 5 without refitting.
trained_models = {}

for target_name, y in targets.items():

    print(f"\n\n{'#'*70}")
    print(f"# TRAINING MODEL FOR: {target_name}")
    print(f"{'#'*70}")

    # Check class distribution
    print("\nClass distribution:")
    class_dist = y.value_counts().sort_index()
    print(class_dist)

    # Split data with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1, stratify=y
    )

    # ========================================================================
    # RANDOM FOREST MODEL
    # ========================================================================

    print(f"\n{section_rule}")
    print("Training Random Forest Model...")
    print(section_rule)

    # Create and train the Random Forest classifier
    rf_model = RandomForestClassifier(**rf_params)
    rf_model.fit(X_train, y_train)
    trained_models[target_name] = rf_model
    print("✓ Training completed!")

    # Out-of-bag score (another validation metric)
    print(f"  Out-of-Bag Score: {rf_model.oob_score_:.4f}")

    # Evaluate model
    results = evaluate_model(rf_model, X_train, X_test, y_train, y_test, target_name)
    results_summary[target_name] = results

    # Cross-validation on the training split only; the test split stays unseen.
    print(f"\n{section_rule}")
    print("Cross-Validation (5-fold):")
    print(section_rule)
    cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5,
                                 scoring='accuracy', n_jobs=-1)
    print(f"  CV Accuracy Scores: {[f'{s:.4f}' for s in cv_scores]}")
    print(f"  Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

    # Visualizations
    print(f"\n{section_rule}")
    print("Generating Visualizations...")
    print(section_rule)

    # 1. Confusion Matrix
    plot_confusion_matrix(y_test, results['predictions'], target_name)

    # 2. Feature Importance
    plot_feature_importance(rf_model, X.columns.tolist(), target_name, top_n=15)

    # Classification Report
    print(f"\n{section_rule}")
    print("Detailed Classification Report:")
    print(section_rule)
    print(classification_report(y_test, results['predictions'], zero_division=0))

    # Model complexity metrics
    print(f"\n{section_rule}")
    print("Model Complexity:")
    print(section_rule)
    tree_depths = [tree.get_depth() for tree in rf_model.estimators_]
    tree_leaves = [tree.get_n_leaves() for tree in rf_model.estimators_]
    print(f"  Number of Trees: {rf_model.n_estimators}")
    print(f"  Average Tree Depth: {np.mean(tree_depths):.1f}")
    print(f"  Average Leaves per Tree: {np.mean(tree_leaves):.1f}")
    print(f"  Total Features Used: {X.shape[1]}")

# ============================================================================
# 4. SUMMARY OF ALL MODELS
# ============================================================================

print(f"\n\n{'='*70}")
print("OVERALL SUMMARY - ALL TARGETS")
print(f"{'='*70}\n")

# One row per target; values are pre-formatted strings for display only.
summary_df = pd.DataFrame({
    name: {
        'Train Acc': f"{scores['train_accuracy']:.4f}",
        'Test Acc': f"{scores['test_accuracy']:.4f}",
        'Train Prec': f"{scores['train_precision']:.4f}",
        'Test Prec': f"{scores['test_precision']:.4f}",
        'Overfit Gap': f"{scores['overfit_gap']:.4f}"
    }
    for name, scores in results_summary.items()
}).T

print(summary_df)

# Best and worst performing models
print(f"\n{section_rule}")
test_accuracies = {k: v['test_accuracy'] for k, v in results_summary.items()}
best_model = max(test_accuracies, key=test_accuracies.get)
worst_model = min(test_accuracies, key=test_accuracies.get)

print(f"🏆 Best Performing Model: {best_model} ({test_accuracies[best_model]:.4f})")
print(f"📊 Worst Performing Model: {worst_model} ({test_accuracies[worst_model]:.4f})")
print(f"📈 Average Test Accuracy: {np.mean(list(test_accuracies.values())):.4f}")

# Generate comparison plots
plot_model_comparison(results_summary)

# ============================================================================
# 5. SAVE FINAL MODELS
# ============================================================================

print(f"\n{section_rule}")
print("Saving Models...")
print(section_rule)

import pickle

# Persist the forests fitted in section 3. Refitting here with the same split,
# seed and hyperparameters would only reproduce the same models at twice the
# training cost.
for target_name, final_model in trained_models.items():
    filename = f'rf_model_{target_name.replace(" ", "_")}.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(final_model, f)
    print(f"  ✓ Model saved: {filename}")

print(f"\n{'='*70}")
print("🎉 ALL PROCESSING COMPLETE!")
print(f"{'='*70}")

print("\nFiles generated:")
print(f"  • {len(results_summary)} confusion matrices (PNG)")
print(f"  • {len(results_summary)} feature importance plots (PNG)")
print("  • 1 model comparison chart (PNG)")
print(f"  • {len(trained_models)} trained models (PKL)")

HYDRAULIC SYSTEM CONDITION MONITORING - RANDOM FOREST APPROACH

Loading sensor data...
✓ Data loaded successfully!

Dataset shape: (2205, 22)
Number of samples: 2205
Number of features: 17


######################################################################
# TRAINING MODEL FOR: Cooler_Condition
######################################################################

Class distribution:
Cooler_Condition
3      732
20     732
100    741
Name: count, dtype: int64

──────────────────────────────────────────────────────────────────────
Training Random Forest Model...
──────────────────────────────────────────────────────────────────────
✓ Training completed!
  Out-of-Bag Score: 0.9972

RESULTS FOR: Cooler_Co