In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [3]:
# ----------------------------------------------
# Step 2: Build Predictive Models
# ----------------------------------------------

print("Loading the dataset...")
data = pd.read_csv('maintenance_dataset.csv')

# Convert timestamp to datetime
data['timestamp'] = pd.to_datetime(data['timestamp'])

# Add time-based features
data['hour'] = data['timestamp'].dt.hour
data['day_of_week'] = data['timestamp'].dt.dayofweek
data['is_weekend'] = data['timestamp'].dt.dayofweek >= 5

# Create additional features from sensor readings:
# per-machine moving average, moving standard deviation and first difference.
sensor_columns = ['temperature', 'pressure', 'vibration', 'rotation_speed', 'voltage']
# Window length in rows; this is 24 hours only if readings are hourly
# (the sample rows look hourly -- TODO confirm for the full file).
rolling_window = 24

# Group a time-ordered view by machine once. Results keep the original row
# labels, so assigning them back to `data` aligns on the index instead of on
# position -- the statistics land on the right rows even if the CSV is not
# already sorted by timestamp within each machine.
by_machine = data.sort_values('timestamp', kind='stable').groupby('machine_id', sort=False)

engineered_columns = []
for sensor in sensor_columns:
    readings = by_machine[sensor]

    # 24-hour moving average
    data[f'{sensor}_24h_avg'] = readings.transform(lambda s: s.rolling(rolling_window).mean())

    # 24-hour moving standard deviation
    data[f'{sensor}_24h_std'] = readings.transform(lambda s: s.rolling(rolling_window).std())

    # Rate of change (difference to the previous reading of the same machine)
    data[f'{sensor}_rate'] = readings.diff()

    engineered_columns += [f'{sensor}_24h_avg', f'{sensor}_24h_std', f'{sensor}_rate']

# Handle NaN values from rolling calculations.
# The first rows of every machine have no full window yet; backfill them from
# that same machine's first complete value (in time order) so one machine's
# warm-up rows never borrow values from another machine.
data[engineered_columns] = (
    data.sort_values('timestamp', kind='stable')
        .groupby('machine_id', sort=False)[engineered_columns]
        .bfill()
)
# Frame-wide catch-all for any gaps still left (e.g. missing raw readings),
# matching the original behaviour. `DataFrame.bfill()` replaces the deprecated
# `fillna(method='bfill')`.
data = data.bfill()

data.head()

Loading the dataset...


  data = data.fillna(method='bfill')


Unnamed: 0,timestamp,machine_id,temperature,pressure,vibration,rotation_speed,voltage,anomaly,hour,day_of_week,...,pressure_rate,vibration_24h_avg,vibration_24h_std,vibration_rate,rotation_speed_24h_avg,rotation_speed_24h_std,rotation_speed_rate,voltage_24h_avg,voltage_24h_std,voltage_rate
0,2025-01-01 00:00:00,1,59.024442,109.91001,0.525892,999.480285,215.190201,0,0,2,...,-1.373311,0.570318,0.086401,0.062491,1015.753703,16.040709,28.857834,217.573129,3.5897,1.759121
1,2025-01-01 01:00:00,1,60.292482,108.536698,0.588383,1028.338119,216.949321,0,1,2,...,-1.373311,0.570318,0.086401,0.062491,1015.753703,16.040709,28.857834,217.573129,3.5897,1.759121
2,2025-01-01 02:00:00,1,59.202351,106.758743,0.65191,1026.508565,220.355828,0,2,2,...,-1.777955,0.570318,0.086401,0.063527,1015.753703,16.040709,-1.829554,217.573129,3.5897,3.406507
3,2025-01-01 03:00:00,1,59.691122,106.364979,0.645014,1050.524592,221.593355,0,3,2,...,-0.393763,0.570318,0.086401,-0.006897,1015.753703,16.040709,24.016027,217.573129,3.5897,1.237526
4,2025-01-01 04:00:00,1,59.981519,104.132838,0.733439,1029.35102,222.242525,0,4,2,...,-2.232141,0.570318,0.086401,0.088426,1015.753703,16.040709,-21.173572,217.573129,3.5897,0.64917


In [4]:
# Sanity check: (rows, columns) of the frame after the feature-engineering cell
data.shape

(10800, 26)

In [5]:
# Assemble the model inputs: raw sensor readings, calendar fields, then the
# three derived statistics for each sensor. The resulting order is the column
# order the scaler and both models are fitted on.
raw_sensor_cols = ['temperature', 'pressure', 'vibration', 'rotation_speed', 'voltage']
calendar_cols = ['hour', 'day_of_week', 'is_weekend']
derived_cols = [
    f'{sensor_name}_{suffix}'
    for sensor_name in raw_sensor_cols
    for suffix in ('24h_avg', '24h_std', 'rate')
]
features = raw_sensor_cols + calendar_cols + derived_cols

X = data[features]
y = data['anomaly']

# Hold out 30% of the rows for final evaluation, keeping the anomaly ratio the
# same in both partitions (stratified) and the shuffle reproducible.
# NOTE(review): rows are shuffled at random although several features are
# 24-row windows over neighbouring rows, so train and test rows may share
# information -- consider a chronological split; confirm with the data owner.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [6]:
# ----------------------------------------------
# Model 1: Isolation Forest for Unsupervised Anomaly Detection
# ----------------------------------------------
# Labels are not used for fitting; the model only sees the scaled features.
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
isolation_forest.fit(X_train_scaled)

# Score the training rows and collect everything next to the unscaled features.
train_scores = isolation_forest.decision_function(X_train_scaled)
train_flags = isolation_forest.predict(X_train_scaled)
data_train = X_train.assign(
    anomaly_true=y_train,
    anomaly_score=train_scores,
    # predict() returns 1 for normal and -1 for anomaly; recode to 0 / 1
    anomaly_pred_if=(train_flags == -1).astype(int),
)

print("\nIsolation Forest Results on Training Data:")
if_truth = data_train['anomaly_true']
if_guess = data_train['anomaly_pred_if']
print(confusion_matrix(if_truth, if_guess))
print(classification_report(if_truth, if_guess))


Isolation Forest Results on Training Data:
[[6783  220]
 [ 399  158]]
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      7003
           1       0.42      0.28      0.34       557

    accuracy                           0.92      7560
   macro avg       0.68      0.63      0.65      7560
weighted avg       0.91      0.92      0.91      7560



In [7]:
# ----------------------------------------------
# Model 2: Random Forest for Supervised Anomaly Detection
# ----------------------------------------------
# 100 trees with a fixed seed, fitted on the scaled training partition.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Evaluate on the held-out partition
y_pred_rf = rf_classifier.predict(X_test_scaled)
rf_confusion = confusion_matrix(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Results on Test Data:")
print(rf_confusion)
print(rf_report)

# Pair every input column with its importance and rank from highest to lowest
feature_importance = (
    pd.DataFrame({'Feature': features, 'Importance': rf_classifier.feature_importances_})
    .sort_values('Importance', ascending=False)
)
print("\nTop 10 most important features:")
print(feature_importance.head(10))



Random Forest Results on Test Data:
[[3000    2]
 [   1  237]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3002
           1       0.99      1.00      0.99       238

    accuracy                           1.00      3240
   macro avg       1.00      1.00      1.00      3240
weighted avg       1.00      1.00      1.00      3240


Top 10 most important features:
                Feature  Importance
2             vibration    0.408735
14    vibration_24h_avg    0.179509
1              pressure    0.096648
0           temperature    0.048917
8   temperature_24h_avg    0.047891
11     pressure_24h_avg    0.044027
20      voltage_24h_avg    0.026643
4               voltage    0.024891
12     pressure_24h_std    0.020304
3        rotation_speed    0.015926


In [8]:

# Save the models and scaler.
# The scaler is stored alongside the estimators because both models were
# fitted on scaled inputs; new data must pass through it before prediction.
print("\nSaving models...")
joblib.dump(isolation_forest, 'isolation_forest_model.pkl')
joblib.dump(rf_classifier, 'random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print("Models saved successfully!")

# Save some test data for the app.
# Draw the sample from the held-out rows only (looked up through X_test's
# index so every original column is kept); sampling from the full frame
# would mostly hand the app rows the models were trained on.
held_out_rows = data.loc[X_test.index]
# Clamp the sample size so a small hold-out set cannot raise in sample().
test_sample = held_out_rows.sample(n=min(1000, len(held_out_rows)), random_state=42)
test_sample.to_csv('test_sample.csv', index=False)


Saving models...
Models saved successfully!
