In [2]:
# Import Essential Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing and Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Hugging Face and NLP Libraries
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.optim as optim

In [15]:
# 1. Data Loading
# Load Kaggle Machine Failure Dataset
sensor_data = pd.read_csv(r"C:\Users\ishir\Enhancing-Predictive-Maintenance-with-Multi-Source-Data-Integration\sensor_dataset\data.csv")

# Load FabNER dataset from Hugging Face
fabner_dataset = load_dataset('DFKI-SLT/fabner')

sensor_data.head()

Unnamed: 0,footfall,tempMode,AQ,USS,CS,VOC,RP,IP,Temperature,fail
0,0,7,7,1,6,6,36,3,1,1
1,190,1,3,3,5,1,20,4,1,0
2,31,7,2,2,6,1,24,6,1,0
3,83,4,3,4,5,1,28,6,1,0
4,640,7,5,6,4,0,68,6,1,0


In [16]:
# 2. Data Exploration and Preprocessing
# Sensor Data Cleaning
print("Sensor Data Info:")
print(sensor_data.info())

# Handle missing values in sensor data
sensor_data.dropna(inplace=True)

Sensor Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 944 entries, 0 to 943
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   footfall     944 non-null    int64
 1   tempMode     944 non-null    int64
 2   AQ           944 non-null    int64
 3   USS          944 non-null    int64
 4   CS           944 non-null    int64
 5   VOC          944 non-null    int64
 6   RP           944 non-null    int64
 7   IP           944 non-null    int64
 8   Temperature  944 non-null    int64
 9   fail         944 non-null    int64
dtypes: int64(10)
memory usage: 73.9 KB
None


In [19]:
# 3. FabNER Dataset Feature Extraction
# Prepare tokenizer and model for feature extraction
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

def extract_bert_features(texts):
    """
    Extract BERT embeddings for text features using PyTorch
    """
    # Prepare features list
    bert_features = []
    
    for text in texts:
        # Tokenize and encode text
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
        
        # Get BERT embeddings
        with torch.no_grad():
            outputs = bert_model(**inputs)
        
        # Use [CLS] token embedding (first token)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
        bert_features.append(embeddings[0])
    
    return np.array(bert_features)

# Extract text features from FabNER dataset
fabner_texts = []
for split in ['train', 'validation', 'test']:
    for item in fabner_dataset[split]:
        # Combine all entity texts and original text
        combined_text = ' '.join([
            str(item.get('text', '')),
            ' '.join([str(entity) for entity in item.get('entities', [])])
        ])
        fabner_texts.append(combined_text)

# Extract BERT features
fabner_features = extract_bert_features(fabner_texts)


In [20]:
# 4. Feature Preprocessing
# Numerical Feature Scaling
scaler = StandardScaler()
numerical_features = sensor_data.select_dtypes(include=['float64', 'int64']).columns
sensor_data[numerical_features] = scaler.fit_transform(sensor_data[numerical_features])

In [21]:
# 5. Multi-Modal Data Preparation
# Combine sensor data with FabNER features
X_sensor = sensor_data[numerical_features]
X_fabner = fabner_features[:len(X_sensor)]  # Ensure same length

# Combine sensor and FabNER features
X_combined = np.hstack([X_sensor, X_fabner])

# Prepare target variable
y = sensor_data['failure_label']


KeyError: 'failure_label'

In [None]:
# 6. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)


In [None]:

# 7. Multiple Model Approaches

# Approach 1: Random Forest Classifier
rf_classifier = RandomForestClassifier(
    n_estimators=100, 
    random_state=42, 
    n_jobs=-1
)

# Train Random Forest
rf_classifier.fit(X_train, y_train)

# Predictions
rf_pred = rf_classifier.predict(X_test)

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, rf_pred))

# Approach 2: Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(
    n_estimators=100, 
    learning_rate=0.1, 
    random_state=42
)

# Train Gradient Boosting
gb_classifier.fit(X_train, y_train)

# Predictions
gb_pred = gb_classifier.predict(X_test)

print("\nGradient Boosting Classification Report:")
print(classification_report(y_test, gb_pred))

# Approach 3: Support Vector Machine
svm_classifier = SVC(
    kernel='rbf', 
    probability=True, 
    random_state=42
)

# Train SVM
svm_classifier.fit(X_train, y_train)

# Predictions
svm_pred = svm_classifier.predict(X_test)

print("\nSupport Vector Machine Classification Report:")
print(classification_report(y_test, svm_pred))

In [None]:

# 8. Custom PyTorch Neural Network (Alternative to TensorFlow)
class MultiModalNetwork(nn.Module):
    def __init__(self, input_size):
        super(MultiModalNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.network(x)

# PyTorch Model Training
def train_pytorch_model(X_train, y_train, X_test, y_test):
    # Convert to PyTorch tensors
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1)
    X_test_tensor = torch.FloatTensor(X_test)
    y_test_tensor = torch.FloatTensor(y_test.values).unsqueeze(1)
    
    # Initialize model
    model = MultiModalNetwork(input_size=X_train.shape[1])
    
    # Loss and Optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Training
    epochs = 100
    for epoch in range(epochs):
        # Forward pass
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        
        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Print progress
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
    
    # Evaluate
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_predictions = (test_outputs > 0.5).float()
        accuracy = (test_predictions == y_test_tensor).float().mean()
        print(f'\nPyTorch Model Test Accuracy: {accuracy.item():.4f}')
    
    return model

# Train PyTorch Model
pytorch_model = train_pytorch_model(X_train, y_train, X_test, y_test)


In [None]:
# 9. Visualization of Model Comparisons
# Create performance comparison plot
plt.figure(figsize=(10, 6))
models = ['Random Forest', 'Gradient Boosting', 'SVM']
accuracies = [
    rf_classifier.score(X_test, y_test),
    gb_classifier.score(X_test, y_test),
    svm_classifier.score(X_test, y_test)
]

plt.bar(models, accuracies)
plt.title('Model Performance Comparison')
plt.xlabel('Models')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
for i, v in enumerate(accuracies):
    plt.text(i, v, f'{v:.2f}', ha='center', va='bottom')
plt.show()