In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
file_path = "D:\Internship\Task-2\predictive_maintenance.csv"
data = pd.read_csv(file_path)

# Define the target column
target_column = 'Target'

# Validate the target column
if target_column not in data.columns:
    raise KeyError(f"Column '{target_column}' not found in the dataset.")

# Drop non-relevant columns
data.drop(columns=['UDI', 'Product ID', 'Failure Type'], inplace=True, errors='ignore')

# Rolling statistics for additional features
features_to_roll = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]']
for feature in features_to_roll:
    data[f'{feature}_rolling_mean'] = data[feature].rolling(window=5).mean()
    data[f'{feature}_rolling_std'] = data[feature].rolling(window=5).std()
    data[f'{feature}_rolling_min'] = data[feature].rolling(window=5).min()
    data[f'{feature}_rolling_max'] = data[feature].rolling(window=5).max()

# Rate of change for key features
for feature in features_to_roll:
    data[f'{feature}_rate_of_change'] = data[feature].diff()

# Interaction features
data['torque_speed_interaction'] = data['Torque [Nm]'] * data['Rotational speed [rpm]']
data['temperature_ratio'] = data['Process temperature [K]'] / data['Air temperature [K]']

# Cumulative metrics
data['cumulative_tool_wear'] = data['Tool wear [min]'].cumsum()

# Drop rows with NaN values introduced by rolling and lagging operations
data.dropna(inplace=True)

# Define features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Check and handle non-numeric columns in X
non_numeric_columns = X.select_dtypes(include=['object', 'category']).columns
if len(non_numeric_columns) > 0:
    print(f"Non-numeric columns detected: {non_numeric_columns}")
    # Convert categorical columns to numeric using one-hot encoding
    X = pd.get_dummies(X, columns=non_numeric_columns, drop_first=True)

# Standardize numerical features
scaler = StandardScaler()
numerical_features = X.select_dtypes(include=['float64', 'int64']).columns
X[numerical_features] = scaler.fit_transform(X[numerical_features])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForest model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predictions and evaluation
y_pred = clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


  file_path = "D:\Internship\Task-2\predictive_maintenance.csv"


Non-numeric columns detected: Index(['Type'], dtype='object')
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1930
           1       0.93      0.60      0.73        70

    accuracy                           0.98      2000
   macro avg       0.96      0.80      0.86      2000
weighted avg       0.98      0.98      0.98      2000

Confusion Matrix:
 [[1927    3]
 [  28   42]]
