<a href="https://colab.research.google.com/github/Digvijayapatro/FSM-INT-2023/blob/main/RUL_UPDATED_FINAL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Step 1: Load the data from all CSV files in the folders
data_folders = ['b1_1', 'b1_2', 'b2_1', 'b2_2', 'b3_1', 'b3_2']
all_data = []

for folder in data_folders:
    folder_path = os.path.join('/content/drive/MyDrive/bearing/learning', folder)
    # os.listdir() yields every directory entry in arbitrary order. Keep only
    # the *.csv names and sort them so the row order of the combined frame
    # (and therefore the seeded train/test split) is the same on every run,
    # and so stray non-CSV entries are never handed to read_csv.
    csv_files = sorted(
        entry for entry in os.listdir(folder_path)
        if entry.lower().endswith('.csv')
    )

    for csv_file in csv_files:
        csv_path = os.path.join(folder_path, csv_file)
        df = pd.read_csv(csv_path, header=None)  # Set header=None since we have no feature names
        all_data.append(df)

# Fail here with a clear message instead of letting the later concat step
# raise on an empty list.
if not all_data:
    raise FileNotFoundError(
        "No CSV files were found in any of the configured data folders."
    )

In [None]:
# Step 2: Stack every loaded frame row-wise into one DataFrame.
# ignore_index=True discards the per-file row labels and renumbers from 0.
full_data = pd.concat(
    all_data,
    axis=0,
    ignore_index=True,
)

In [None]:
# Step 3: Preprocess the data
from sklearn.preprocessing import StandardScaler

# The target is assumed to live in the right-most column of the combined frame.
target_column_idx = full_data.shape[1] - 1

# Pull the target out first, then keep everything else as the feature matrix.
y = full_data.iloc[:, target_column_idx]
X = full_data.drop(columns=[target_column_idx])

# Replace any missing feature value with the mean of its column.
X = X.fillna(X.mean())

# Bring every feature to zero mean / unit variance.
# NOTE(review): the column means and the scaler statistics are computed on all
# rows, before the train/test split in Step 4 -- consider fitting them on the
# training portion only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Step 4: Prepare the data for the remaining-useful-life model
# Hold out 20% of the rows for testing; the fixed seed keeps the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 5: Hyperparameter Tuning
# Base regressor whose hyperparameters are tuned; the seed makes each fit repeatable.
model = RandomForestRegressor(random_state=42)

# Candidate values per hyperparameter (4 * 5 * 3 * 3 * 2 = 360 combinations).
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# Randomized parameter optimization: evaluate a fixed-size random sample of the
# combinations rather than all 360 (x 5 folds = 1800 forest fits). Uses the
# RandomizedSearchCV imported in the notebook's first cell. The variable keeps
# the name `grid_search`; raise n_iter for a more thorough (slower) search.
grid_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=30,
    cv=5,
    scoring='r2',
    n_jobs=-1,
    random_state=42,  # same sampled combinations on every run
)

# Run the search on the training data only.
grid_search.fit(X_train, y_train)

# Best estimator (refit on the full training set by default) and its settings.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:")
print(best_params)


In [None]:
  # Step 6: Evaluate the model

# Predict on the held-out test set with the tuned model. The metrics in this
# cell and the plotting cells that follow all read y_pred_best, so it has to
# be assigned here before anything uses it.
y_pred_best = best_model.predict(X_test)

# Calculate the Mean Absolute Error (MAE)
mae_best = mean_absolute_error(y_test, y_pred_best)
print(f"Mean Absolute Error (MAE): {mae_best}")

# Calculate the Mean Squared Error (MSE)
mse_best = mean_squared_error(y_test, y_pred_best)
print(f"Mean Squared Error (MSE): {mse_best}")

# Calculate the Root Mean Squared Error (RMSE)
rmse_best = np.sqrt(mse_best)
print(f"Root Mean Squared Error (RMSE): {rmse_best}")

# Calculate R-squared (R²)
r2_best = r2_score(y_test, y_pred_best)
print(f"R-squared (R²): {r2_best}")



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Step 7: Visualization of Model Performance
# Scatter of actual vs. predicted values on the test set.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_best, alpha=0.6)
ax.set_xlabel('Actual Remaining Useful Life')
ax.set_ylabel('Predicted Remaining Useful Life')
ax.set_title('Actual vs. Predicted Values')
plt.show()

# Histogram of the residuals (actual minus predicted) with a KDE overlay.
residuals = y_test - y_pred_best
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(residuals, kde=True, ax=ax)
ax.set_xlabel('Residuals')
# histplot's default statistic is the per-bin count, not a density, so the
# axis is labelled accordingly.
ax.set_ylabel('Count')
ax.set_title('Residual Plot')
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Line graph comparing actual and predicted values, point by point.
point_index = range(len(y_test))

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(point_index, y_test, marker='o', label='Actual')
ax.plot(point_index, y_pred_best, marker='o', label='Predicted')
ax.set_xlabel('Data Point Index')
ax.set_ylabel('Remaining Useful Life')
ax.set_title('Actual vs. Predicted Values')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms showing how the actual and predicted values are distributed.
fig, ax = plt.subplots(figsize=(10, 6))
for values, series_label, bar_color in (
    (y_test, 'Actual', 'blue'),
    (y_pred_best, 'Predicted', 'orange'),
):
    ax.hist(values, bins=20, alpha=0.6, label=series_label, color=bar_color)
ax.set_xlabel('Remaining Useful Life')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Actual and Predicted Values')
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Residuals (actual minus predicted) plotted against the predicted values;
# the dashed red line marks zero error.
residuals = y_test - y_pred_best

ax = plt.gca()
ax.scatter(y_pred_best, residuals)
ax.axhline(y=0, color='r', linestyle='dashed')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Example evaluation results for two models (MAE, MSE, RMSE, R-squared).
# These are hard-coded illustrative values.
model_names = ["Model A", "Model B"]
mae_scores = [2.5, 3.0]
mse_scores = [10.0, 12.0]
rmse_scores = list(np.sqrt(mse_scores))  # derived from the MSE list so the two cannot drift apart
r2_scores = [0.8, 0.75]

# One row per metric, one column per model.
metric_scores = np.array([mae_scores, mse_scores, rmse_scores, r2_scores])

# Row labels, in the same order as the rows of metric_scores.
metric_names = ["MAE", "MSE", "RMSE", "R-squared"]

# Plotting the matrix as a heatmap
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(metric_scores, cmap='viridis')

# Models along the x-axis, metrics along the y-axis.
ax.set_xticks(np.arange(len(model_names)))
ax.set_yticks(np.arange(len(metric_names)))
ax.set_xticklabels(model_names)
ax.set_yticklabels(metric_names)

# Rotate the x-axis tick labels for better readability
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Annotate every cell with its value. viridis runs from dark (low) to bright
# (high), so use white text on the darker half of the value range and black
# text on the brighter half to keep every number legible.
midpoint = (metric_scores.min() + metric_scores.max()) / 2.0
for (row, col), value in np.ndenumerate(metric_scores):
    text_color = "w" if value < midpoint else "k"
    ax.text(col, row, f'{value:.2f}', ha="center", va="center", color=text_color)

ax.set_title("Model Evaluation Metrics")
ax.set_xlabel("Model")
ax.set_ylabel("Metric")
fig.colorbar(heatmap, ax=ax, shrink=0.7)
fig.tight_layout()
plt.show()
