In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Load data
# Assuming the data files are in the same directory as this script
train_file = 'train_FD001.txt'
test_file = 'test_FD001.txt'
rul_file = 'RUL_FD001.txt'

# Load training data
train_df = pd.read_csv(train_file, sep=' ', header=None)
train_df.drop(train_df.columns[[26, 27]], axis=1, inplace=True)  # Remove empty columns
train_df.columns = ['id', 'cycle', 'setting1', 'setting2', 'setting3', 's1', 's2', 's3', 's4', 's5', 's6', 's7',
                    's8', 's9', 's10', 's11', 's12', 's13', 's14', 's15', 's16', 's17', 's18', 's19', 's20', 's21']

# Load test data
test_df = pd.read_csv(test_file, sep=' ', header=None)
test_df.drop(test_df.columns[[26, 27]], axis=1, inplace=True)  # Remove empty columns
test_df.columns = train_df.columns  # Use the same column names as training data

# Load RUL data
rul_df = pd.read_csv(rul_file, sep=' ', header=None)
rul_df.drop(rul_df.columns[1], axis=1, inplace=True)  # Remove the second column (not needed)

# Feature engineering
def generate_features(df):
    df['RUL'] = df.groupby('id')['cycle'].transform(max) - df['cycle']
    return df

train_df = generate_features(train_df)
test_df = generate_features(test_df)

# Drop columns not needed for modeling
drop_cols = ['id', 'cycle', 'setting3', 's1', 's10', 's18', 's19']
train_df.drop(drop_cols, axis=1, inplace=True)
test_df.drop(drop_cols, axis=1, inplace=True)

# Normalize or standardize data if necessary (here using StandardScaler)
scaler = StandardScaler()
train_df[train_df.columns] = scaler.fit_transform(train_df[train_df.columns])
test_df[test_df.columns] = scaler.transform(test_df[test_df.columns])

# Split data into training and testing sets
X_train = train_df.drop('RUL', axis=1)
y_train = train_df['RUL']
X_test = test_df.drop('RUL', axis=1)
y_test = test_df['RUL']

# Model selection and training
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)

    # Cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-scores.mean())
    print(f"{name} RMSE: {rmse}")

# Evaluation
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Test RMSE: {rmse}")

# Evaluate models on test set
for name, model in models.items():
    print(f"Evaluating {name} on test set...")
    evaluate_model(model, X_test, y_test)




  df['RUL'] = df.groupby('id')['cycle'].transform(max) - df['cycle']
  df['RUL'] = df.groupby('id')['cycle'].transform(max) - df['cycle']


Training Linear Regression...
Linear Regression RMSE: 0.6579185097638631
Training Random Forest...
Random Forest RMSE: 0.6346651548890893
Training Gradient Boosting...
Gradient Boosting RMSE: 0.6266988851641903
Evaluating Linear Regression on test set...
Test RMSE: 1.1145191925246791
Evaluating Random Forest on test set...
Test RMSE: 1.1841171848253147
Evaluating Gradient Boosting on test set...
Test RMSE: 1.166507620443101
