In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import  os # Charger train

# Root data folder and the sub-folder that holds the cleaned CSV files
Base_path = r'...................Engine\data'
processed_path = os.path.join(Base_path, 'processed')

# print(os.listdir(Base_path))   # uncomment to check the folder contents

# Load the cleaned train / test tables
train_csv_path = os.path.join(processed_path, "train_df_clean.csv")
test_csv_path = os.path.join(processed_path, "test_df_clean.csv")
train_df = pd.read_csv(train_csv_path, index_col=False)
test_df = pd.read_csv(test_csv_path, index_col=False)

# Work on a copy so that train_df stays exactly as loaded
df_train = train_df.copy()


#  Compute RUL_raw 

In [16]:
# 2. Compute RUL_raw (run-to-failure remaining life for each row)

# For each engine, find its last observed cycle (failure point in the training set).
# Result: one row per engine with columns ['unit_nr', 'max_cycle'].
max_cycle_per_engine = (
    df_train
    .groupby('unit_nr')['time_cycles']
    .max()
    .rename('max_cycle')
    .reset_index()
)

# Attach max_cycle with a lookup instead of a merge: assigning the column
# overwrites any earlier value, so re-running this cell does not create
# 'max_cycle_x' / 'max_cycle_y' duplicates (which would make the next line fail).
df_train = df_train.assign(
    max_cycle=df_train['unit_nr'].map(
        max_cycle_per_engine.set_index('unit_nr')['max_cycle']
    )
)

# Raw RUL = max_cycle - current cycle index (0 on the engine's last row)
df_train['RUL_raw'] = df_train['max_cycle'] - df_train['time_cycles']

# Quick sanity check
print(df_train[['unit_nr', 'time_cycles', 'max_cycle', 'RUL_raw']].head())


   unit_nr  time_cycles  max_cycle  RUL_raw
0        1            1        192      191
1        1            2        192      190
2        1            3        192      189
3        1            4        192      188
4        1            5        192      187


# Apply capped RUL_target = min(RUL_raw, 125)

In [22]:


# 3. Apply capped RUL_target = min(RUL_raw, 125)
RUL_CAP = 125  # hyperparameter (knee point)

# Cap from the uncapped 'RUL_raw' column whenever it exists, so the cell can be
# re-run with a different RUL_CAP (clipping 'RUL' from itself could never raise
# a previously applied cap). Fall back to the existing 'RUL' column otherwise.
rul_source_col = 'RUL_raw' if 'RUL_raw' in df_train.columns else 'RUL'
df_train['RUL'] = df_train[rul_source_col].clip(upper=RUL_CAP)



In [23]:
# Display the training frame to inspect the capped 'RUL' column
df_train

Unnamed: 0,unit_nr,time_cycles,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,s_13,s_14,s_15,s_17,s_20,s_21,RUL,max
0,1,1,641.82,1589.70,1400.60,554.36,2388.06,9046.19,47.47,521.66,2388.02,8138.62,8.4195,392,39.06,23.4190,125,192
1,1,2,642.15,1591.82,1403.14,553.75,2388.04,9044.07,47.49,522.28,2388.07,8131.49,8.4318,392,39.00,23.4236,125,192
2,1,3,642.35,1587.99,1404.20,554.26,2388.08,9052.94,47.27,522.42,2388.03,8133.23,8.4178,390,38.95,23.3442,125,192
3,1,4,642.35,1582.79,1401.87,554.45,2388.11,9049.48,47.13,522.86,2388.08,8133.83,8.3682,392,38.88,23.3739,125,192
4,1,5,642.37,1582.85,1406.22,554.00,2388.06,9055.15,47.28,522.19,2388.04,8133.80,8.4294,393,38.90,23.4044,125,192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20626,100,196,643.49,1597.98,1428.63,551.43,2388.19,9065.52,48.07,519.49,2388.26,8137.60,8.4956,397,38.49,22.9735,4,200
20627,100,197,643.54,1604.50,1433.58,550.86,2388.23,9065.11,48.04,519.68,2388.22,8136.50,8.5139,395,38.30,23.1594,3,200
20628,100,198,643.42,1602.46,1428.18,550.94,2388.24,9065.90,48.09,520.01,2388.24,8141.05,8.5646,398,38.44,22.9333,2,200
20629,100,199,643.23,1605.26,1426.53,550.68,2388.25,9073.72,48.39,519.67,2388.23,8139.29,8.5389,395,38.29,23.0640,1,200


# Scale features using MinMaxScaler((-1, 1)) 

In [24]:
from sklearn.preprocessing import MinMaxScaler


# 4. Feature scaling to the range [-1, 1]
# ---------------------------------------

# The 14 sensor channels kept after the variance + correlation analysis
sensors_to_keep = [
    's_2', 's_3', 's_4', 's_7', 's_8', 's_9', 's_11',
    's_12', 's_13', 's_14', 's_15', 's_17', 's_20', 's_21',
]

# Only the sensor features are rescaled
cols_normalize = sensors_to_keep

scaler = MinMaxScaler(feature_range=(-1, 1))

# The min/max statistics come from TRAIN only (no leakage from the test set);
# TEST is then mapped with those same statistics.
train_scaled_values = scaler.fit_transform(df_train[cols_normalize])
test_scaled_values = scaler.transform(test_df[cols_normalize])

# Write the scaled values into copies so the unscaled frames stay untouched
train_df_norm = df_train.copy()
test_df_norm = test_df.copy()
train_df_norm[cols_normalize] = train_scaled_values
test_df_norm[cols_normalize] = test_scaled_values


train_scaled_view = train_df_norm[cols_normalize]
print("\n--- NORMALIZATION CHECK ---")
print("Train min (expected ≈ -1):", train_scaled_view.min().min())
print("Train max (expected ≈  1):", train_scaled_view.max().max())

train_df_norm.head()


--- NORMALIZATION CHECK ---
Train min (expected ≈ -1): -1.0
Train max (expected ≈  1): 1.0


Unnamed: 0,unit_nr,time_cycles,s_2,s_3,s_4,s_7,s_8,s_9,s_11,s_12,s_13,s_14,s_15,s_17,s_20,s_21,RUL,max
0,1,1,-0.63253,-0.186396,-0.380486,0.452496,-0.515152,-0.78049,-0.261905,0.266525,-0.588235,-0.600784,-0.272028,-0.333333,0.426357,0.449323,125,192
1,1,2,-0.433735,-0.093961,-0.294733,0.256039,-0.575758,-0.799515,-0.238095,0.530917,-0.441176,-0.674373,-0.177376,-0.333333,0.333333,0.462027,125,192
2,1,3,-0.313253,-0.260955,-0.258947,0.42029,-0.454545,-0.719914,-0.5,0.590618,-0.558824,-0.656414,-0.28511,-0.666667,0.255814,0.242751,125,192
3,1,4,-0.313253,-0.487683,-0.33761,0.481481,-0.363636,-0.750965,-0.666667,0.778252,-0.411765,-0.650222,-0.666795,-0.333333,0.147287,0.324772,125,192
4,1,5,-0.301205,-0.485066,-0.190749,0.336554,-0.515152,-0.700081,-0.488095,0.492537,-0.529412,-0.650532,-0.195845,-0.166667,0.178295,0.409003,125,192


# Generate sliding-window sequences

In [6]:
import numpy as np

def create_sequences(df, seq_length, feature_cols, target_col):
    """Build sliding-window samples, one engine at a time.

    Rows are grouped by the 'unit_nr' column and used in the order in which
    they appear in `df`; windows never span two engines. Each engine with T
    rows contributes T - seq_length + 1 windows, and the target of a window is
    the `target_col` value on the window's last row. Engines with fewer than
    `seq_length` rows contribute nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'unit_nr', `feature_cols` and `target_col`.
        Assumes rows of each engine are already in time order (not sorted
        here) -- TODO confirm for the caller's data.
    seq_length : int
        Number of consecutive rows per window; must be positive.
    feature_cols : list of str
        Columns used as model inputs.
    target_col : str
        Column holding the target value.

    Returns
    -------
    X : numpy.ndarray, shape (N, seq_length, n_features)
    y : numpy.ndarray, shape (N,)
        When no engine is long enough, N is 0 and the arrays keep these
        shapes instead of collapsing to shape (0,).

    Raises
    ------
    ValueError
        If `seq_length` is not positive.
    """
    if seq_length <= 0:
        # A non-positive length would silently yield empty windows paired
        # with wrapped-around targets.
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    X = []
    y = []

    # sort=False keeps engines in order of first appearance and avoids
    # re-scanning the whole frame with a boolean mask for every engine.
    for _, df_engine in df.groupby('unit_nr', sort=False):
        data_x = df_engine[feature_cols].values   # (T, n_features)
        data_y = df_engine[target_col].values     # (T,)

        # Number of full windows; <= 0 for short engines, so the loop is skipped
        num_samples = len(df_engine) - seq_length + 1

        for i in range(num_samples):
            # Window [i, i + seq_length) paired with the target of its last row
            X.append(data_x[i : i + seq_length])
            y.append(data_y[i + seq_length - 1])

    if not X:
        # Keep the (N, W, F) / (N,) layout even when there are no samples
        all_features = df[feature_cols].values
        all_targets = df[target_col].values
        return (
            np.empty((0, seq_length) + all_features.shape[1:], dtype=all_features.dtype),
            np.empty((0,), dtype=all_targets.dtype),
        )

    return np.array(X), np.array(y)


# Only the 14 sensor columns in sensors_to_keep (s_2, s_3, ...) are fed to the
# model; 'unit_nr' and 'time_cycles' are deliberately left out.
sequence_length = 30

print("Generating training sequences...")

# Source frame: train_df_norm (scaled training data); target: the capped 'RUL'
X_train, y_train = create_sequences(
    train_df_norm,
    sequence_length,
    sensors_to_keep,
    'RUL',
)

print("\n--- SEQUENCE GENERATION SUMMARY ---")
print(f"X_train shape (N, W, F): {X_train.shape}")
print(f"y_train shape (N,)    : {y_train.shape}")


Génération des séquences en cours... (Patience)

--- RÉSULTAT DE LA TRANSFORMATION ---
Forme de X_train (Le Cube)  : (17731, 30, 14)
Forme de y_train (La Cible) : (17731,)


In [7]:
def create_test_sequences_last_window(df, seq_length, feature_cols):
    """Extract the last `seq_length` rows of every engine as one window each.

    Rows are grouped by the 'unit_nr' column, in order of first appearance.
    Engines with fewer than `seq_length` rows are skipped and reported with a
    printed message, so the result can hold fewer windows than there are
    engines.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'unit_nr' and `feature_cols`.
        Assumes rows of each engine are already in time order (not sorted
        here) -- TODO confirm for the caller's data.
    seq_length : int
        Number of trailing rows per window; must be positive.
    feature_cols : list of str
        Columns used as model inputs.

    Returns
    -------
    numpy.ndarray, shape (n_kept_engines, seq_length, n_features)
        Keeps this layout (with a leading 0) when every engine is skipped.

    Raises
    ------
    ValueError
        If `seq_length` is not positive.
    """
    if seq_length <= 0:
        # data_x[-0:] would return the whole history rather than an empty window
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    X_test_final = []

    # sort=False keeps engines in order of first appearance and avoids
    # re-scanning the whole frame with a boolean mask for every engine.
    for unit, df_engine in df.groupby('unit_nr', sort=False):
        if len(df_engine) < seq_length:
            # Report engines that are too short to form a full window
            print(f"Engine {unit} skipped (only {len(df_engine)} cycles < {seq_length}).")
            continue

        # Keep only the last seq_length cycles for this engine
        X_test_final.append(df_engine[feature_cols].values[-seq_length:])

    if not X_test_final:
        # Keep the (N, W, F) layout even when nothing was kept
        all_features = df[feature_cols].values
        return np.empty((0, seq_length) + all_features.shape[1:], dtype=all_features.dtype)

    return np.array(X_test_final)



sequence_length = 30

# One window per engine, taken from test_df_norm (the scaled test data)
X_test_prepared = create_test_sequences_last_window(
    test_df_norm,
    sequence_length,
    sensors_to_keep,
)

print(f"Final X_test shape for inference: {X_test_prepared.shape}")


Forme de X_test final pour prédiction : (100, 30, 14)


# Load the ground-truth test RUL and prepare final arrays for model training

In [None]:
import pandas as pd
import numpy as np

# Read the RUL file as a header-less, space-separated table
y_true_df = pd.read_csv(
    r".............RUL_FD001.txt", 
    sep=" ", 
    header=None
)

# Keep only the first column as a 1-D array.
# NOTE(review): with sep=" " any further column is presumably an empty artifact
# of trailing spaces in the file -- confirm. Selecting column 0 by position
# also works when the parser produces no such extra column.
y_true = y_true_df.iloc[:, 0].values
