In [7]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler


# 1. Load the dataset
# Reads 'DeC4_data.csv' (path is relative to the current working directory).
# The resulting frame is passed to add_scaled_features below, so it must hold
# the raw process columns that function reads.
data_df = pd.read_csv('DeC4_data.csv')

def add_scaled_features(df, scaler=None, fit=True):
    """
    Create engineered process features and min-max scale them.

    Seven derived columns are computed from the raw process variables and then
    passed through a scaler. Only the derived columns are scaled; the raw input
    columns are returned unchanged. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Input DataFrame. It must contain the columns:
        - 'top Pressure'
        - 'top Temperature'
        - '6th tray Temperature'
        - 'bottom Temperature 1'
        - 'bottom Temperature 2'
        - 'reflux flow'
        - 'flow to next process'
    scaler : object, optional
        Scaler exposing ``fit_transform`` and ``transform`` (for example a
        ``MinMaxScaler``). When omitted, a fresh ``MinMaxScaler()`` is created
        and fitted on `df`, which is the original behaviour of this function.
        Pass your own instance to keep a handle on the fitted scaler.
    fit : bool, default True
        If True the scaler is fitted on `df` (``fit_transform``). If False the
        supplied, already fitted `scaler` is only applied (``transform``), so
        scaling parameters learned on training data can be reused on
        validation / test / inference data without leakage.

    Returns
    -------
    tuple
        (df_scaled, cols_to_scale)
        - **df_scaled** : pandas.DataFrame
          Copy of `df` with the engineered, scaled columns appended.
        - **cols_to_scale** : list of str
          Names of the engineered columns that went through the scaler.

    Raises
    ------
    KeyError
        If any required input column is missing.
    ValueError
        If ``fit=False`` is requested without supplying a `scaler`.

    Notes
    -----
    - Each denominator is offset by +1. This avoids a division by zero when the
      raw value is 0, but a raw value of exactly -1 still yields a zero
      denominator.
    - Generated columns (all scaled):
        * `PT` - top pressure * top temperature
        * `dT` - top temperature / (6th tray temperature + 1)
        * `dT2` - top temperature / (bottom temperature 1 + 1)
        * `Refux Ratio` - reflux flow / (flow to next process + 1)
        * `Log_top_T` - log1p(1 + top temperature), i.e. ln(2 + top temperature)
        * `reflux_times_Ttray6` - log1p(`Refux Ratio`) * 6th tray temperature,
          using the unscaled ratio
        * `T_all_avg` - mean of the four temperature measurements
    """
    required_cols = [
        'top Pressure', 'top Temperature', '6th tray Temperature',
        'bottom Temperature 1', 'bottom Temperature 2',
        'reflux flow', 'flow to next process',
    ]
    # Fail early with the complete list instead of on the first missing lookup.
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise KeyError(f"add_scaled_features: missing required columns {missing}")

    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted `scaler`")
        scaler = MinMaxScaler()

    df = df.copy()

    # Create new features
    df['PT'] = df['top Pressure'] * df['top Temperature']
    df['dT'] = df['top Temperature'] / (df['6th tray Temperature'] + 1)
    df['dT2'] = df['top Temperature'] / (df['bottom Temperature 1'] + 1)
    df['Refux Ratio'] = df['reflux flow'] / (df['flow to next process'] + 1)
    # NOTE(review): log1p already adds 1, so this is ln(2 + T). The extra shift
    # is kept so values stay identical to previously generated datasets;
    # confirm whether ln(1 + T) was intended.
    df['Log_top_T'] = np.log1p(1 + df['top Temperature'])
    # Uses the raw (not yet scaled) ratio computed two lines above.
    df['reflux_times_Ttray6'] = np.log1p(df['Refux Ratio']) * df['6th tray Temperature']
    df['T_all_avg'] = (df['top Temperature'] + df['6th tray Temperature'] +
                       df['bottom Temperature 1'] + df['bottom Temperature 2']) / 4

    # Columns to scale
    cols_to_scale = ['PT', 'dT', 'dT2', 'Refux Ratio', 'Log_top_T',
                     'reflux_times_Ttray6', 'T_all_avg']

    # Fit + apply, or apply only when reusing a scaler fitted elsewhere.
    if fit:
        df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
    else:
        df[cols_to_scale] = scaler.transform(df[cols_to_scale])

    return df, cols_to_scale

# 2. Add the engineered, scaled feature columns.
# `create_columns` receives the list of newly created column names.
data_df, create_columns = add_scaled_features(df=data_df)


In [8]:
# Inspect the column names: the raw columns followed by the engineered features.
data_df.columns

Index(['top Temperature', 'top Pressure', 'reflux flow',
       'flow to next process', '6th tray Temperature', 'bottom Temperature 1',
       'bottom Temperature 2', 'C4 content', 'PT', 'dT', 'dT2', 'Refux Ratio',
       'Log_top_T', 'reflux_times_Ttray6', 'T_all_avg'],
      dtype='object')

In [9]:
def make_lagged_df(df, lag, columns_to_drop, target_col):
    """
    Create a lagged version of a DataFrame for time series or sequential modeling.

    Unwanted columns are removed, every remaining feature column is shifted by
    `lag` rows, and the un-shifted target column is appended so that each row
    pairs past feature values with the current target value.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame containing time-series or sequential data. It is
        not modified.
    lag : int
        Number of rows to shift the feature columns by. If 0, no shifting or
        renaming takes place and a copy of `df` without `columns_to_drop` is
        returned.
    columns_to_drop : list of str
        Column names to remove before lagging (e.g., identifiers or
        non-numeric columns). Applied for every value of `lag`, including 0.
    target_col : str
        Name of the target column to keep un-shifted in the result.

    Returns
    -------
    pandas.DataFrame
        For ``lag != 0``: lagged feature columns (suffixed with `_lag{lag}`)
        followed by the original target column. Every row containing a NaN is
        dropped; this removes the rows emptied by the shift and also any row
        that already had missing values.
        For ``lag == 0``: a copy of `df` without `columns_to_drop`; no rows
        are dropped.

    Raises
    ------
    KeyError
        If a name in `columns_to_drop` is not a column of `df`, or (for
        ``lag != 0``) if `target_col` is not available after the drop.

    Notes
    -----
    - The target column is **not lagged**; it stays aligned with the current row.
    - Use lag > 0 to relate past feature values to the current target.

    Example
    -------
    df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [10, 20, 30, 40], 'y': [0, 1, 0, 1]})
    make_lagged_df(df, lag=1, columns_to_drop=[], target_col='y')
       A_lag1  B_lag1  y
    1     1.0    10.0  1
    2     2.0    20.0  0
    3     3.0    30.0  1
    """
    # Drop first so that lag == 0 and lag != 0 expose the same set of columns.
    kept = df.drop(columns=columns_to_drop)
    if lag == 0:
        return kept.copy()

    if target_col not in kept.columns:
        raise KeyError(
            f"target column {target_col!r} not found after dropping {list(columns_to_drop)}"
        )

    # Shift only the feature columns; the target is re-attached un-shifted.
    lagged = kept.drop(columns=[target_col]).shift(lag).add_suffix(f"_lag{lag}")
    lagged[target_col] = kept[target_col]
    return lagged.dropna()
# 3. Create the lagged dataset: features shifted by 13 rows, 'C4 content' kept as target.
# NOTE(review): 13 is presumably the lag chosen by an earlier search - confirm.
init_lag = make_lagged_df(
    df=data_df,
    lag=13,
    columns_to_drop=[],
    target_col='C4 content',
)

In [10]:
# Preview the lagged dataset. The first retained row has index 13 because the
# 13 leading rows hold NaN after the shift and are removed by dropna().
init_lag.head()

Unnamed: 0,top Temperature_lag13,top Pressure_lag13,reflux flow_lag13,flow to next process_lag13,6th tray Temperature_lag13,bottom Temperature 1_lag13,bottom Temperature 2_lag13,PT_lag13,dT_lag13,dT2_lag13,Refux Ratio_lag13,Log_top_T_lag13,reflux_times_Ttray6_lag13,T_all_avg_lag13,C4 content
13,0.2689,0.650894,0.832742,0.58342,0.784759,0.843079,0.822079,0.307679,0.26738,0.215891,0.697775,0.31112,0.776996,0.750634,0.157039
14,0.268483,0.65014,0.852153,0.57751,0.776487,0.838605,0.822079,0.306845,0.268208,0.21608,0.716716,0.310665,0.785747,0.74659,0.159158
15,0.267967,0.659657,0.823618,0.5716,0.764546,0.807879,0.786246,0.310738,0.269504,0.21933,0.695321,0.310104,0.75481,0.722315,0.163836
16,0.267451,0.668338,0.808371,0.565689,0.752605,0.799606,0.786246,0.314222,0.270818,0.219915,0.685025,0.309543,0.73402,0.715947,0.167007
17,0.266935,0.647191,0.761948,0.559779,0.745326,0.773122,0.746142,0.303693,0.271423,0.222769,0.648133,0.308982,0.694601,0.693095,0.172509


In [None]:
# 4. Save the lagged dataset to CSV.
# index=False leaves the row index out of the file.
init_lag.to_csv('df_lagged.csv', index=False)