In [6]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor

# --------------------------
# CUSTOM TRANSFORMERS
# --------------------------
class IQROutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor
        self.lower_bound = None
        self.upper_bound = None
        self.mask_ = None
        
    def fit(self, X, y=None):
        q1 = np.percentile(y, 25)
        q3 = np.percentile(y, 75)
        iqr = q3 - q1
        self.lower_bound = q1 - self.factor * iqr
        self.upper_bound = q3 + self.factor * iqr
        self.mask_ = (y >= self.lower_bound) & (y <= self.upper_bound)
        return self
    
    def transform(self, X):
        return X[self.mask_]
    
    def filter_y(self, y):
        return y[self.mask_]

class CategoricalConverter(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for col in self.cat_cols:
            if col in X.columns:
                X[col] = X[col].astype(str)
        return X

# --------------------------
# DATA PREPARATION
# --------------------------
df = pd.read_excel('Raw_Data_v0.xlsx', engine='openpyxl')
df = df.drop(columns=[
    'Ref#', 'Heat treatment', 'Other RM/Rivet/part cost (€/Part)',
    'Gross Weight (g)', 'Other assembled RM/Rivet/part',
])

num_cols = [
    'Annual target quantity',
    'Raw Material Cost (€/kg)',
    'Thickness (mm)',
    'Part Net Weight (g)',
    'Surface Treatment cost (€/Part)',
    'Final Raw Material cost (€/Part)',
    'Heat Treatment cost (€/Part)'
]

cat_cols = [
    'Production',
    'Raw Material Designation',
    'Surface Treatment', 
    'Raw Material'
]

df[num_cols] = df[num_cols].fillna(0)
df[cat_cols] = df[cat_cols].fillna('Missing')

TARGET = 'Total cost with amortization (€/part)'

# --------------------------
# METRICS & PIPELINE HELPERS
# --------------------------
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    within_10 = np.mean(np.abs((y_true - y_pred)/y_true) <= 0.1) * 100
    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 
            'R2': r2, 'Within_10%': within_10}

def df_converter_func(x):
    return pd.DataFrame(x, columns=num_cols + cat_cols)

# --------------------------
# PREPROCESSING STRATEGIES
# --------------------------
base_preproc = Pipeline([
    ('cat_converter', CategoricalConverter(cat_cols)),
    ('column_transformer', ColumnTransformer(
        [('num', RobustScaler(), num_cols)],
        remainder='passthrough',
        verbose_feature_names_out=False
    )),
    ('df_converter', FunctionTransformer(df_converter_func))
])

strategies = {
    'Strategy 1 (IQR+Robust)': {
        'preprocess': Pipeline([
            ('outlier_removal', IQROutlierRemover()),
            ('base_preproc', base_preproc)
        ]),
        'target_transform': None
    },
    'Strategy 2 (IQR+Robust+LogY)': {
        'preprocess': Pipeline([
            ('outlier_removal', IQROutlierRemover()),
            ('base_preproc', base_preproc)
        ]),
        'target_transform': np.log
    },
    'Strategy 3 (LogAll)': {
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('log', FunctionTransformer(np.log), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 4 (LogY+RobustX)': {
        'preprocess': base_preproc,
        'target_transform': np.log
    },
    'Strategy 5 (SqrtAll)': {
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('sqrt', FunctionTransformer(np.sqrt), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    },
    'Strategy 6 (SqrtY+RobustX)': {
        'preprocess': base_preproc,
        'target_transform': np.sqrt
    },
    'Strategy 7 (IQR+Robust+SqrtY)': {
        'preprocess': Pipeline([
            ('outlier_removal', IQROutlierRemover()),
            ('base_preproc', base_preproc)
        ]),
        'target_transform': np.sqrt
    }
}

# --------------------------
# MAIN EXECUTION
# --------------------------
results = []
cat_indices = [i for i, col in enumerate(num_cols + cat_cols) if col in cat_cols]

model = CatBoostRegressor(
    iterations=200,
    learning_rate=0.05,
    depth=4,
    l2_leaf_reg=3,
    cat_features=cat_indices,
    early_stopping_rounds=20,
    verbose=False
)

kf = KFold(n_splits=3, shuffle=True, random_state=42)

for strategy_name, strategy in strategies.items():
    print(f"\n{strategy_name}:")
    fold_metrics = []
    
    for train_idx, test_idx in kf.split(df):
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        
        X_train = train_df.drop(columns=[TARGET])
        y_train = train_df[TARGET].values
        
        # Train-time preprocessing (with potential outlier removal)
        if 'outlier_removal' in strategy['preprocess'].named_steps:
            # Fit and transform training data
            outlier_remover = strategy['preprocess'].named_steps['outlier_removal']
            X_train_filtered = outlier_remover.fit_transform(X_train, y_train)
            y_train_filtered = outlier_remover.filter_y(y_train)
            
            # Apply remaining preprocessing
            base_pipe = strategy['preprocess'].named_steps['base_preproc']
            X_train_processed = base_pipe.fit_transform(X_train_filtered)
        else:
            X_train_processed = strategy['preprocess'].fit_transform(X_train)
            y_train_filtered = y_train
        
        # Model setup
        model_pipe = TransformedTargetRegressor(
            regressor=model,
            func=strategy['target_transform'],
            inverse_func=np.exp if strategy['target_transform'] == np.log else (
                np.square if strategy['target_transform'] == np.sqrt else lambda x: x)
        )
        
        # Training
        model_pipe.fit(X_train_processed, y_train_filtered)
        
        # Test-time preprocessing (without outlier removal)
        if 'base_preproc' in strategy['preprocess'].named_steps:
            test_preprocessor = strategy['preprocess'].named_steps['base_preproc']
        else:
            test_preprocessor = strategy['preprocess']
        
        X_test = test_df.drop(columns=[TARGET])
        X_test_processed = test_preprocessor.transform(X_test)
        y_test = test_df[TARGET].values
        
        # Prediction and evaluation
        y_pred = model_pipe.predict(X_test_processed)
        fold_metrics.append(calculate_metrics(y_test, y_pred))
    
    # Aggregate results
    metrics_df = pd.DataFrame(fold_metrics)
    results.append({
        'Strategy': strategy_name,
        'MAE': f"{metrics_df['MAE'].mean():.2f} ± {metrics_df['MAE'].std():.2f}",
        'RMSE': f"{metrics_df['RMSE'].mean():.2f} ± {metrics_df['RMSE'].std():.2f}",
        'MAPE': f"{metrics_df['MAPE'].mean():.1f}% ± {metrics_df['MAPE'].std():.1f}",
        'R²': f"{metrics_df['R2'].mean():.3f} ± {metrics_df['R2'].std():.3f}",
        'Within ±10%': f"{metrics_df['Within_10%'].mean():.1f}% ± {metrics_df['Within_10%'].std():.1f}"
    })

# --------------------------
# FINAL RESULTS
# --------------------------
results_df = pd.DataFrame(results).set_index('Strategy')
print("\n" + "="*60)
print("Final Comparison Results:")
print("="*60)
display(results_df)



Strategy 1 (IQR+Robust):

Strategy 2 (IQR+Robust+LogY):

Strategy 3 (LogAll):


  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)



Strategy 4 (LogY+RobustX):

Strategy 5 (SqrtAll):

Strategy 6 (SqrtY+RobustX):

Strategy 7 (IQR+Robust+SqrtY):

Final Comparison Results:


Unnamed: 0_level_0,MAE,RMSE,MAPE,R²,Within ±10%
Strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Strategy 1 (IQR+Robust),0.21 ± 0.08,0.64 ± 0.21,61.2% ± 17.0,0.079 ± 0.077,18.9% ± 3.3
Strategy 2 (IQR+Robust+LogY),0.21 ± 0.08,0.64 ± 0.20,40.0% ± 5.8,0.072 ± 0.061,23.9% ± 7.6
Strategy 3 (LogAll),0.11 ± 0.05,0.31 ± 0.16,38.1% ± 10.8,0.786 ± 0.116,23.9% ± 3.9
Strategy 4 (LogY+RobustX),0.11 ± 0.05,0.32 ± 0.16,38.4% ± 10.8,0.785 ± 0.116,23.9% ± 3.9
Strategy 5 (SqrtAll),0.09 ± 0.04,0.25 ± 0.12,50.7% ± 19.1,0.862 ± 0.069,21.4% ± 8.9
Strategy 6 (SqrtY+RobustX),0.09 ± 0.04,0.25 ± 0.12,50.6% ± 19.2,0.862 ± 0.069,22.0% ± 9.7
Strategy 7 (IQR+Robust+SqrtY),0.21 ± 0.08,0.64 ± 0.21,45.9% ± 10.6,0.071 ± 0.077,22.0% ± 2.9


In [8]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.utils.validation import check_array

# --------------------------
# TRANSFORMERS
# --------------------------
class IQROutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor
        self.lower_bound = None
        self.upper_bound = None
        self.mask_ = None

    def fit(self, X, y=None):
        q1 = np.percentile(y, 25)
        q3 = np.percentile(y, 75)
        iqr = q3 - q1
        self.lower_bound = q1 - self.factor * iqr
        self.upper_bound = q3 + self.factor * iqr
        self.mask_ = (y >= self.lower_bound) & (y <= self.upper_bound)
        return self

    def transform(self, X):
        return X[self.mask_]

    def filter_y(self, y):
        return y[self.mask_]

class CategoricalConverter(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        for col in self.cat_cols:
            if col in X.columns:
                X[col] = X[col].astype(str)
        return X

# Safe log and sqrt for features
def safe_log(x):
    x = check_array(x, force_all_finite=False)
    return np.log(np.where(x <= 0, 1e-8, x))

def safe_sqrt(x):
    x = check_array(x, force_all_finite=False)
    return np.sqrt(np.where(x < 0, 0, x))

def df_converter_func(x):
    return pd.DataFrame(x, columns=num_cols + cat_cols)

# --------------------------
# DATA PREPARATION
# --------------------------
df = pd.read_excel('Raw_Data_v0.xlsx', engine='openpyxl')
df = df.drop(columns=[
    'Ref#', 'Heat treatment', 'Other RM/Rivet/part cost (€/Part)',
    'Gross Weight (g)', 'Other assembled RM/Rivet/part',
])

num_cols = [
    'Annual target quantity',
    'Raw Material Cost (€/kg)',
    'Thickness (mm)',
    'Part Net Weight (g)',
    'Surface Treatment cost (€/Part)',
    'Final Raw Material cost (€/Part)',
    'Heat Treatment cost (€/Part)'
]

cat_cols = [
    'Production',
    'Raw Material Designation',
    'Surface Treatment', 
    'Raw Material'
]

df[num_cols] = df[num_cols].fillna(0)
df[cat_cols] = df[cat_cols].fillna('Missing')

TARGET = 'Total cost with amortization (€/part)'

# --------------------------
# METRICS
# --------------------------
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    within_10 = np.mean(np.abs((y_true - y_pred)/y_true) <= 0.1) * 100
    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 
            'R2': r2, 'Within_10%': within_10}

# --------------------------
# STRATEGIES
# --------------------------
strategies = {
    'Strategy 1 (IQR+Robust)': {
        'has_outlier_removal': True,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': None
    },
    'Strategy 2 (IQR+Robust+LogY)': {
        'has_outlier_removal': True,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 3 (LogAll)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('log', FunctionTransformer(safe_log), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 4 (LogY+RobustX)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 5 (SqrtAll)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('sqrt', FunctionTransformer(safe_sqrt), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    },
    'Strategy 6 (SqrtY+RobustX)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    },
    'Strategy 7 (IQR+Robust+SqrtY)': {
        'has_outlier_removal': True,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    }
}

# --------------------------
# MAIN EXECUTION
# --------------------------
results = []
cat_indices = [i for i, col in enumerate(num_cols + cat_cols) if col in cat_cols]

model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=1,
    cat_features=cat_indices,
    early_stopping_rounds=20,
    verbose=False
)

kf = KFold(n_splits=3, shuffle=True, random_state=42)
outlier_remover = IQROutlierRemover()

for strategy_name, strategy in strategies.items():
    print(f"\n{strategy_name}:")
    fold_metrics = []
    
    for train_idx, test_idx in kf.split(df):
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        
        X_train = train_df.drop(columns=[TARGET])
        y_train = train_df[TARGET].values
        
        # Apply outlier removal if needed
        if strategy['has_outlier_removal']:
            X_train_filtered = outlier_remover.fit_transform(X_train, y_train)
            y_train_filtered = outlier_remover.filter_y(y_train)
        else:
            X_train_filtered = X_train
            y_train_filtered = y_train
        
        # Fit and transform training data
        X_train_processed = strategy['preprocess'].fit_transform(X_train_filtered)
        
        # Model setup
        model_pipe = TransformedTargetRegressor(
            regressor=model,
            func=strategy['target_transform'],
            inverse_func=np.exp if strategy['target_transform'] == np.log else (
                np.square if strategy['target_transform'] == np.sqrt else lambda x: x)
        )
        
        # Training
        model_pipe.fit(X_train_processed, y_train_filtered)
        
        # Test-time preprocessing (no outlier removal)
        X_test = test_df.drop(columns=[TARGET])
        X_test_processed = strategy['preprocess'].transform(X_test)
        y_test = test_df[TARGET].values
        
        # Prediction and evaluation
        y_pred = model_pipe.predict(X_test_processed)
        fold_metrics.append(calculate_metrics(y_test, y_pred))
    
    # Aggregate results
    metrics_df = pd.DataFrame(fold_metrics)
    results.append({
        'Strategy': strategy_name,
        'MAE': f"{metrics_df['MAE'].mean():.2f} ± {metrics_df['MAE'].std():.2f}",
        'RMSE': f"{metrics_df['RMSE'].mean():.2f} ± {metrics_df['RMSE'].std():.2f}",
        'MAPE': f"{metrics_df['MAPE'].mean():.1f}% ± {metrics_df['MAPE'].std():.1f}",
        'R²': f"{metrics_df['R2'].mean():.3f} ± {metrics_df['R2'].std():.3f}",
        'Within ±10%': f"{metrics_df['Within_10%'].mean():.1f}% ± {metrics_df['Within_10%'].std():.1f}"
    })

# --------------------------
# FINAL RESULTS
# --------------------------
results_df = pd.DataFrame(results).set_index('Strategy')
print("\n" + "="*60)
print("Final Comparison Results:")
print("="*60)
display(results_df)



Strategy 1 (IQR+Robust):

Strategy 2 (IQR+Robust+LogY):

Strategy 3 (LogAll):

Strategy 4 (LogY+RobustX):

Strategy 5 (SqrtAll):

Strategy 6 (SqrtY+RobustX):

Strategy 7 (IQR+Robust+SqrtY):

Final Comparison Results:


Unnamed: 0_level_0,MAE,RMSE,MAPE,R²,Within ±10%
Strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Strategy 1 (IQR+Robust),0.22 ± 0.08,0.65 ± 0.20,61.3% ± 25.0,0.048 ± 0.056,22.0% ± 6.6
Strategy 2 (IQR+Robust+LogY),0.22 ± 0.08,0.66 ± 0.20,42.8% ± 5.6,0.021 ± 0.040,24.5% ± 5.7
Strategy 3 (LogAll),0.12 ± 0.06,0.38 ± 0.22,40.0% ± 10.7,0.680 ± 0.211,22.0% ± 5.8
Strategy 4 (LogY+RobustX),0.12 ± 0.06,0.38 ± 0.22,39.9% ± 10.6,0.680 ± 0.211,22.0% ± 6.6
Strategy 5 (SqrtAll),0.10 ± 0.04,0.27 ± 0.11,42.8% ± 12.0,0.845 ± 0.055,25.8% ± 5.4
Strategy 6 (SqrtY+RobustX),0.10 ± 0.04,0.27 ± 0.11,43.1% ± 12.0,0.844 ± 0.056,26.4% ± 6.5
Strategy 7 (IQR+Robust+SqrtY),0.22 ± 0.08,0.65 ± 0.20,45.2% ± 6.9,0.035 ± 0.045,25.8% ± 8.5


In [9]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from sklearn.utils.validation import check_array

# --------------------------
# TRANSFORMERS
# --------------------------
class IQROutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor
        self.lower_bound = None
        self.upper_bound = None
        self.mask_ = None

    def fit(self, X, y=None):
        q1 = np.percentile(y, 25)
        q3 = np.percentile(y, 75)
        iqr = q3 - q1
        self.lower_bound = q1 - self.factor * iqr
        self.upper_bound = q3 + self.factor * iqr
        self.mask_ = (y >= self.lower_bound) & (y <= self.upper_bound)
        return self

    def transform(self, X):
        return X[self.mask_]

    def filter_y(self, y):
        return y[self.mask_]

class CategoricalConverter(BaseEstimator, TransformerMixin):
    def __init__(self, cat_cols):
        self.cat_cols = cat_cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        for col in self.cat_cols:
            if col in X.columns:
                X[col] = X[col].astype(str)
        return X

# Safe log and sqrt for features
def safe_log(x):
    x = check_array(x, force_all_finite=False)
    return np.log(np.where(x <= 0, 1e-8, x))

def safe_sqrt(x):
    x = check_array(x, force_all_finite=False)
    return np.sqrt(np.where(x < 0, 0, x))

def df_converter_func(x):
    return pd.DataFrame(x, columns=num_cols + cat_cols)

# --------------------------
# DATA PREPARATION
# --------------------------
df = pd.read_excel('Raw_Data_v0.xlsx', engine='openpyxl')
df = df.drop(columns=[
    'Ref#', 'Heat treatment', 'Other RM/Rivet/part cost (€/Part)',
    'Gross Weight (g)', 'Other assembled RM/Rivet/part',
])

num_cols = [
    'Annual target quantity',
    'Raw Material Cost (€/kg)',
    'Thickness (mm)',
    'Part Net Weight (g)',
    'Surface Treatment cost (€/Part)',
    'Final Raw Material cost (€/Part)',
    'Heat Treatment cost (€/Part)'
]

cat_cols = [
    'Production',
    'Raw Material Designation',
    'Surface Treatment', 
    'Raw Material'
]

df[num_cols] = df[num_cols].fillna(0)
df[cat_cols] = df[cat_cols].fillna('Missing')

TARGET = 'Total cost with amortization (€/part)'

# --------------------------
# METRICS
# --------------------------
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    r2 = r2_score(y_true, y_pred)
    within_10 = np.mean(np.abs((y_true - y_pred)/y_true) <= 0.1) * 100
    return {'MAE': mae, 'RMSE': rmse, 'MAPE': mape, 
            'R2': r2, 'Within_10%': within_10}

# --------------------------
# STRATEGIES
# --------------------------
strategies = {
    'Strategy 1 (IQR+Robust)': {
        'has_outlier_removal': True,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': None
    },
    'Strategy 2 (IQR+Robust+LogY)': {
        'has_outlier_removal': True,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 3 (LogAll)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('log', FunctionTransformer(safe_log), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 4 (LogY+RobustX)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.log
    },
    'Strategy 5 (SqrtAll)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('sqrt', FunctionTransformer(safe_sqrt), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    },
    'Strategy 6 (SqrtY+RobustX)': {
        'has_outlier_removal': False,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    },
    'Strategy 7 (IQR+Robust+SqrtY)': {
        'has_outlier_removal': True,
        'preprocess': Pipeline([
            ('cat_converter', CategoricalConverter(cat_cols)),
            ('column_transformer', ColumnTransformer(
                [('num', RobustScaler(), num_cols)],
                remainder='passthrough',
                verbose_feature_names_out=False
            )),
            ('df_converter', FunctionTransformer(df_converter_func))
        ]),
        'target_transform': np.sqrt
    }
}

# --------------------------
# MAIN EXECUTION
# --------------------------
results = []
cat_indices = [i for i, col in enumerate(num_cols + cat_cols) if col in cat_cols]

model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    l2_leaf_reg=1,
    cat_features=cat_indices,
    early_stopping_rounds=20,
    verbose=False
)

kf = KFold(n_splits=3, shuffle=True, random_state=42)
outlier_remover = IQROutlierRemover()

for strategy_name, strategy in strategies.items():
    print(f"\n{strategy_name}:")
    fold_metrics = []
    
    for train_idx, test_idx in kf.split(df):
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        
        X_train = train_df.drop(columns=[TARGET])
        y_train = train_df[TARGET].values
        
        # Apply outlier removal if needed
        if strategy['has_outlier_removal']:
            X_train_filtered = outlier_remover.fit_transform(X_train, y_train)
            y_train_filtered = outlier_remover.filter_y(y_train)
        else:
            X_train_filtered = X_train
            y_train_filtered = y_train
        
        # Fit and transform training data
        X_train_processed = strategy['preprocess'].fit_transform(X_train_filtered)
        
        # Model setup
        model_pipe = TransformedTargetRegressor(
            regressor=model,
            func=strategy['target_transform'],
            inverse_func=np.exp if strategy['target_transform'] == np.log else (
                np.square if strategy['target_transform'] == np.sqrt else lambda x: x)
        )
        
        # Training
        model_pipe.fit(X_train_processed, y_train_filtered)
        
        # Test-time preprocessing (no outlier removal)
        X_test = test_df.drop(columns=[TARGET])
        X_test_processed = strategy['preprocess'].transform(X_test)
        y_test = test_df[TARGET].values
        
        # Prediction and evaluation
        y_pred = model_pipe.predict(X_test_processed)
        fold_metrics.append(calculate_metrics(y_test, y_pred))
    
    # Aggregate results
    metrics_df = pd.DataFrame(fold_metrics)
    results.append({
        'Strategy': strategy_name,
        'MAE': f"{metrics_df['MAE'].mean():.2f} ± {metrics_df['MAE'].std():.2f}",
        'RMSE': f"{metrics_df['RMSE'].mean():.2f} ± {metrics_df['RMSE'].std():.2f}",
        'MAPE': f"{metrics_df['MAPE'].mean():.1f}% ± {metrics_df['MAPE'].std():.1f}",
        'R²': f"{metrics_df['R2'].mean():.3f} ± {metrics_df['R2'].std():.3f}",
        'Within ±10%': f"{metrics_df['Within_10%'].mean():.1f}% ± {metrics_df['Within_10%'].std():.1f}"
    })

# --------------------------
# FINAL RESULTS
# --------------------------
results_df = pd.DataFrame(results).set_index('Strategy')
print("\n" + "="*60)
print("Final Comparison Results:")
print("="*60)
display(results_df)



Strategy 1 (IQR+Robust):

Strategy 2 (IQR+Robust+LogY):

Strategy 3 (LogAll):

Strategy 4 (LogY+RobustX):

Strategy 5 (SqrtAll):

Strategy 6 (SqrtY+RobustX):

Strategy 7 (IQR+Robust+SqrtY):

Final Comparison Results:


Unnamed: 0_level_0,MAE,RMSE,MAPE,R²,Within ±10%
Strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Strategy 1 (IQR+Robust),0.22 ± 0.08,0.65 ± 0.20,61.3% ± 25.0,0.048 ± 0.056,22.0% ± 6.6
Strategy 2 (IQR+Robust+LogY),0.22 ± 0.08,0.66 ± 0.20,42.8% ± 5.6,0.021 ± 0.040,24.5% ± 5.7
Strategy 3 (LogAll),0.12 ± 0.06,0.38 ± 0.22,40.0% ± 10.7,0.680 ± 0.211,22.0% ± 5.8
Strategy 4 (LogY+RobustX),0.12 ± 0.06,0.38 ± 0.22,39.9% ± 10.6,0.680 ± 0.211,22.0% ± 6.6
Strategy 5 (SqrtAll),0.10 ± 0.04,0.27 ± 0.11,42.8% ± 12.0,0.845 ± 0.055,25.8% ± 5.4
Strategy 6 (SqrtY+RobustX),0.10 ± 0.04,0.27 ± 0.11,43.1% ± 12.0,0.844 ± 0.056,26.4% ± 6.5
Strategy 7 (IQR+Robust+SqrtY),0.22 ± 0.08,0.65 ± 0.20,45.2% ± 6.9,0.035 ± 0.045,25.8% ± 8.5
