In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

# Custom transformer to extract feature names
class FeatureNameExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, transformer):
        self.transformer = transformer
        self.feature_names = None

    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        self.feature_names = self.transformer.get_feature_names_out()
        return self

    def transform(self, X):
        return self.transformer.transform(X)

# Custom transformer for interaction terms
class InteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns, interaction_names):
        self.numerical_columns = numerical_columns  # Indices of numerical columns
        self.interaction_names = interaction_names  # Indices of interaction columns
        self.original_feature_names = None  # To store feature names of the input
        self.interaction_feature_names = None  # To store generated interaction feature names

    def fit(self, X, y=None):
        # Store the feature names from the preprocessor if available
        if hasattr(X, 'columns'):
            self.original_feature_names = X.columns.tolist()
        elif isinstance(X, np.ndarray):
            # If it's a NumPy array, use externally provided feature names
            self.original_feature_names = [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X):
        # Generate interaction features
        interaction_features = []
        for num_col in self.numerical_columns:
            for interaction_name in self.interaction_names:
                interaction_features.append(X[:, num_col] * X[:, interaction_name])

        # Combine the original and interaction features
        interaction_features = np.column_stack(interaction_features)
        combined = np.hstack([X, interaction_features])
        return combined

    def get_feature_names_out(self, input_features=None):
        # Combine original feature names and interaction feature names
        if self.original_feature_names is None:
            raise ValueError("fit() must be called before get_feature_names_out().")
        return self.original_feature_names + self._generate_interaction_names()

    def _generate_interaction_names(self):
        # Dynamically create names for the interaction features
        return [
            f"{self.original_feature_names[num_col]}_x_{self.original_feature_names[int_name]}"
            for num_col in self.numerical_columns
            for int_name in self.interaction_names
        ]


# Sample data
data = pd.DataFrame({
    'month': ['January', 'February', 'January', 'March'],
    'campaign_type': ['A', 'B', 'A', 'C'],
    'cost': [500, 300, 700, 200],
    'free_trials': [50, 30, 60, 25]
})

categorical_columns = ['month', 'campaign_type']
numerical_columns = ['cost']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),
        ('num', StandardScaler(), numerical_columns)
    ],
    remainder='passthrough'
)

# Wrap ColumnTransformer with FeatureNameExtractor
preprocessor = FeatureNameExtractor(preprocessor)

# Pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('interaction', InteractionFeatures(
        numerical_columns=[2],  # Index of 'cost' in transformed data
        interaction_names=[0, 1]  # Indices of 'month_January' and 'campaign_type_A'
    ))
])

# Fit the preprocessor separately to get feature names
preprocessor.fit(data[['month', 'campaign_type', 'cost']])
feature_names = preprocessor.feature_names

# Assign feature names to the interaction step
pipeline.named_steps['interaction'].original_feature_names = feature_names

# Fit pipeline
pipeline.fit(data[['month', 'campaign_type', 'cost']], data['free_trials'])

# Access feature names
feature_names = pipeline.named_steps['interaction'].get_feature_names_out()
print("Feature Names:", feature_names)

# Transform data
transformed_data = pipeline.transform(data[['month', 'campaign_type', 'cost']])
print("Transformed Data:\n", transformed_data)


Feature Names: ['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_2_x_feature_0', 'feature_2_x_feature_1']
Transformed Data:
 [[ 1.          0.          0.          0.          0.39056673  0.
   0.        ]
 [ 0.          0.          1.          0.         -0.65094455  0.
   0.        ]
 [ 1.          0.          0.          0.          1.43207802  0.
   0.        ]
 [ 0.          1.          0.          1.         -1.1717002   0.
   0.        ]]
