In [9]:
'''
I. Dropped Categories:
    a.'February' is not directly encoded because of OneHotEncoder(drop='first').
    b. The absence of 'January' and 'March' (i.e., cat__month_January == 0 and cat__month_March == 0) indicates 'February'.
II. Dynamic Interaction Terms:
    a. Interaction terms for 'February' are calculated by detecting rows where all related one-hot columns are 0.
    b. This is handled in the PandasInteractionFeatures.transform method.
III. Preprocessor:
    a. Uses the PandasFeatureExtractor to ensure transformed data retains feature names and is returned as a pandas DataFrame.
IV. Feature Names:
    a. The get_feature_names_out() method of PandasInteractionFeatures dynamically appends names for interaction terms, including those for dropped categories.
V. Pipeline:
    a. Combines all steps (preprocessing, interaction, modeling) into a single, self-contained pipeline.
'''
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

# Sample dataset
data = pd.DataFrame({
    'month': ['January', 'February', 'January', 'March'],
    'campaign_type': ['A', 'B', 'A', 'C'],
    'cost': [500, 300, 700, 200],
    'free_trials': [50, 30, 60, 25]
})

# Define feature columns
categorical_columns = ['month', 'campaign_type']
numerical_columns = ['cost']

# Custom transformer to extract feature names and maintain DataFrame
class PandasFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, transformer):
        self.transformer = transformer
        self.feature_names = None

    def fit(self, X, y=None):
        self.transformer.fit(X, y)
        self.feature_names = self.transformer.get_feature_names_out()
        return self

    def transform(self, X):
        transformed_array = self.transformer.transform(X)
        return pd.DataFrame(transformed_array, columns=self.feature_names, index=X.index)

    def get_feature_names_out(self):
        return self.feature_names

# Custom transformer for interaction terms, handling dropped categories
class PandasInteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_features, interaction_features, dropped_categories=None):
        self.numerical_features = numerical_features  # Names of numerical features
        self.interaction_features = interaction_features  # Names of interaction features
        self.dropped_categories = dropped_categories or {}  # Mapping for dropped categories
        self.feature_names_out_ = None

    def fit(self, X, y=None):
        # This ensures we can reference column names during transformation and later feature naming.
        # fit doesn’t compute anything. It’s a "pass-through" step for pipeline compatibility
        self.feature_names_out_ = X.columns.tolist()
        return self

    def transform(self, X):
        # Create an empty DataFrame with the same index for storing interaction terms.
        # This ensures the final output maintains the same row index as X.
        interaction_features = pd.DataFrame(index=X.index)
        print("interaction_features:", interaction_features)

        # Generate interactions for available features
        for num_feature in self.numerical_features:
            for interaction_feature in self.interaction_features:
                interaction_name = f"{num_feature}_x_{interaction_feature}"
                interaction_features[interaction_name] = X[num_feature] * X[interaction_feature]

        # Handle interactions for dropped categories
        for num_feature in self.numerical_features:
            for category, columns in self.dropped_categories.items():
                interaction_name = f"{num_feature}_x_{category}"
                # Identify rows where all related columns are 0 (indicating the dropped category)
                # Sums up the one-hot encoded values row-wise in order to check for True for rows where the dropped category is present.
                # IE: For any row, if both 'cat__month_January' and 'cat__month_March' are 0,
                #   it implies that the original value was the dropped category ('cat__month_February').
                # rows where sum is 0 means it was month of feburary signaled by True. Rest are False  in a vector
                is_dropped_category = X[columns].sum(axis=1) == 0
                print("is_dropped_category:", is_dropped_category)
                # is_dropped_category.astype(float): Converts the True/False where 1.0 is true and 0.0 is false
                interaction_features[interaction_name] = X[num_feature] * is_dropped_category.astype(float)

        # Combine original and interaction features
        return pd.concat([X, interaction_features], axis=1)

    def get_feature_names_out(self):
        if self.feature_names_out_ is None:
            raise ValueError("You must call fit() before get_feature_names_out().")
        interaction_names = [
            f"{num_feature}_x_{interaction_feature}"
            for num_feature in self.numerical_features
            for interaction_feature in self.interaction_features
        ]
        dropped_names = [
            f"{num_feature}_x_{category}"
            for num_feature in self.numerical_features
            for category in self.dropped_categories.keys()
        ]
        return self.feature_names_out_ + interaction_names + dropped_names

# Preprocessor to handle categorical and numerical columns
preprocessor = PandasFeatureExtractor(
    ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_columns),
            ('num', StandardScaler(), numerical_columns)
        ],
        remainder='passthrough'
    )
)

# Build pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Preprocess features and maintain DataFrame
    ('interaction', PandasInteractionFeatures(
        numerical_features=['num__cost'],  # Use numerical feature names
        interaction_features=['cat__month_January', 'cat__month_March'],  # Interaction feature names
        dropped_categories={'cat__month_February': ['cat__month_January', 'cat__month_March']}  # Handle dropped category
    )),
    ('model', LinearRegression())  # Example model
])

# Features and target
X = data[['month', 'campaign_type', 'cost']]
y = data['free_trials']

# Fit the pipeline
pipeline.fit(X, y)

# Access transformed feature names
final_feature_names = pipeline.named_steps['interaction'].get_feature_names_out()
print("Final Feature Names:", final_feature_names)

# Make predictions
predictions = pipeline.predict(X)
print("Predictions:", predictions)


interaction_features: Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]
is_dropped_category: 0    False
1     True
2    False
3    False
dtype: bool
Final Feature Names: ['cat__month_January', 'cat__month_March', 'cat__campaign_type_B', 'cat__campaign_type_C', 'num__cost', 'num__cost_x_cat__month_January', 'num__cost_x_cat__month_March', 'num__cost_x_cat__month_February']
interaction_features: Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]
is_dropped_category: 0    False
1     True
2    False
3    False
dtype: bool
Predictions: [50. 30. 60. 25.]


In [3]:
transformed_data = pipeline.named_steps['preprocessor'].transform(X)
print(transformed_data.columns)

Index(['cat__month_January', 'cat__month_March', 'cat__campaign_type_B',
       'cat__campaign_type_C', 'num__cost'],
      dtype='object')


In [7]:
'''
Inspect the OneHotEncoder Categories:
'''
encoder = pipeline.named_steps['preprocessor'].transformer.named_transformers_['cat']
print(encoder.categories_)
# Output: [array(['February', 'January', 'March'], dtype=object)]


[array(['February', 'January', 'March'], dtype=object), array(['A', 'B', 'C'], dtype=object)]


In [8]:
'''
Inspect Transformed Feature Names: 'February' gets removed bc it comes frist lexographically
'''
print(pipeline.named_steps['preprocessor'].get_feature_names_out())
# Output: ['cat__month_February', 'cat__month_March', 'num__cost']


['cat__month_January' 'cat__month_March' 'cat__campaign_type_B'
 'cat__campaign_type_C' 'num__cost']
