In [21]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

# Sample dataset
data = pd.DataFrame({
    'month': ['January', 'February', 'January', 'March'],
    'campaign_type': ['A', 'B', 'A', 'C'],
    'cost': [500, 300, 700, 200],
    'free_trials': [50, 30, 60, 25]
})

# Define feature columns
categorical_columns = ['month', 'campaign_type']
numerical_columns = ['cost']

# Custom transformer for interaction terms
class InteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns, interaction_columns):
        self.numerical_columns = numerical_columns  # Numerical columns
        self.interaction_columns = interaction_columns  # One-hot encoded column names
        self.feature_names = None

    def fit(self, X, y=None):
        # print("where are you?")
        return self

    def transform(self, X):
        # print("confirm Transforming???")
        # Convert NumPy array to DataFrame using saved column names
        print("X:", X)
        X_df = pd.DataFrame(X, columns=self.feature_names)
        print("X_df:", X_df)
        print("self.numerical_columns:", self.numerical_columns)
        print("interaction_columns:", self.interaction_columns)
        print("self.feature_names", self.feature_names)
        # Create interaction terms
        interaction_features = []
        for num_col in self.numerical_columns:
            for int_col in self.interaction_columns:
                if num_col in X_df.columns and int_col in X_df.columns:
                    interaction_features.append(X_df[num_col] * X_df[int_col])
                else:
                    raise KeyError(f"Column '{int_col}' not found in transformed features.")

        # Combine the original and interaction features
        interaction_features = np.column_stack(interaction_features)
        return np.hstack([X, interaction_features])

# Preprocessor to handle categorical and numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),  # One-hot encode categories
        ('num', StandardScaler(), numerical_columns)  # Scale numerical columns
    ],
    remainder='passthrough'  # Keep other columns as-is
)

# Fit the preprocessor first to get feature names
preprocessor.fit(data[['month', 'campaign_type', 'cost']])
# preprocessor.get_feature_names_out() retrieves all the feature names that are created or transformed by the ColumnTransformer,
# including those generated by transformers like OneHotEncoder, PCA, or even custom transformers that generate new features.
# and also passthrough variables
feature_names = preprocessor.get_feature_names_out()

# Print the transformed feature names for debugging
# print("Transformed feature names:", feature_names)

# Dynamically map the interaction columns
interaction_columns = [name for name in feature_names if 'month_January' in name or 'campaign_type_A' in name]
# print("\nInteractive columns:", interaction_columns)
# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('interaction', InteractionFeatures(
        numerical_columns=['num__cost'],  # Adjusted for transformed column names
        interaction_columns=interaction_columns
    )),
    ('model', LinearRegression())  # Example model
])

# Split features and target
X = data[['month', 'campaign_type', 'cost']]
y = data['free_trials']

# This assigns the feature_names, from the preprocessor, to the 'interaction' transformer in the pipeline.
# The 'named_steps' allows you to access specific steps by their name (e.g., 'interaction').
pipeline.named_steps['interaction'].feature_names = feature_names

# Fit the pipeline
# print("Fitting???")
pipeline.fit(X, y)
# print("Transforming???")
# Predict
predictions = pipeline.predict(X)
print(predictions)


X: [[ 1.          0.          0.          0.          0.39056673]
 [ 0.          0.          1.          0.         -0.65094455]
 [ 1.          0.          0.          0.          1.43207802]
 [ 0.          1.          0.          1.         -1.1717002 ]]
X_df:    ohe__month_January  ohe__month_March  ohe__campaign_type_B  \
0                 1.0               0.0                   0.0   
1                 0.0               0.0                   1.0   
2                 1.0               0.0                   0.0   
3                 0.0               1.0                   0.0   

   ohe__campaign_type_C  num__cost  
0                   0.0   0.390567  
1                   0.0  -0.650945  
2                   0.0   1.432078  
3                   1.0  -1.171700  
self.numerical_columns: ['num__cost']
interaction_columns: ['ohe__month_January']
self.feature_names ['ohe__month_January' 'ohe__month_March' 'ohe__campaign_type_B'
 'ohe__campaign_type_C' 'num__cost']
X: [[ 1.          0.     

In [13]:
'''
Key Changes for Efficiency
 I. NumPy-Based Interaction:
        a. Instead of using DataFrames for interaction terms, we compute them directly using NumPy arrays. This avoids the overhead of creating and managing DataFrames.
 II. Column Indexing:
        a. Instead of referencing columns by name (which is lost after ColumnTransformer), we use column indices to locate numerical and one-hot encoded features.
 III. Dynamic Interaction Columns:
        a. Numerical and categorical column indices are dynamically determined based on the transformed output.
 IV. Simplified Logic:
        a. The pipeline flow is logical: Preprocess first (preprocessor), apply interactions (InteractionFeatures), and fit the model (LinearRegression).

How It Works
 I. Preprocessing with ColumnTransformer:
        a. One-hot encoding and scaling are applied to the categorical and numerical features.
        b. The output is a NumPy array.
 II. Custom Transformer (InteractionFeatures):
        a. Computes interaction terms using indices for numerical and one-hot encoded features.
        b. Efficiently handles interaction computation using NumPy.
 III. Pipeline:
        a. Handles preprocessing, interaction computation, and model fitting in a seamless and efficient manner.

Advantages of This Approach
 I. Efficiency:
        a. No unnecessary conversions between DataFrames and NumPy arrays.
        b. Uses NumPy for matrix computations, which is faster.
 II. Maintainability:
        a. Logical flow with minimal hardcoding.
        b. Indices are derived dynamically, making it easier to adapt to new data.
 III. Scalability:
        a. Designed to handle larger datasets and more complex pipelines without significant overhead.

Output
 I. When you run the code:
        a. The pipeline processes the data, computes interaction terms, and fits the LinearRegression model.
        b. Predictions are successfully generated and printed.


'''
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

# Sample dataset
data = pd.DataFrame({
    'month': ['January', 'February', 'January', 'March'],
    'campaign_type': ['A', 'B', 'A', 'C'],
    'cost': [500, 300, 700, 200],
    'free_trials': [50, 30, 60, 25]
})

# Define feature columns
categorical_columns = ['month', 'campaign_type']
numerical_columns = ['cost']

# Custom transformer for interaction terms
class InteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_indices, interaction_indices):
        self.numerical_indices = numerical_indices  # List of numerical column indices
        self.interaction_indices = interaction_indices  # List of one-hot encoded column indices

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Compute interaction terms
        interaction_features = []
        for num_idx in self.numerical_indices:
            for int_idx in self.interaction_indices:
                interaction_features.append(X[:, num_idx] * X[:, int_idx])

        # Combine the original and interaction features
        interaction_features = np.column_stack(interaction_features)
        return np.hstack([X, interaction_features])

# Preprocessor to handle categorical and numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),  # One-hot encode categories
        ('num', StandardScaler(), numerical_columns)  # Scale numerical columns
    ],
    remainder='passthrough'  # Keep other columns as-is
)

# Fit the preprocessor separately to determine feature names
preprocessor.fit(data[['month', 'campaign_type', 'cost']])
feature_names = preprocessor.get_feature_names_out()

# Map column indices
numerical_indices = [i for i, name in enumerate(feature_names) if 'cost' in name]
interaction_indices = [i for i, name in enumerate(feature_names) if 'month_January' in name or 'campaign_type_A' in name]

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Apply preprocessing first
    ('interaction', InteractionFeatures(
        numerical_indices=numerical_indices,  # Dynamically determined indices for numerical columns
        interaction_indices=interaction_indices  # Dynamically determined indices for interaction columns
    )),
    ('model', LinearRegression())  # Example model
])

# Split features and target
X = data[['month', 'campaign_type', 'cost']]
y = data['free_trials']

# Fit the pipeline
pipeline.fit(X, y)

# Predict
predictions = pipeline.predict(X)
print("Predictions:", predictions)


Predictions: [50. 30. 60. 25.]
