In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

# Sample dataset
data = pd.DataFrame({
    'month': ['January', 'February', 'January', 'March'],
    'campaign_type': ['A', 'B', 'A', 'C'],
    'cost': [500, 300, 700, 200],
    'free_trials': [50, 30, 60, 25]
})

# Define feature columns
categorical_columns = ['month', 'campaign_type']
numerical_columns = ['cost']

# Custom transformer for interaction terms
class InteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns, interaction_columns):
        self.numerical_columns = numerical_columns  # Numerical columns
        self.interaction_columns = interaction_columns  # One-hot encoded column names
        self.feature_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Convert NumPy array to DataFrame using saved column names
        X_df = pd.DataFrame(X, columns=self.feature_names)

        # Create interaction terms
        interaction_features = []
        for num_col in self.numerical_columns:
            for int_col in self.interaction_columns:
                if num_col in X_df.columns and int_col in X_df.columns:
                    interaction_features.append(X_df[num_col] * X_df[int_col])
                else:
                    raise KeyError(f"Column '{int_col}' not found in transformed features.")

        # Combine the original and interaction features
        interaction_features = np.column_stack(interaction_features)
        return np.hstack([X, interaction_features])

# Preprocessor to handle categorical and numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),  # One-hot encode categories
        ('num', StandardScaler(), numerical_columns)  # Scale numerical columns
    ],
    remainder='passthrough'  # Keep other columns as-is
)

# Fit the preprocessor first to get feature names
preprocessor.fit(data[['month', 'campaign_type', 'cost']])
feature_names = preprocessor.get_feature_names_out()

# Print the transformed feature names for debugging
print("Transformed feature names:", feature_names)

# Dynamically map the interaction columns
interaction_columns = [name for name in feature_names if 'month_January' in name or 'campaign_type_A' in name]

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('interaction', InteractionFeatures(
        numerical_columns=['num__cost'],  # Adjusted for transformed column names
        interaction_columns=interaction_columns
    )),
    ('model', LinearRegression())  # Example model
])

# Split features and target
X = data[['month', 'campaign_type', 'cost']]
y = data['free_trials']

# Set column names dynamically for the interaction step
pipeline.named_steps['interaction'].feature_names = feature_names

# Fit the pipeline
pipeline.fit(X, y)

# Predict
predictions = pipeline.predict(X)
print(predictions)


Transformed feature names: ['cat__month_January' 'cat__month_March' 'cat__campaign_type_B'
 'cat__campaign_type_C' 'num__cost']
[50. 30. 60. 25.]


In [11]:
'''
Key Changes for Efficiency
NumPy-Based Interaction:

Instead of using DataFrames for interaction terms, we compute them directly using NumPy arrays. This avoids the overhead of creating and managing DataFrames.
Column Indexing:

Instead of referencing columns by name (which is lost after ColumnTransformer), we use column indices to locate numerical and one-hot encoded features.
Dynamic Interaction Columns:

Numerical and categorical column indices are dynamically determined based on the transformed output.
Simplified Logic:

The pipeline flow is logical:
Preprocess first (preprocessor).
Apply interactions (InteractionFeatures).
Fit the model (LinearRegression).

'''
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression

# Sample dataset
data = pd.DataFrame({
    'month': ['January', 'February', 'January', 'March'],
    'campaign_type': ['A', 'B', 'A', 'C'],
    'cost': [500, 300, 700, 200],
    'free_trials': [50, 30, 60, 25]
})

# Define feature columns
categorical_columns = ['month', 'campaign_type']
numerical_columns = ['cost']

# Custom transformer for interaction terms
class InteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns, interaction_indices):
        self.numerical_columns = numerical_columns  # List of numerical column indices
        self.interaction_indices = interaction_indices  # List of one-hot encoded column indices

    def fit(self, X, y=None):
        return self  # No fitting necessary

    def transform(self, X):
        # Compute interaction terms
        interaction_features = []
        for num_idx in self.numerical_columns:
            for int_idx in self.interaction_indices:
                interaction_features.append(X[:, num_idx] * X[:, int_idx])

        # Combine the original and interaction features
        interaction_features = np.column_stack(interaction_features)
        return np.hstack([X, interaction_features])

# Preprocessor to handle categorical and numerical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_columns),  # One-hot encode categories
        ('num', StandardScaler(), numerical_columns)  # Scale numerical columns
    ],
    remainder='passthrough'  # Keep other columns as-is
)

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Apply preprocessing first
    ('interaction', InteractionFeatures(
        numerical_columns=[preprocessor.transformers_[1][2].index('cost')],  # Cost column index in the scaled output
        interaction_indices=list(range(len(categorical_columns)))  # Indices of one-hot encoded columns
    )),
    ('model', LinearRegression())  # Example model
])

# Split features and target
X = data[['month', 'campaign_type', 'cost']]
y = data['free_trials']

# Fit the pipeline
pipeline.fit(X, y)

# Predict
predictions = pipeline.predict(X)
print("Predictions:", predictions)


AttributeError: 'ColumnTransformer' object has no attribute 'transformers_'

In [None]:
def fit(self, X, y=None):
        return self  # No fitting necessary

    def transform(self, X):
        # Multiply numerical columns with one-hot encoded columns
        print("What is X:", X)


In [None]:
 print("Type | X:", type(X))
        interaction_features = []
        for num_col in self.numerical_columns:
            for int_col in self.interaction_columns:
                interaction_features.append(X[num_col] * X[int_col])
        # Combine the original and interaction features
        interaction_features = np.column_stack(interaction_features)
        return np.hstack([X.values, interaction_features])

# Define preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_columns),  # One-hot encode categories
        ('num', StandardScaler(), numerical_columns)  # Scale numerical columns
    ],
    remainder='passthrough'  # Keep other columns as-is
)

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('interaction', InteractionFeatures(
        numerical_columns=['cost'],
        interaction_columns=['cat__month_January', 'cat__campaign_type_A']  # Specify interaction columns
    )),
    ('model', LinearRegression())  # Example model
])

# Split features and target
X = data[['month', 'campaign_type', 'cost']]
y = data['free_trials']

# Fit the pipeline
pipeline.fit(X, y)

# Predict
predictions = pipeline.predict(X)
