In [38]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

class ScoreFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Computes new features based on the target column 'score'.

    Features computed:
      - lag_score: The previous score for each student and subject
      - running_avg: The average of all past scores for each student and subject
      - subject_loo_avg: For each subject, the leave-one-out average score
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()
        # Sort by name, subject, and date so that lag and running average are computed in order
        df = df.sort_values(['name', 'subject', 'date']).reset_index(drop=True)

        # Compute lag feature
        df['lag_score'] = df.groupby(['name', 'subject'])['score'].shift(1)

        # Compute running average - fixed version
        df['running_avg'] = df.groupby(['name', 'subject'])['score'].transform(
            lambda x: x.expanding().mean().shift(1)
        )

        # Compute leave-one-out average for the subject
        def loo_avg(s):
            total = s.sum()
            count = s.count()
            return pd.Series(
                [(total - x) / (count - 1) if count > 1 else np.nan for x in s],
                index=s.index
            )

        df['subject_loo_avg'] = df.groupby('subject')['score'].transform(loo_avg)

        # Drop the target column
        df = df.drop(columns=['score'])
        return df

# Sample data
data = {
    'gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M'],
    'name': ['Alice', 'Bob', 'Alice', 'Bob', 'Alice', 'Bob', 'Alice', 'Bob'],
    'score': [85, 90, 88, 92, 87, 89, 90, 95],
    'subject': ['Math', 'Math', 'Science', 'Science', 'Math', 'Science', 'Science', 'Science'],
    'date': pd.to_datetime([
        '2023-01-10', '2023-01-12',
        '2023-01-15', '2023-01-16',
        '2023-02-10', '2023-02-12',
        '2023-02-15', '2023-02-16'
    ])
}

df = pd.DataFrame(data)

# Train/Test Split
X_with_score = df.copy()
y = X_with_score['score']
X_train, X_test, y_train, y_test = train_test_split(
    X_with_score, y, test_size=0.3, shuffle=False
)

# Date conversion function
def convert_date(df):
    df = df.copy()
    df['date_numeric'] = df['date'].astype(np.int64) // 10**9
    return df

# Apply date conversion
X_train = convert_date(X_train)
X_test = convert_date(X_test)

# Define features
numeric_features = ['lag_score', 'running_avg', 'subject_loo_avg', 'date_numeric']
categorical_features = ['gender', 'name', 'subject']

# Create preprocessor with imputation
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create and train pipeline
pipeline = Pipeline([
    ('feature_eng', ScoreFeatureEngineer()),
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Make predictions
predictions = pipeline.predict(X_test)

print("\nPredictions on Test Set:")
print(predictions)
print("\nActual Test Scores:")
print(y_test.values)


Predictions on Test Set:
[ 98.11578947 118.83157895 104.78947368]

Actual Test Scores:
[89 90 95]


In [39]:
df

Unnamed: 0,gender,name,score,subject,date
0,F,Alice,85,Math,2023-01-10
1,M,Bob,90,Math,2023-01-12
2,F,Alice,88,Science,2023-01-15
3,M,Bob,92,Science,2023-01-16
4,F,Alice,87,Math,2023-02-10
5,M,Bob,89,Science,2023-02-12
6,F,Alice,90,Science,2023-02-15
7,M,Bob,95,Science,2023-02-16


In [40]:
df = df.sort_values(['name', 'subject', 'date']).reset_index(drop=True)
df['lag_score'] = df.groupby(['name', 'subject'])['score'].shift(1)

In [41]:
# First sort by date within each group
df = df.sort_values(['name', 'subject', 'date']).reset_index(drop=True)

# Then calculate the running average
df['running_avg'] = df.groupby(['name', 'subject'])['score'].transform(
    lambda x: x.expanding().mean().shift(1)
)
df

Unnamed: 0,gender,name,score,subject,date,lag_score,running_avg
0,F,Alice,85,Math,2023-01-10,,
1,F,Alice,87,Math,2023-02-10,85.0,85.0
2,F,Alice,88,Science,2023-01-15,,
3,F,Alice,90,Science,2023-02-15,88.0,88.0
4,M,Bob,90,Math,2023-01-12,,
5,M,Bob,92,Science,2023-01-16,,
6,M,Bob,89,Science,2023-02-12,92.0,92.0
7,M,Bob,95,Science,2023-02-16,89.0,90.5


In [42]:
# Compute leave-one-out average for the subject
def loo_avg(s):
    print("s:", s)
    total = s.sum()
    print("total:", total)
    count = s.count()
    print("count:", count)
    return pd.Series(
        [(total - x) / (count - 1) if count > 1 else np.nan for x in s],
        index=s.index
    )

df['subject_loo_avg'] = df.groupby('subject')['score'].transform(loo_avg)
df

s: 0    85
1    87
4    90
Name: Math, dtype: int64
total: 262
count: 3
s: 2    88
3    90
5    92
6    89
7    95
Name: Science, dtype: int64
total: 454
count: 5


Unnamed: 0,gender,name,score,subject,date,lag_score,running_avg,subject_loo_avg
0,F,Alice,85,Math,2023-01-10,,,88.5
1,F,Alice,87,Math,2023-02-10,85.0,85.0,87.5
2,F,Alice,88,Science,2023-01-15,,,91.5
3,F,Alice,90,Science,2023-02-15,88.0,88.0,91.0
4,M,Bob,90,Math,2023-01-12,,,86.0
5,M,Bob,92,Science,2023-01-16,,,90.5
6,M,Bob,89,Science,2023-02-12,92.0,92.0,91.25
7,M,Bob,95,Science,2023-02-16,89.0,90.5,89.75


In [11]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Sample data with 5 columns: gender, name, score, subject, date.
data = {
    'gender': ['F', 'M', 'F', 'M', 'F', 'M', 'F', 'M'],
    'name': ['Alice', 'Bob', 'Alice', 'Bob', 'Alice', 'Bob', 'Alice', 'Bob'],
    'score': [85, 90, 88, 92, 87, 89, 90, 95],
    'subject': ['Math', 'Math', 'Science', 'Science', 'Math', 'Math', 'Science', 'Science'],
    'date': pd.to_datetime([
        '2023-01-10', '2023-01-12',
        '2023-01-15', '2023-01-16',
        '2023-02-10', '2023-02-12',
        '2023-02-15', '2023-02-16'
    ])
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# =============================================================================
# Custom Transformer for Feature Engineering
# =============================================================================
class ScoreFeatureEngineer(BaseEstimator, TransformerMixin):
    """
    Computes new features based on the target column 'score'.

    Features computed:
      - lag_score: The previous score for each student and subject.
      - running_avg: The average of all past scores for each student and subject.
      - subject_loo_avg: For each subject, the leave-one-out average score (i.e. average score
        computed using all other rows in that subject).

    After computing these features, the transformer drops the original 'score' column so that
    it is not used as an input feature in the model.

    Note: In a real-world time-aware scenario, you would need to ensure that only past data are
          used to compute these features. Here, we demonstrate a leave-one-out and lag approach.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        # Nothing special to learn from X; all computations are done per group.
        return self

    def transform(self, X):
        df = X.copy()
        # Sort by name, subject, and date so that lag and running average are computed in order.
        df = df.sort_values(['name', 'subject', 'date']).reset_index(drop=True)

        # Compute lag feature: previous score for each student in each subject.
        df['lag_score'] = df.groupby(['name', 'subject'])['score'].shift(1)

        # Compute running average: average of all previous scores.
        df['running_avg'] = df.groupby(['name', 'subject'])['score'].expanding().mean().shift(1)

        # Compute leave-one-out average for the subject.
        def loo_avg(s):
            total = s.sum()
            count = s.count()
            return s.apply(lambda x: (total - x) / (count - 1) if count > 1 else np.nan)

        df['subject_loo_avg'] = df.groupby('subject')['score'].transform(loo_avg)

        # Return the DataFrame without the target column.
        df = df.drop(columns=['score'])
        return df

# =============================================================================
# Train/Test Split (Simulating Production)
# =============================================================================
# IMPORTANT: To avoid leakage in a time-aware scenario, you should perform a time-based split.
# Here, we simply do a train_test_split with shuffle=False.
# Also note: we keep 'score' in X so that our custom transformer can compute features,
# but then it drops 'score' so the model does not see it.
X_with_score = df.copy()
y = X_with_score['score']

# For demonstration, we do a time-aware split by not shuffling (assume rows are in order).
X_train, X_test, y_train, y_test = train_test_split(X_with_score, y, test_size=0.3, shuffle=False)

# =============================================================================
# Preprocessor: Handling Numeric and Categorical Features
# =============================================================================
# We need to process the newly engineered features (which are numeric) along with other numeric features.
# For the date, we convert it to a numeric value (e.g., timestamp).
# (In a real scenario you might extract year/month/day features.)
def convert_date(df):
    df = df.copy()
    df['date_numeric'] = df['date'].astype(np.int64) // 10**9  # seconds since epoch
    return df

# Apply this conversion to training and test sets.
X_train = convert_date(X_train)
X_test = convert_date(X_test)

# List of numeric features (after feature engineering these will be available)
numeric_features = ['lag_score', 'running_avg', 'subject_loo_avg', 'date_numeric']
# List of categorical features (we exclude 'date' because we converted it)
categorical_features = ['gender', 'name', 'subject']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# =============================================================================
# Build the Pipeline
# =============================================================================
# The pipeline does the following steps:
#  1. Applies the custom feature engineering transformer (which computes lag, running average,
#     and leave-one-out features, then drops the target 'score').
#  2. Applies the preprocessor (scaling numeric features and one-hot encoding categoricals).
#  3. Fits a model (LinearRegression in this example).
pipeline = Pipeline([
    ('feature_eng', ScoreFeatureEngineer()),
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

# =============================================================================
# Train the Pipeline
# =============================================================================
pipeline.fit(X_train, y_train)

# =============================================================================
# Predict on the Test Set
# =============================================================================
predictions = pipeline.predict(X_test)

print("\nPredictions on Test Set:")
print(predictions)
print("\nActual Test Scores:")
print(y_test.values)


ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values