Commit

Integrate feature preprocessor as step in SKLL learner pipeline and also save pipeline separately
mulhod committed Jul 18, 2022
1 parent 933d17b commit b173a92
Showing 1 changed file with 98 additions and 4 deletions.
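In effect, each trained model now carries a scikit-learn `Pipeline` whose first step is the new RSMTool feature preprocessor, followed by the steps copied from the SKLL learner's own pipeline. A rough sketch of the resulting shape (not part of the diff; `DictVectorizer` and `LinearRegression` are stand-ins for whatever vectorizer and estimator the SKLL learner actually provides):

    from sklearn.feature_extraction import DictVectorizer
    from sklearn.linear_model import LinearRegression
    from sklearn.pipeline import Pipeline

    from rsmtool.modeler import RSMToolFeaturePreprocessor

    # the first step turns raw feature dicts into preprocessed feature
    # dicts; the remaining steps vectorize them and produce predictions
    pipeline = Pipeline([
        ("rsmtool_feature_preprocessor", RSMToolFeaturePreprocessor()),
        ("vectorizer", DictVectorizer(sparse=False)),
        ("estimator", LinearRegression()),
    ])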
102 changes: 98 additions & 4 deletions rsmtool/modeler.py
@@ -8,17 +8,22 @@
:organization: ETS
"""

import copy
import logging
import pickle
import warnings
from math import log10, sqrt
from os.path import join

import joblib
import numpy as np
import pandas as pd
import statsmodels.api as sm
from numpy.random import RandomState
from scipy.optimize import nnls
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from skll.data import FeatureSet
from skll.learner import Learner

@@ -30,6 +35,75 @@
from .writer import DataWriter


class RSMToolFeaturePreprocessor(BaseEstimator, TransformerMixin):
    """
    `scikit-learn`-compatible feature preprocessor that uses RSMTool.

    This preprocessor can be fitted on training data and then used to
    transform new features as part of a pipeline. The input and output
    of the `transform` method are iterables of JSON feature
    dictionaries.
    """

    def __init__(self, standardize_features=True):
        """
        Initialize the preprocessor.

        Parameters
        ----------
        standardize_features : bool, optional
            Whether or not to standardize feature values.
            Defaults to ``True``.
        """
        self.standardize_features = standardize_features
        self.processor = FeaturePreprocessor()

    def fit(self, X=None, y=None, feature_info=None):
        """
        Fit the feature preprocessor with some training data.

        Parameters
        ----------
        X : iterable of feature dictionaries
            Not used; only here for API compatibility.
        y : iterable of target variable values
            Not used; only here for API compatibility.
        feature_info : pd.DataFrame
            Feature information frame, including a "feature" column
            with the feature names.

        Returns
        -------
        self : RSMToolFeaturePreprocessor
            The fitted preprocessor.
        """
        self.df_feature_info = feature_info.copy()
        self.df_feature_info.set_index("feature", inplace=True)
        return self

    def transform(self, X, y=None):
        """
        Transform the given feature dictionaries.

        Parameters
        ----------
        X : iterable of feature dictionaries
            Feature dictionaries.
        y : None
            This parameter exists only to conform with the API. It is
            not used.

        Returns
        -------
        List[Dict[str, Any]]
            List of transformed feature dictionaries.
        """
        features = pd.DataFrame([{"spkitemid": i, **x} for i, x in enumerate(X)])
        with warnings.catch_warnings():
            warnings.simplefilter(action="ignore", category=FutureWarning)
            features, _ = self.processor.preprocess_new_data(
                features,
                self.df_feature_info,
                standardize_features=self.standardize_features,
            )
        del features["spkitemid"]
        return features.to_dict(orient="records")


class Modeler:
"""
Class to train model and generate predictions with built-in or SKLL models.
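For illustration, a minimal usage sketch of the new preprocessor on its own (not part of the commit): `df_feature_info` is assumed to be the feature-information frame that RSMTool computes during training, and the feature names and values are hypothetical.

    from rsmtool.modeler import RSMToolFeaturePreprocessor

    prep = RSMToolFeaturePreprocessor(standardize_features=True)
    prep.fit(feature_info=df_feature_info)  # frame with a "feature" column
    transformed = prep.transform([{"GRAMMAR": 0.5, "FLUENCY": 1.2}])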
@@ -256,6 +330,13 @@ def create_fake_skll_learner(self, df_coefficients):
        # now create its parameters from the coefficients from the built-in model
        learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
        learner.model.intercept_ = intercept

        # also attach a scikit-learn pipeline to the fake learner so that
        # it can be combined with the feature preprocessor in `train()`
        learner.pipeline = Pipeline(
            [
                ('vectorizer', copy.deepcopy(learner.feat_vectorizer)),
                ('estimator', copy.deepcopy(learner.model)),
            ]
        )

        return learner

    def train_linear_regression(self, df_train, feature_columns):
Expand Down Expand Up @@ -463,7 +544,7 @@ def train_lasso_fixed_lambda_then_lr(self, df_train, feature_columns):
        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True}, pipeline=True)
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
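(The new `pipeline=True` argument here, and in the other `Learner` constructors below, asks SKLL to attach a scikit-learn `pipeline` attribute to the trained learner; the `train` method at the end of the diff prepends the feature preprocessor to exactly those steps.)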
Expand Down Expand Up @@ -716,7 +797,7 @@ def train_lasso_fixed_lambda_then_non_negative_lr(self, df_train, feature_column
        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True}, pipeline=True)
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
Expand Down Expand Up @@ -805,7 +886,7 @@ def train_lasso_fixed_lambda(self, df_train, feature_columns):
        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True}, pipeline=True)
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
Expand Down Expand Up @@ -1112,7 +1193,8 @@ def train_skll_model(self,
        model_kwargs = custom_fixed_parameters if custom_fixed_parameters is not None else {}
        learner = Learner(model_name,
                          model_kwargs=model_kwargs,
                          probability=predict_expected_scores,
                          pipeline=True)

        # get the features, IDs, and labels from the given data frame
        feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]
Expand Down Expand Up @@ -1200,6 +1282,18 @@ def train(self,
        else:
            model = self.train_builtin_model(*args, **kwargs)

        # Generate an `sklearn`-style feature preprocessor to be used
        # later in the model's pipeline
        processor = RSMToolFeaturePreprocessor(standardize_features=True)
        processor.fit(feature_info=data_container.feature_info)

        # prepend the preprocessor to the steps from the learner's own
        # pipeline and serialize the combined pipeline separately
        pipeline = Pipeline(
            [("rsmtool_feature_preprocessor", processor)] +
            list(model.pipeline.named_steps.items())
        )
        pipeline_file_path = join(filedir, '{}.pipeline.model'.format(experiment_id))
        with open(pipeline_file_path, "wb") as pipeline_file:
            joblib.dump(pipeline, pipeline_file)

        return model

    def predict(self, df, min_score, max_score, predict_expected=False):
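Because the combined pipeline is serialized to `<experiment_id>.pipeline.model`, predictions can later be obtained directly from raw feature dictionaries without going through RSMTool itself. A minimal sketch (the path and feature names are hypothetical):

    import joblib

    pipeline = joblib.load("output/myexperiment.pipeline.model")
    predictions = pipeline.predict([{"GRAMMAR": 0.5, "FLUENCY": 1.2}])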
