# Install or upgreade an required package into notebook environment

In [1]:
!pip install abacusai --upgrade



# Import Packages required
## We will be using pandas and the Abacus.ai client

In [2]:
import pandas as pd
from abacusai import ApiClient, ApiException

# Build a local model
To begin, we'll read a dataset from s3 into a pandas dataframe. We then manipulate the data, prepare it for training and train a model locally

In [3]:
concrete_df = pd.read_csv(
    "s3://abacusai-exampledatasets/predicting/concrete_measurements.csv"
)
concrete_df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


### We perform a simple transform. Please be sure to provide a constant we utilize in our transform function
Importing pandas directly in the function body is important for later. Be sure to test your transform function!

In [7]:
def transform_concrete(concrete_dataframe):
    import pandas as pd

    constant = 16
    # assert constant, "Please provide a constant to use in featurization"
    # Remove the flyash feature
    feature_df = concrete_dataframe.drop(["flyash"], axis=1).copy()
    # DataFrame with rows where flyash == 0
    no_flyash = feature_df[concrete_dataframe["flyash"] == 0.0].copy()
    no_flyash.loc[:, "slag"] = no_flyash["slag"] - no_flyash["water"].mean()
    # DataFrame with rows where flyash > 0
    flyash = feature_df[concrete_dataframe["flyash"] > 0.0].copy()
    flyash.loc[:, "slag"] = flyash["slag"] + constant
    return pd.concat([no_flyash, flyash - flyash.assign(age=0).mean()])

In [8]:
transformed_concrete_df = transform_concrete(concrete_df)
transformed_concrete_df

Unnamed: 0,cement,slag,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.000000,-186.616784,162.000000,2.500000,1040.000000,676.000000,28.0,79.990000
1,540.000000,-186.616784,162.000000,2.500000,1055.000000,676.000000,28.0,61.890000
2,332.500000,-44.116784,228.000000,0.000000,932.000000,594.000000,270.0,40.270000
3,332.500000,-44.116784,228.000000,0.000000,932.000000,594.000000,365.0,41.050000
4,198.600000,-54.216784,192.000000,0.000000,978.400000,825.500000,360.0,44.300000
...,...,...,...,...,...,...,...,...
1022,23.427802,69.081250,20.092241,-2.926078,-139.784052,6.173491,28.0,6.885539
1025,35.327802,74.081250,4.192241,0.073922,-102.284052,-15.926509,28.0,9.625539
1026,81.127802,-41.918750,20.592241,1.573922,-154.484052,29.173491,28.0,-3.474461
1027,-92.572198,97.481250,17.292241,-2.726078,-79.984052,-4.226509,28.0,-10.954461


### We now prepare the data for training by splitting it between a train and test split

In [9]:
def assign_train_test_split(data_to_split):
    import pandas as pd
    from sklearn.model_selection import train_test_split

    training_data, prediction_data = train_test_split(
        data_to_split.copy(), test_size=0.1, train_size=0.9, random_state=42
    )
    training_data["TRAINTEST"] = "TRAIN"
    prediction_data["TRAINTEST"] = "TEST"
    return pd.concat([training_data, prediction_data])

In [10]:
concrete_train_test = assign_train_test_split(transformed_concrete_df)
concrete_training_data = concrete_train_test.loc[
    concrete_train_test["TRAINTEST"] == "TRAIN"
].drop(columns={"TRAINTEST"}, axis=1)
concrete_prediction_data = concrete_train_test.loc[
    concrete_train_test["TRAINTEST"] == "TEST"
].drop(columns={"TRAINTEST"}, axis=1)

## Define a local training and predict function
We define a simple linear regression model that utilizes a quantile transformer for numeric columns. We also define a prediction function. Of course, we execute both of these to ensure they work.

In [11]:
def train(training_dataset):
    import numpy as np

    # set the seed for reproduceable results
    np.random.seed(5)

    target_column = "csMPa"
    X = training_dataset.drop([target_column], axis=1).copy()
    y = training_dataset[target_column]
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import QuantileTransformer

    qt = QuantileTransformer(n_quantiles=20)
    linear_regresion = LinearRegression()
    linear_regresion.fit(qt.fit_transform(X), y)
    print(f"Linear model R^2 = {linear_regresion.score(qt.transform(X), y)}")

    return (X.columns, target_column, qt, linear_regresion)

### Examine the train function outputs

In [12]:
local_model = train(concrete_training_data)
input_columns, target, transform, model = local_model
print(input_columns)
print(target)
print(transform)
print(model)

Linear model R^2 = 0.9114936771485455
Index(['cement', 'slag', 'water', 'superplasticizer', 'coarseaggregate',
       'fineaggregate', 'age'],
      dtype='object')
csMPa
QuantileTransformer(n_quantiles=20)
LinearRegression()


In [13]:
def predict_many(model, queries):
    input_columns, target_column, qt, model = model
    import pandas as pd

    predictions = []
    for query in queries:
        X = pd.DataFrame({c: [query[c]] for c in input_columns})
        y = model.predict(qt.transform(X))
        predictions.append({target_column: y[0]})
    return predictions

### Let's see what our model is predicting
We aren't too worried about performance for our toy model

In [14]:
prediction_results = predict_many(
    local_model, concrete_prediction_data.to_dict(orient="records")
)
prediction_results[:5]

[{'csMPa': 37.697797557944256},
 {'csMPa': 47.571887792434},
 {'csMPa': 64.70979460520971},
 {'csMPa': 43.66334359567851},
 {'csMPa': 13.843112761580656}]

# Incorporate our Model into Abacus
With a working local model, we can now leverage the Abacus.ai client and our training/predict code to bring our model into the Abacus Platform

In [15]:
# Instatiate a client instance
client = ApiClient(server='https://workshop.abacus.ai')

## Creating our first project
The first project we create is a Feature Store project. This will act as a location to store feature groups (data transformations) we create.
https://workshop.abacus.ai/help/useCases/feature_store

In [17]:
feature_store_project_name = "csaba_demo"
#assert (
#    feature_store_project_name
#), "Please provide a name for your Feature Store Project"

feature_store_project_exists = False
use_case = "FEATURE_STORE"
feature_store_project = None

for existing_project in client.list_projects():
    if feature_store_project_name == existing_project.name:
        feature_store_project_exists = True
        feature_store_project = existing_project

if not feature_store_project_exists:
    feature_store_project = client.create_project(feature_store_project_name, use_case)
feature_store_project

Project(project_id='7a6b2d5da',
  name='csaba_demo',
  use_case='FEATURE_STORE',
  problem_type='FEATURE_STORE',
  created_at='2023-03-07T18:04:35+00:00',
  feature_groups_enabled=True)

### Manipulating Data in Abacus
We can leverage Feature Groups in Abacus.ai in order to manipulate and transform our data. Each feature group must have a unique, so we define a unique prefix variable for easy uniqueness

In [18]:
unique_prefix = "csaba"
# assert unique_prefix, "Please provide a unique_prefix to avoid name collision"

## We can make Feature Groups directly from Pandas DataFrames

In [19]:
concrete_feature_group_name = unique_prefix + "concrete_attributes"
try:
    concrete_feature_group = client.create_feature_group_from_pandas_df(
        concrete_feature_group_name, df=concrete_df
    )
except ApiException as e:
    print(e)
    concrete_feature_group = client.describe_feature_group_by_table_name(
        concrete_feature_group_name
    )
try:
    concrete_feature_group.add_to_project(feature_store_project.id)
except ApiException as e:
    print(e)
concrete_feature_group.add_tag("base_data")
concrete_feature_group.wait_for_dataset()

FeatureGroup(modification_lock=False,
  feature_group_id='ba9989150',
  name='csabaconcrete_attributes',
  feature_group_source_type='DATASET',
  table_name='csabaconcrete_attributes',
  sql=None,
  dataset_id='88e815d12',
  function_source_code=None,
  function_name=None,
  source_tables=[],
  created_at='2023-03-07T18:04:55+00:00',
  description=None,
  feature_group_type='CUSTOM_TABLE',
  sql_error=None,
  latest_version_outdated=False,
  referenced_feature_groups=None,
  tags=['base_data'],
  primary_key=None,
  update_timestamp_key=None,
  lookup_keys=None,
  streaming_enabled=False,
  feature_group_use=None,
  incremental=False,
  merge_config=None,
  transform_config=None,
  sampling_config=None,
  cpu_size=None,
  memory=None,
  streaming_ready=None,
  feature_tags=None,
  module_name=None,
  template_bindings=None,
  feature_expression='',
  use_original_csv_names=False,
  python_function_bindings=None,
  python_function_name=None,
  annotation_config=None,
  project_config=No

## We can define generalized python functions to generate Feature Groups
First we create the function

In [20]:
python_transform_function_name = unique_prefix + "concrete_python_transformation"
try:
    python_transform_function = client.create_python_function(
        name=python_transform_function_name
    )
except ApiException as e:
    print(e)
    python_transform_function = client.describe_python_function(
        python_transform_function_name
    )

### We then update the functions code defintion. Just like a normal python function, we have variable mappings

In [21]:
python_transform_function = client.update_python_function_code(
    python_transform_function.name, function=transform_concrete
)
python_transform_function.function_variable_mappings

[{'name': 'concrete_dataframe',
  'is_required': True,
  'variable_type': 'FEATURE_GROUP'}]

### Next, we map inputs to our bindings

In [22]:
transform_input_feature_groups = [concrete_feature_group.table_name]
transform_function_bindings = [
    dict({"value": transform_input_feature_groups[index]}, **function_argument)
    for index, function_argument in enumerate(
        python_transform_function.function_variable_mappings
    )
]
transform_function_bindings

[{'value': 'csabaconcrete_attributes',
  'name': 'concrete_dataframe',
  'is_required': True,
  'variable_type': 'FEATURE_GROUP'}]

### Finally, we can create a feature group from this function

In [23]:
transform_fg_name = unique_prefix + "concrete_transform_fg"
try:
    transform_fg = client.create_feature_group_from_function(
        table_name=transform_fg_name,
        python_function_name=python_transform_function.name,
        python_function_bindings=transform_function_bindings,
    )
except ApiException as e:
    print(e)
    transform_fg = client.describe_feature_group_by_table_name(transform_fg_name)
try:
    transform_fg.add_to_project(feature_store_project.id)
except ApiException as e:
    print(e)
transform_fg.add_tag("transform")
transform_fg.add_tag("python")

In [24]:
transform_fg.id, feature_store_project.id

('e51ef9417', '7a6b2d5da')

## We repeat the process, and can pass a python feature group as input to another python function

In [25]:
data_split_function_name = unique_prefix + "train_test_split"
try:
    data_split_function = client.create_python_function(name=data_split_function_name)
except ApiException as e:
    print(e)
    data_split_function = client.describe_python_function(data_split_function_name)

In [26]:
data_split_function = client.update_python_function_code(
    name=data_split_function_name, function=assign_train_test_split
)
data_split_function.function_variable_mappings

[{'name': 'data_to_split',
  'is_required': True,
  'variable_type': 'FEATURE_GROUP'}]

In [27]:
split_input_feature_groups = [transform_fg.table_name]
data_split_bindings = [
    dict({"value": split_input_feature_groups[index]}, **function_argument)
    for index, function_argument in enumerate(
        data_split_function.function_variable_mappings
    )
]
data_split_bindings

[{'value': 'csabaconcrete_transform_fg',
  'name': 'data_to_split',
  'is_required': True,
  'variable_type': 'FEATURE_GROUP'}]

In [28]:
data_split_fg_name = unique_prefix + "concrete_data_split_fg"
try:
    data_split_fg = client.create_feature_group_from_function(
        data_split_fg_name,
        python_function_name=data_split_function.name,
        python_function_bindings=data_split_bindings,
    )
except ApiException as e:
    print(e)
    data_split_fg = client.describe_feature_group_by_table_name(data_split_fg_name)
try:
    data_split_fg.add_to_project(feature_store_project.id)
except ApiException as e:
    print(e)
data_split_fg.add_tag("data_split")
data_split_fg.add_tag("python")

In [29]:
data_split_fg.materialize()
data_split_fg.refresh().features

[Feature(name='cement',
   select_clause=None,
   feature_mapping=None,
   source_table='csabaconcrete_transform_fg',
   original_name=None,
   using_clause=None,
   order_clause=None,
   where_clause=None,
   feature_type='NUMERICAL',
   data_type='FLOAT',
   detected_feature_type='NUMERICAL',
   detected_data_type=None,
   columns=None,
   point_in_time_info=None),
 Feature(name='slag',
   select_clause=None,
   feature_mapping=None,
   source_table='csabaconcrete_transform_fg',
   original_name=None,
   using_clause=None,
   order_clause=None,
   where_clause=None,
   feature_type='NUMERICAL',
   data_type='FLOAT',
   detected_feature_type='NUMERICAL',
   detected_data_type=None,
   columns=None,
   point_in_time_info=None),
 Feature(name='water',
   select_clause=None,
   feature_mapping=None,
   source_table='csabaconcrete_transform_fg',
   original_name=None,
   using_clause=None,
   order_clause=None,
   where_clause=None,
   feature_type='NUMERICAL',
   data_type='FLOAT',
   de

# Not only Python Functions
With Feature Groups, we can also execute SQL queries. Here we split our data between train and test using SQL

In [30]:
all_feature_but_traintest = [
    feature.name
    for feature in data_split_fg.refresh().features
    if feature.name != "TRAINTEST"
]

In [31]:
concrete_training_data_fg_name = unique_prefix + "concrete_training_data_fg"
train_sql = f"SELECT {', '.join(all_feature_but_traintest)} FROM {data_split_fg.table_name} WHERE TRAINTEST = 'TRAIN'"
try:
    concrete_training_data_fg = client.create_feature_group(
        concrete_training_data_fg_name, sql=train_sql
    )
except ApiException as e:
    print(e)
    concrete_training_data_fg = client.describe_feature_group_by_table_name(
        concrete_training_data_fg_name
    )
try:
    concrete_training_data_fg.add_to_project(feature_store_project.id)
except ApiException as e:
    print(e)
concrete_training_data_fg.add_tag("training")
concrete_training_data_fg.add_tag("sql")

In [32]:
concrete_predicting_data_fg_name = unique_prefix + "concrete_predicting_data_fg"
train_sql = f"SELECT {', '.join(all_feature_but_traintest)} FROM {data_split_fg.table_name} WHERE TRAINTEST = 'TEST'"
try:
    concrete_predicting_data_fg = client.create_feature_group(
        concrete_predicting_data_fg_name, sql=train_sql
    )
except ApiException as e:
    print(e)
    concrete_predicting_data_fg = client.describe_feature_group_by_table_name(
        concrete_predicting_data_fg_name
    )
try:
    concrete_predicting_data_fg.add_to_project(feature_store_project.id)
except ApiException as e:
    print(e)
concrete_predicting_data_fg.add_tag("predicting")
concrete_predicting_data_fg.add_tag("sql")

# Abacus BYOA - Run your model alongside Abacus's world class algorithms
With a handle on training a custom model in an Abacus notebook, we can now register a algorithm to run concurrently with Abacus' own algorithms. This allows us to make quick comparisons across models and helps control train and test splits.

First we create a common Predictive Modeling project

In [33]:
byoa_project_name = "byoa_proj"
# assert byoa_project_name, "Please provide a name for your BYOA Project"

byoa_project_exists = False
use_case = "PREDICTING"
byoa_project = None

for existing_project in client.list_projects():
    if byoa_project_name == existing_project.name:
        byoa_project_exists = True
        byoa_project = existing_project

if not byoa_project_exists:
    byoa_project = client.create_project(byoa_project_name, use_case)
byoa_project

Project(project_id='2f89d014b',
  name='byoa_proj',
  use_case='PREDICTING',
  problem_type='REGRESSION',
  created_at='2023-03-07T18:12:24+00:00',
  feature_groups_enabled=True)

### Add the data split feature group to the project
We also describe the requirements for the Predictive Modeling use case

In [34]:
try:
    data_split_fg.add_to_project(byoa_project, feature_group_type="CUSTOM_TABLE")
except ApiException as e:
    print(e)

In [35]:
client.describe_use_case_requirements(byoa_project.use_case)

[UseCaseRequirements(dataset_type='CUSTOM_TABLE',
   name='Custom Table',
   description='This dataset corresponds to any attributes used to predict the target. For example, predicting housing price based on locality, utilities, area of house, etc.',
   required=True,
   multi=False,
   allowed_feature_mappings={'TARGET': {'description': 'Target variable', 'allowed_feature_types': ['CATEGORICAL', 'NUMERICAL', 'MULTIVALUECATEGORICAL'], 'required': True}, 'IGNORE': {'description': 'Ignore this column in training', 'multiple': True, 'required': False}},
   allowed_nested_feature_mappings={'IGNORE': {'description': 'Ignore this column in training', 'multiple': True, 'required': False}}),
 UseCaseRequirements(dataset_type='PREDICTION_METRICS_INPUT',
   name='Prediction Metrics Input',
   description='Prediction metrics input feature group',
   required=False,
   multi=False,
   allowed_feature_mappings={'ACTUAL': {'description': 'Actual value for the prediction input', 'required': True}, 'P

### With feature mapping requirements known, we set a target

In [36]:
byoa_project.set_feature_mapping(
    data_split_fg.id, feature_name="csMPa", feature_mapping="TARGET"
)

[Feature(name='cement',
   select_clause=None,
   feature_mapping=None,
   source_table='csabaconcrete_transform_fg',
   original_name=None,
   using_clause=None,
   order_clause=None,
   where_clause=None,
   feature_type='NUMERICAL',
   data_type='FLOAT',
   detected_feature_type='NUMERICAL',
   detected_data_type=None,
   columns=None,
   point_in_time_info=None),
 Feature(name='slag',
   select_clause=None,
   feature_mapping=None,
   source_table='csabaconcrete_transform_fg',
   original_name=None,
   using_clause=None,
   order_clause=None,
   where_clause=None,
   feature_type='NUMERICAL',
   data_type='FLOAT',
   detected_feature_type='NUMERICAL',
   detected_data_type=None,
   columns=None,
   point_in_time_info=None),
 Feature(name='water',
   select_clause=None,
   feature_mapping=None,
   source_table='csabaconcrete_transform_fg',
   original_name=None,
   using_clause=None,
   order_clause=None,
   where_clause=None,
   feature_type='NUMERICAL',
   data_type='FLOAT',
   de

### Validate the project and feature groups to ensure we're ready to train a model

In [37]:
byoa_project.validate(feature_group_ids=[data_split_fg.id])

ProjectValidation(valid=True,
  dataset_errors=[],
  column_hints={})

### The client is self documenting, we can view the possible training options

In [38]:
client.get_training_config_options(byoa_project.id, [data_split_fg.id])

[TrainingConfigOptions(name='GROUP_DESCRIPTION_DATA_SPLIT',
   data_type='CONSTANT',
   value_type=None,
   value_options=None,
   value=None,
   default='During the process of training a model, we split the data into 2 chunks - Train and Test data. Test data is further split into Test_Val (Test.Test_Val) and Test (Test.Test) data. The following configuration helps you decide what goes into each of these splits',
   options={'showLabel': False, 'newLine': True, 'oneColumn': True},
   description=None,
   required=None,
   last_model_value=None,
   needs_refresh=None),
 TrainingConfigOptions(name='GROUP_DESCRIPTION_FEATURES_AND_COLUMNS',
   data_type='CONSTANT',
   value_type=None,
   value_options=None,
   value=None,
   default='Configure features & columns for training',
   options={'showLabel': False, 'newLine': True, 'oneColumn': True},
   description=None,
   required=None,
   last_model_value=None,
   needs_refresh=None),
 TrainingConfigOptions(name='GROUP_DESCRIPTION_DATA_AUGMEN

## Simulate running on Abacus locally
We leverage specifically designed APIs to replicate how Abacus will pass input parameters to our training function to test locally. We can pass both Abacus and User configurations to our model

In [39]:
training_table_names = [data_split_fg.table_name]
abacus_ai_config = {"TEST_ROW_INDICATOR": "TRAINTEST"}
user_config = {"n_quantiles": 20}
import copy

training_config = copy.deepcopy(abacus_ai_config)
training_config.update({"USER": user_config})
print(training_config)
# Change the value of the parameter names if preferred, and don't forget to change in train_func as well.
training_data_parameter_name_override = {"CUSTOM_TABLE": "training_data"}
training_config_parameter_name_override = "training_config"

{'TEST_ROW_INDICATOR': 'TRAINTEST', 'USER': {'n_quantiles': 20}}


### We manipulate the train function to fit the Abacus framework
And train locally

In [40]:
def byoa_train(training_data, schema_mappings, training_config):
    import numpy as np

    train_test_column = training_config.get("TEST_ROW_INDICATOR", None)
    user_defined_config = training_config.get("USER")
    n_quantiles = user_defined_config.get("n_quantiles")

    # set the seed for reproduceable results
    np.random.seed(5)

    target_columns = schema_mappings["training_data"].get("TARGET", [])
    assert target_columns, "No target column provided"
    ignore_columns = schema_mappings["training_data"].get("IGNORE", [])
    drop_columns = target_columns + ignore_columns
    drop_columns.append(train_test_column)
    X = training_data.drop(drop_columns, axis=1).copy()
    y = training_data[target_columns]
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import QuantileTransformer

    qt = QuantileTransformer(n_quantiles=n_quantiles)
    linear_regresion = LinearRegression()
    linear_regresion.fit(qt.fit_transform(X), y)
    print(f"Linear model R^2 = {linear_regresion.score(qt.transform(X), y)}")

    return (X.columns, target_columns[0], qt, linear_regresion)

In [41]:
# This currently returns the full data for local testing, but on remote training, only the training rows are sent to the train_functin.
train_input = client.get_train_function_input(
    project_id=byoa_project.project_id,
    training_table_names=training_table_names,
    training_data_parameter_name_override=training_data_parameter_name_override,
    training_config_parameter_name_override=training_config_parameter_name_override,
    training_config=training_config,
)
# You can also override with any dataframe for local testing.
model_input_table = data_split_fg.latest_feature_group_version.load_as_pandas()
train_input["training_data"] = model_input_table.loc[
    model_input_table["TRAINTEST"] == "TRAIN"
]
local_model = byoa_train(**train_input)

Linear model R^2 = 0.9114936771485455


### Similarly, we define a predict many function and test

In [42]:
def byoa_predict_many(model, queries):
    columns, target_column, qt, linear_model = model
    import pandas as pd

    result = []
    for query in queries:
        X = pd.DataFrame({c: [query[c]] for c in columns})
        y = linear_model.predict(qt.transform(X))[0]
        result.append({target_column: y[0]})
    return result

In [43]:
prediction_results = byoa_predict_many(
    local_model, concrete_prediction_data.to_dict(orient="records")
)
prediction_results[:5]

[{'csMPa': 37.697797557944256},
 {'csMPa': 47.571887792434},
 {'csMPa': 64.70979460520971},
 {'csMPa': 43.66334359567851},
 {'csMPa': 13.843112761580656}]

# Registering the Algorithm
We can register an algorithm at the project or organization level, but the name must always be all caps and begin with 'USER.'

In [45]:
algorithm_scope = "project"
algorithm_name = "USER.CSABA"
assert algorithm_name.split(".")[1], "Please provide an ALLCAPS name for your algorithm"

In [46]:
try:
    byoa_algorithm = client.create_algorithm_from_function(
        name=algorithm_name,
        problem_type="REGRESSION",
        training_data_parameter_names_mapping=training_data_parameter_name_override,
        training_config_parameter_name=training_config_parameter_name_override,
        train_function=byoa_train,
        predict_many_function=byoa_predict_many,
    )
except ApiException as e:
    print(e)
    byoa_algorithm = client.describe_algorithm(algorithm_name)
byoa_algorithm

Algorithm(name='USER.CSABA',
  problem_type='REGRESSION',
  created_at='2023-03-07T18:24:43+00:00',
  updated_at='2023-03-07T18:24:43+00:00',
  is_default_enabled=False,
  training_input_mappings={'training_config_parameter_name': 'training_config', 'training_data_parameter_names_mapping': {'CUSTOM_TABLE': 'training_data'}},
  train_function_name='byoa_train',
  predict_function_name=None,
  predict_many_function_name='byoa_predict_many',
  initialize_function_name=None,
  config_options=None,
  algorithm_id='USER.CSABA',
  use_gpu=False,
  code_source=CodeSource(source_type='TEXT',
  source_code='def byoa_train(training_data, schema_mappings, training_config):\n    import numpy as np\n\n    train_test_column = training_config.get("TEST_ROW_INDICATOR", None)\n    user_defined_config = training_config.get("USER")\n    n_quantiles = user_defined_config.get("n_quantiles")\n\n    # set the seed for reproduceable results\n    np.random.seed(5)\n\n    target_columns = schema_mappings["traini

### With a registered Algorithm, we train our model, passing Abacus and User configs

In [47]:
import json

feature_group_ids = [
    client.describe_feature_group_by_table_name(table_name).feature_group_id
    for table_name in training_table_names
]
model_w_byoa = client.train_model(
    project_id=byoa_project.id,
    name="BYOA Regresion Model",
    training_config=abacus_ai_config,
    feature_group_ids=feature_group_ids,
    custom_algorithms=[algorithm_name],
    custom_algorithm_configs={algorithm_name: json.dumps(user_config)},
    # change to LARGE for extensive computation
    cpu_size="MEDIUM",
    # max memory is 60GB
    memory=12,
)

In [48]:
import time
# Wait for our specific Algorithm to complete training
while algorithm_name not in [
    algo["name"]
    for algo in model_w_byoa.describe().latest_model_version.deployable_algorithms
]:
    time.sleep(5)
else:
    print(model_w_byoa.describe().latest_model_version.deployable_algorithms)

[{'name': 'Abacus Deep Learning - Best Fit Neural Network', 'algorithm': '90a154592'}, {'name': 'Abacus Deep Learning - Best Fit Neural Network with Feature Selection', 'algorithm': 'd3ff38742'}, {'name': 'Abacus Classical - Decision Trees', 'algorithm': 'f86976002'}, {'name': 'Abacus Classical - Linear Model', 'algorithm': 'eade0a9c8'}, {'name': 'USER.CSABA', 'algorithm': 'USER.CSABA'}]


### We select our algorithm as the default in order to validate predictions
Default algorithms are utilized in our deployment unless overridden

In [49]:
model_w_byoa.set_default_algorithm(algorithm_name)
byoa_deployment_token = client.create_deployment_token(
    byoa_project.id, "BYOA_Deployment_Token"
)
byoa_deployment = client.create_deployment(
    name="BYOA Deployment", model_id=model_w_byoa.id
)
byoa_deployment.wait_for_deployment()

Deployment(deployment_id='4228943f1',
  name='BYOA Deployment',
  status='ACTIVE',
  description='',
  deployed_at='2023-03-07T18:33:31+00:00',
  created_at='2023-03-07T18:32:15+00:00',
  project_id='2f89d014b',
  model_id='5979de452',
  model_version='1bb8a7e33',
  feature_group_id=None,
  feature_group_version=None,
  calls_per_second=5,
  auto_deploy=True,
  algo_name='USER.CSABA',
  regions=[{'name': 'Us East 2', 'value': 'us-east-2'}],
  error=None,
  batch_streaming_updates=False,
  algorithm='USER.CSABA',
  pending_model_version=None,
  model_deployment_config={},
  refresh_schedules=None,
  feature_group_export_config=None)

### With our deployment we once again verify parity across the local predictions, the Abacus prediction API and Abacus Batch Prediction
This verifies are models are all behaving the same

In [50]:
prediction_results = byoa_predict_many(
    local_model, concrete_prediction_data.to_dict(orient="records")
)
prediction_results[:5]

[{'csMPa': 37.697797557944256},
 {'csMPa': 47.571887792434},
 {'csMPa': 64.70979460520971},
 {'csMPa': 43.66334359567851},
 {'csMPa': 13.843112761580656}]

In [51]:
import json

for record in concrete_prediction_data.to_dict(orient="records")[:5]:
    print(
        client.predict(
            deployment_token=byoa_deployment_token,
            deployment_id=byoa_deployment.id,
            query_data=json.dumps(record),
        )
    )

{'csMPa': 37.697797557944256}
{'csMPa': 47.571887792434}
{'csMPa': 64.70979460520971}
{'csMPa': 43.66334359567851}
{'csMPa': 13.843112761580656}


In [52]:
byoa_batch_prediction = byoa_deployment.create_batch_prediction(
    table_name="byoa_evaluation_output",
    name="BYOA Evaluation Prediction",
    global_prediction_args={"forEval": True},
    output_includes_metadata=True
)
byoa_batch_prediction_run = byoa_batch_prediction.start()
byoa_batch_prediction_run.wait_for_predictions()

BatchPredictionVersion(batch_prediction_version='665b37a88',
  batch_prediction_id='d3db6f4c5',
  status='COMPLETE',
  drift_monitor_status=None,
  deployment_id='4228943f1',
  model_id='ae84be9e0',
  model_version='70c3883c1',
  predictions_started_at='2023-03-07T18:34:37+00:00',
  predictions_completed_at='2023-03-07T18:36:05+00:00',
  global_prediction_args={'forEval': True},
  database_output_error=False,
  total_predictions=103,
  failed_predictions=0,
  database_connector_id=None,
  database_output_configuration=None,
  explanations=False,
  file_connector_output_location=None,
  file_output_format='csv',
  connector_type=None,
  legacy_input_location=None,
  error=None,
  drift_monitor_error=None,
  csv_input_prefix='',
  csv_prediction_prefix='prediction_',
  csv_explanations_prefix='explanation_',
  database_output_total_writes=None,
  database_output_failed_writes=None,
  output_includes_metadata=True,
  result_input_columns=None,
  model_monitor_version='54853220e',
  algo_n

In [53]:
byoa_eval_output_fg = client.describe_feature_group_by_table_name(byoa_batch_prediction.feature_group_table_name)
byoa_eval_output_fg.wait_for_materialization().latest_feature_group_version.load_as_pandas()['prediction_csMPa'][:5]

0    37.697798
1    47.571888
2    64.709795
3    43.663344
4    13.843113
Name: prediction_csMPa, dtype: float64

# Congratulations
You now can take a locally train a model, import it to Abacus as an independent, and import it to Abacus to run alongside our state of the art algorithms, competing against our Advanced AI systems.

In [54]:
[{'csMPa': 37.697797557944256},
 {'csMPa': 47.571887792434},
 {'csMPa': 64.70979460520971},
 {'csMPa': 43.66334359567851},
 {'csMPa': 13.843112761580656}]

[{'csMPa': 37.697797557944256},
 {'csMPa': 47.571887792434},
 {'csMPa': 64.70979460520971},
 {'csMPa': 43.66334359567851},
 {'csMPa': 13.843112761580656}]