# Census benchmark
## ML workflow

### The goal is to measure the total execution time: [Workflow execution cell](#execution_cell)

### Dataset link:
### `https://rapidsai-data.s3.us-east-2.amazonaws.com/datasets/ipums_education2income_1970-2010.csv.gz`

### Competition link:

In [1]:
import time
from timeit import default_timer as timer

import pandas as pd
from collections import OrderedDict
from dataclasses import dataclass
import typing
from flytekit import Resources, task, workflow, dynamic
from flytekit.types.file import FlyteFile
from flytekit.types.schema import FlyteSchema
import numpy as np
import sklearn.linear_model as lm
from sklearn.model_selection import train_test_split
from sklearn import config_context

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
import logging
from flytekit.loggers import logger

logger.setLevel(level=logging.WARN)
logger.getEffectiveLevel



### Common part: global variables and functions which don't require @task coverage

In [4]:
def split(X, y, test_size=0.1, stratify=None, random_state=None):
    t0 = timer()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=stratify, random_state=random_state
    )
    split_time = timer() - t0

    return (X_train, y_train, X_test, y_test), split_time

In [5]:
DATASET_PATH = "https://modin-datasets.s3.amazonaws.com/census/ipums_education2income_1970-2010.csv.gz"

COLS = [
    "YEAR",
    "DATANUM",
    "SERIAL",
    "CBSERIAL",
    "HHWT",
    "CPI99",
    "GQ",
    "PERNUM",
    "SEX",
    "AGE",
    "INCTOT",
    "EDUC",
    "EDUCD",
    "EDUC_HEAD",
    "EDUC_POP",
    "EDUC_MOM",
    "EDUCD_MOM2",
    "EDUCD_POP2",
    "INCTOT_MOM",
    "INCTOT_POP",
    "INCTOT_MOM2",
    "INCTOT_POP2",
    "INCTOT_HEAD",
    "SEX_HEAD",
]

COLUMNS_TYPES = [
    "int",
    "int",
    "int",
    "float",
    "int",
    "float",
    "int",
    "float",
    "int",
    "int",
    "int",
    "int",
    "int",
    "int",
    "int",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
    "float",
]

# X = OrderedDict((zip(COLS, list(map(eval, COLUMNS_TYPES)))))
# Y = OrderedDict({"EDUC": X.pop("EDUC")})
# Y

In [6]:
# ML specific
N_RUNS = 50
TEST_SIZE = 0.1
RANDOM_STATE = 777

ML_KEYS = ["t_train_test_split", "t_train", "t_inference", "t_ml"]
ML_SCORE_KEYS = ["mse_mean", "cod_mean", "mse_dev"]

## Workflow consisting from 2 tasks

In [7]:
# utils


def mse(y_test, y_pred):
    return ((y_test - y_pred) ** 2).mean()


def cod(y_test, y_pred):
    y_bar = y_test.mean()
    total = ((y_test - y_bar) ** 2).sum()
    residuals = ((y_test - y_pred) ** 2).sum()
    return 1 - (residuals / total)

In [8]:
@task
def feature_eng_task(
    data: FlyteFile[typing.TypeVar("csv")], cols: typing.List[str]
) -> (pd.DataFrame):

    df = pd.read_csv(data)[cols]

    df = df[df["INCTOT"] != 9999999]
    df = df[df["EDUC"] != -1]
    df = df[df["EDUCD"] != -1]

    df["INCTOT"] = df["INCTOT"] * df["CPI99"]

    for column in cols:
        df[column] = df[column].fillna(-1)
        df[column] = df[column].astype("float64")

    return df

In [9]:
@task
def ml_task(
    df: pd.DataFrame,
    random_state: int,
    n_runs: int,
    test_size: float,
    ml_keys: typing.List[str],
    ml_score_keys: typing.List[str],
) -> (typing.Dict[str, float], typing.Dict[str, float]):

    # Fetch the input and output data from train dataset
    y = np.ascontiguousarray(df["EDUC"], dtype=np.float64)
    X = np.ascontiguousarray(df.drop(columns=["EDUC", "CPI99"]), dtype=np.float64)

    clf = lm.Ridge()

    mse_values, cod_values = [], []
    ml_times = {key: 0.0 for key in ml_keys}
    ml_scores = {key: 0.0 for key in ml_score_keys}

    print("ML runs: ", n_runs)
    for i in range(n_runs):
        (X_train, y_train, X_test, y_test), split_time = split(
            X, y, test_size=test_size, random_state=random_state
        )
        ml_times["t_train_test_split"] += split_time
        random_state += 777

        t0 = timer()
        with config_context(assume_finite=True):
            model = clf.fit(X_train, y_train)
        ml_times["t_train"] += timer() - t0

        t0 = timer()
        y_pred = model.predict(X_test)
        ml_times["t_inference"] += timer() - t0

        mse_values.append(mse(y_test, y_pred))
        cod_values.append(cod(y_test, y_pred))

    ml_times["t_ml"] += ml_times["t_train"] + ml_times["t_inference"]

    ml_scores["mse_mean"] = sum(mse_values) / len(mse_values)
    ml_scores["cod_mean"] = sum(cod_values) / len(cod_values)
    ml_scores["mse_dev"] = pow(
        sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values])
        / (len(mse_values) - 1),
        0.5,
    )
    ml_scores["cod_dev"] = pow(
        sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values])
        / (len(cod_values) - 1),
        0.5,
    )

    return ml_scores, ml_times

In [14]:
@workflow
def census_bench_wf(
    dataset: FlyteFile["csv"] = DATASET_PATH,
    cols: typing.List[str] = COLS,
    random_state: int = RANDOM_STATE,
    n_runs: int = N_RUNS,
    test_size: float = TEST_SIZE,
    ml_keys: typing.List[str] = ML_KEYS,
    ml_score_keys: typing.List[str] = ML_SCORE_KEYS,
) -> (typing.Dict[str, float], typing.Dict[str, float]):
    df = feature_eng_task(data=dataset, cols=cols)
    ml_scores, ml_times = ml_task(
        df=df,
        random_state=random_state,
        n_runs=n_runs,
        test_size=test_size,
        ml_keys=ml_keys,
        ml_score_keys=ml_score_keys,
    )
    return ml_scores, ml_times

In [15]:
%%time
census_bench_wf()

ML runs:  50
CPU times: user 7min 11s, sys: 6min 43s, total: 13min 54s
Wall time: 6min 58s


DefaultNamedTupleOutput(o0={'mse_mean': 0.03256456908804994, 'cod_mean': 0.9953675334603814, 'mse_dev': 4.179940420229173e-05, 'cod_dev': 5.869227912341005e-06}, o1={'t_train_test_split': 140.56967029813677, 't_train': 143.43539352389053, 't_inference': 2.4777097539044917, 't_ml': 145.91310327779502})

## Workflow consisting from more detalized tasks

In [7]:
FEATURES = OrderedDict((zip(COLS, list(map(eval, COLUMNS_TYPES)))))
TARGET = OrderedDict({"EDUC": FEATURES.pop("EDUC")})

In [8]:
# utils

@task
def mse(y_test: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    return ((y_test - y_pred) ** 2).mean()

In [9]:
@task
def cod(y_test: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    y_bar = y_test.mean()
    total = ((y_test - y_bar) ** 2).sum()
    residuals = ((y_test - y_pred) ** 2).sum()
    return 1 - (residuals / total)

In [10]:
@task
def feature_eng_task(
    data: FlyteFile[typing.TypeVar("csv")], cols: typing.List[str]
) -> (pd.DataFrame):

    df = pd.read_csv(data)[cols]

    df = df[df["INCTOT"] != 9999999]
    df = df[df["EDUC"] != -1]
    df = df[df["EDUCD"] != -1]

    df["INCTOT"] = df["INCTOT"] * df["CPI99"]

    for column in cols:
        df[column] = df[column].fillna(-1)
        df[column] = df[column].astype("float64")

    return df

In [11]:
@dynamic
def ml_task(
    df: pd.DataFrame,
    random_state: int,
    n_runs: int,
    test_size: float,
    ml_keys: typing.List[str],
    ml_score_keys: typing.List[str],
) -> (typing.Dict[str, float], typing.Dict[str, float]):

    # Fetch the input and output data from train dataset
    #     y = np.ascontiguousarray(df["EDUC"], dtype=np.float64)
    #     X = np.ascontiguousarray(df.drop(columns=["EDUC", "CPI99"]), dtype=np.float64)
    y = df["EDUC"]
    X = df.drop(columns=["EDUC", "CPI99"])

    clf = lm.Ridge()

    mse_values, cod_values = [], []
    ml_times = {key: 0.0 for key in ml_keys}
    ml_scores = {key: 0.0 for key in ml_score_keys}

    print("ML runs: ", n_runs)
    for i in range(n_runs):
        (X_train, y_train, X_test, y_test), split_time = split(X=X, y=y)
        y_test = pd.DataFrame({"EDUC": y_test})
        ml_times["t_train_test_split"] += split_time
        random_state += 777

        t0 = timer()
        with config_context(assume_finite=True):
            model = clf.fit(X_train, y_train)
        ml_times["t_train"] += timer() - t0

        t0 = timer()
        y_pred = pd.DataFrame({"EDUC": model.predict(X_test)})
        ml_times["t_inference"] += timer() - t0

        mse_values.append(mse(y_test=y_test, y_pred=y_pred))
        cod_values.append(cod(y_test=y_test, y_pred=y_pred))

    ml_times["t_ml"] += ml_times["t_train"] + ml_times["t_inference"]

    ml_scores["mse_mean"] = sum(mse_values) / len(mse_values)
    ml_scores["cod_mean"] = sum(cod_values) / len(cod_values)
    ml_scores["mse_dev"] = pow(
        sum([(mse_value - ml_scores["mse_mean"]) ** 2 for mse_value in mse_values])
        / (len(mse_values) - 1),
        0.5,
    )
    ml_scores["cod_dev"] = pow(
        sum([(cod_value - ml_scores["cod_mean"]) ** 2 for cod_value in cod_values])
        / (len(cod_values) - 1),
        0.5,
    )

    return ml_scores, ml_times

In [12]:
@workflow
def census_bench_wf(
    dataset: FlyteFile["csv"] = DATASET_PATH,
    cols: typing.List[str] = COLS,
    random_state: int = RANDOM_STATE,
    n_runs: int = N_RUNS,
    test_size: float = TEST_SIZE,
    ml_keys: typing.List[str] = ML_KEYS,
    ml_score_keys: typing.List[str] = ML_SCORE_KEYS,
) -> (typing.Dict[str, float], typing.Dict[str, float]):
    df = feature_eng_task(data=dataset, cols=cols)
    ml_scores, ml_times = ml_task(
        df=df,
        random_state=random_state,
        n_runs=n_runs,
        test_size=test_size,
        ml_keys=ml_keys,
        ml_score_keys=ml_score_keys,
    )
    return ml_scores, ml_times

In [13]:
%%time

#  how workflow output looks like if ml function is decorated as @dynamic
census_bench_wf()

ML runs:  50
CPU times: user 10min 17s, sys: 5min 55s, total: 16min 13s
Wall time: 11min 44s


DefaultNamedTupleOutput(o0={'mse_mean': EDUC    15.160786
dtype: float64, 'cod_mean': EDUC    0.842082
dtype: float64, 'mse_dev': EDUC    0.053432
dtype: float64, 'cod_dev': EDUC    0.000722
dtype: float64}, o1={'t_train_test_split': 361.7690731417388, 't_train': 105.59981523500755, 't_inference': 3.1282578515820205, 't_ml': 108.72807308658957})