In [5]:
import time

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [3]:
%%time

# Load the training table with explicit, compact dtypes (32-bit numerics and
# pandas categoricals) so the frame stays small in memory, then keep a 30%
# random sample of the rows.
# NOTE(review): HOUSEHOLD_KEY is read as float32; if it is an integer
# identifier, float32 only represents integers exactly up to 2**24 - confirm
# this is intended (the column is excluded from the model features below).
df = pd.read_csv(
    "https://storage.googleapis.com/gcpdatascience-tribal-bird-305118/dunnhumby/training_data_psuedo.csv",
    dtype={
        'TARGET': 'int32',
        'HOUSEHOLD_KEY': 'float32',
        'COUPONS_REDEEMED': 'int32',
        'NUMBER_OF_CAMPAIGNS': 'int32',
        'COUPON_DAYS': 'int32',
        'FIRST_COUPON_DAY': 'int32',
        'LAST_COUPON_DAY': 'int32',
        'CAMPAIGNS_RECEIVED_REDEEMED_RATIO': 'float32',
        'NUMBER_TYPE_A': 'int32',
        'NUMBER_TYPE_B': 'int32',
        'NUMBER_TYPE_C': 'int32',
        'AGE_DESC': 'category',
        'MARITAL_STATUS_CODE': 'category',
        'INCOME_DESC': 'category',
        'HOMEOWNER_DESC': 'category',
        'HH_COMP_DESC': 'category',
        'HOUSEHOLD_SIZE_DESC': 'category',
        'KID_CATEGORY_DESC': 'category',
        'NUMBER_OF_PURCHASES': 'int32',
        'TOTAL_NUMBER_OF_PRODUCTS': 'int32',
        'AVG_PRODUCTS_PER_BASKET': 'float32',
        'TOTAL_SPEND': 'int32',
        'STD_SPEND': 'float32',
        'AVG_SPEND_PER_BASKET': 'float32',
        'AVG_SPEND_PER_PRODUCT': 'float32',
        'MOST_EXPENSIVE_PURCHASE': 'int32',
        'AVG_DISCOUNT': 'float32',
        'BIGGEST_DISCOUNT': 'int32',
        'CUSTOMER_AGE': 'int32',
        'NUMBER_OF_WEEKS': 'int32',
        'NUMBER_UNIQUE_DAYS': 'int32',
        'AVG_GAP_IN_TRIPS': 'float32',
        'STD_GAPS_IN_TRIPS': 'float32',
        'MAX_GAPS_IN_TRIPS': 'float32'
    }
# Seed the sample so a kernel restart draws the same rows; without it every
# run trains (and is timed) on a different subset. The seed matches the
# random_state=2 used for the split and the models in this notebook.
).sample(frac=0.3, random_state=2)

print(f"Num rows: {len(df)}, Size: {df.memory_usage(deep=True).sum() / 1e6} MB")

print(df.dtypes)

Num rows: 2100750, Size: 258.396047 MB
TARGET                                  int32
HOUSEHOLD_KEY                         float32
COUPONS_REDEEMED                        int32
NUMBER_OF_CAMPAIGNS                     int32
COUPON_DAYS                             int32
FIRST_COUPON_DAY                        int32
LAST_COUPON_DAY                         int32
CAMPAIGNS_RECEIVED_REDEEMED_RATIO     float32
NUMBER_TYPE_A                           int32
NUMBER_TYPE_B                           int32
NUMBER_TYPE_C                           int32
AGE_DESC                             category
MARITAL_STATUS_CODE                  category
INCOME_DESC                          category
HOMEOWNER_DESC                       category
HH_COMP_DESC                         category
HOUSEHOLD_SIZE_DESC                  category
KID_CATEGORY_DESC                    category
NUMBER_OF_PURCHASES                     int32
TOTAL_NUMBER_OF_PRODUCTS                int32
AVG_PRODUCTS_PER_BASKET               flo

In [6]:
# Every column except the label and the household identifier is a model input.
features = [col for col in df.columns if col not in ["TARGET", "HOUSEHOLD_KEY"]]
label = "TARGET"

# Route columns to a transformer by dtype: 32-bit numerics are imputed and
# scaled, pandas categoricals are one-hot encoded.
numeric_features = df[features].select_dtypes(include=["int32", "float32"]).columns
categorical_features = df[features].select_dtypes(include=["category"]).columns

## train test split
# NOTE(review): the split is not stratified on the label; the classifiers use
# class_weight="balanced", which suggests an imbalanced target - consider
# stratify=df[label] (this would change which rows land in each split).
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[label], test_size=0.25, random_state=2,
    shuffle=True
)

numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# handle_unknown="ignore": a category level that is absent from the rows the
# encoder was fitted on is encoded as all zeros instead of raising at
# transform/predict time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Pipeline.fit fits its steps in place (no cloning), so placing the very same
# ColumnTransformer object in several pipelines makes them share one fitted
# state: refitting one pipeline would silently change the preprocessing of the
# others. `lr` keeps the original `preprocessor` object; the other pipelines
# get independent unfitted copies with identical parameters.
lr = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("LR", LogisticRegression(solver="liblinear", class_weight="balanced", verbose=10)),
    ]
)

rf = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("RF", RandomForestClassifier(random_state=2, class_weight="balanced", verbose=10)),
    ]
)

# NOTE: despite the name, this is scikit-learn's GradientBoostingClassifier,
# not the XGBoost library; the variable and step names are kept as they are
# so other cells that refer to them keep working.
xgb = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("XGB", GradientBoostingClassifier(random_state=2, verbose=10)),
    ]
)

# Order matters for the timing cell: the pipelines are fitted in this order.
pipelines = [lr, rf, xgb]

## scikit-learn Pipeline

In [7]:
%%time 

# Fit every pipeline sequentially on the training split and time it.
# `pipeline_timed_fit` is the total wall-clock time in seconds for all fits;
# `pipeline_fit_seconds` additionally records each pipeline's own fit time,
# keyed by the name of its final (estimator) step, so the total can be
# attributed to individual models.
pipeline_fit_seconds = {}

start = time.perf_counter()
for pipe in pipelines:
    fit_start = time.perf_counter()
    pipe.fit(X_train, y_train)
    # steps[-1][0] is the name given to the last step of the pipeline.
    pipeline_fit_seconds[pipe.steps[-1][0]] = time.perf_counter() - fit_start

end = time.perf_counter()
pipeline_timed_fit = end - start

[LibLinear]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.9s remaining:    0.0s


building tree 2 of 100


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.3s remaining:    0.0s


building tree 3 of 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   27.6s remaining:    0.0s


building tree 4 of 100


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   36.6s remaining:    0.0s


building tree 5 of 100


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   45.9s remaining:    0.0s


building tree 6 of 100


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   55.4s remaining:    0.0s


building tree 7 of 100


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.1min remaining:    0.0s


building tree 8 of 100


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.2min remaining:    0.0s


building tree 9 of 100


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  1.4min remaining:    0.0s


building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 12.7min finished


      Iter       Train Loss   Remaining Time 
         1           1.2421           19.86m
         2           1.1888           19.77m
         3           1.1437           19.54m
         4           1.1048           19.33m
         5           1.0712           19.09m
         6           1.0406           18.88m
        16           0.8570           16.90m
        17           0.8456           16.70m
        18           0.8346           16.51m
        19           0.8244           16.31m
        20           0.8152           16.10m
        21           0.8058           15.91m
        22           0.7968           15.70m
        23           0.7888           15.50m
        24           0.7804           15.29m
        25           0.7725           15.09m
        26           0.7653           14.90m
        27           0.7578           14.73m
        28           0.7513           14.54m
        29           0.7448           14.35m
        30           0.7384           14.14m
        3