In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [2]:
dir_int = os.path.abspath('../data/interim')
final_csv = os.path.join(dir_int, 'block_group_final.csv')

gdb_int = os.path.join(dir_int, 'interim.gdb')
final_fc = os.path.join(gdb_int, 'block_group_final')

In [3]:
final_df = pd.read_csv(final_csv)
final_df.drop('SHAPE', inplace=True, axis=1)
final_df.sample(5)

Unnamed: 0,origin_id,gender_pop0_cy,gender_pop5_cy,gender_pop10_cy,gender_pop15_cy,gender_pop20_cy,gender_pop25_cy,gender_pop30_cy,gender_pop35_cy,gender_pop40_cy,...,trip_time_minutes_OTHER_03,trip_time_minutes_STARBUCKS_01,trip_time_minutes_STARBUCKS_02,trip_time_minutes_STARBUCKS_03,trip_time_minutes_STUMPTOWN_COFFEE_ROASTERS_01,trip_time_minutes_STUMPTOWN_COFFEE_ROASTERS_02,trip_time_minutes_STUMPTOWN_COFFEE_ROASTERS_03,trip_time_minutes_UNCLE_GARYS_COFFEE_EMPORIUM_01,trip_time_minutes_UNCLE_GARYS_COFFEE_EMPORIUM_02,trip_time_minutes_UNCLE_GARYS_COFFEE_EMPORIUM_03
1916,410050209001,48.0,51.0,56.0,49.0,52.0,66.0,72.0,82.0,80.0,...,17.0,10.566667,15.258333,23.966667,17.983333,15.083333,,18.45,,
6883,530719201005,32.0,45.0,57.0,60.0,39.0,29.0,37.0,55.0,48.0,...,,,,,,,,,,
1740,410670325011,114.0,117.0,105.0,121.0,248.0,259.0,231.0,215.0,191.0,...,40.4,29.191667,30.533333,57.516667,49.183333,,,39.433333,36.316667,
1964,410050221052,150.0,165.0,173.0,166.0,144.0,183.0,183.0,161.0,164.0,...,27.025,18.2,35.966667,46.416667,69.683333,26.283333,,28.033333,6.2,10.116667
2381,410510073001,29.0,27.0,27.0,70.0,221.0,273.0,250.0,224.0,225.0,...,34.05,37.241667,24.516667,27.733333,26.65,20.75,25.066667,27.4,34.933333,26.766667


In [4]:
X = final_df
label_col = 'market_penetration_UNCLE_GARYS_COFFEE_EMPORIUM_01'

# while possibly useful for post analysis, the destination id columns are not needed for modeling
start_df = X[[col for col in X.columns if not col.startswith('dest_id') and col is not 'SHAPE']]

# only keep rows with values for our label
start_df = start_df[start_df[label_col].notnull()].copy()

# extract out the labels and factors into X and y, respectively
y = start_df[label_col]
X = start_df[[col for col in start_df if col is not label_col]]

prep_pipe = Pipeline([
    ('impute_mean', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', MinMaxScaler())
])

X = prep_pipe.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
from tpot import TPOTRegressor

tpot = TPOTRegressor(verbosity=3)
tpot

TPOTRegressor(config_dict=None, crossover_rate=0.1, cv=5,
       disable_update_check=False, early_stop=None, generations=100,
       max_eval_time_mins=5, max_time_mins=None, memory=None,
       mutation_rate=0.9, n_jobs=1, offspring_size=None,
       periodic_checkpoint_folder=None, population_size=100,
       random_state=None, scoring=None, subsample=1.0, use_dask=False,
       verbosity=3, warm_start=False)

In [None]:
tpot.fit(X_train, y_train)

29 operators have been imported by TPOT.


In [None]:
print(tpot.score(X_test, y_test))

In [None]:
tpot.export('tpot_pdx_coffee_pipeline.py')