# Explore the input data and build a model

In [None]:
import copy
import cPickle as pickle
from datetime import datetime
from IPython.display import display
import logging
from matplotlib.dates import MonthLocator, WeekdayLocator, DateFormatter
import matplotlib.pyplot as plt
import multiprocessing
import numpy as np
import pandas as pd
import psycopg2
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sys
import time

# Apply matplotlib's 'ggplot' style sheet to the figures drawn in this notebook.
plt.style.use('ggplot')
# IPython magic (not plain Python): render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Lift pandas' display limits so wide/long frames are printed in full
# (None means "no limit" for both options).
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

## Set up logging

In [None]:
# Root logging goes to stdout at INFO level; each record carries the timestamp,
# process id / thread name, logger name and level ahead of the message.
log_format = '%(asctime)s - %(process)d/%(threadName)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(stream=sys.stdout, format=log_format, level=logging.INFO)

# Named logger used by the notebook.
logger = logging.getLogger('main()')

In [None]:
# Current wall-clock time, truncated to whole seconds since the Unix epoch.
now_seconds = time.time()
epoch = int(now_seconds)
epoch

In [None]:
# Load the historical data CSV, asking pandas to parse both timestamp columns
# as datetimes, then preview the first rows.
str_file_csv = 'historical_data.csv'
path_csv = '../../../data/input/' + str_file_csv
cols_datetime = ['created_at', 'actual_delivery_time']
df_csv = pd.read_csv(path_csv, parse_dates=cols_datetime)
df_csv.head()

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df_csv.shape

### Look at histograms

In [None]:
# Number of rows per distinct market_id, ordered by the id value rather than by count.
df_csv['market_id'].value_counts().sort_index()

In [None]:
# Number of rows per distinct store_id, ordered by the id value rather than by count.
df_csv['store_id'].value_counts().sort_index()

In [None]:
# Number of rows per distinct store_primary_category, ordered by category value.
df_csv['store_primary_category'].value_counts().sort_index()

In [None]:
# Number of rows per distinct order_protocol, ordered by the protocol value.
df_csv['order_protocol'].value_counts().sort_index()

### Make sure that datetimes are parsed correctly

In [None]:
# Confirm that read_csv parsed `created_at` into a datetime type.
# Positional access (.iloc) is used so the check still works if this cell is
# re-run after rows have been dropped and index label 0 no longer exists.
type(df_csv['created_at'].iloc[0])

In [None]:
# Confirm that read_csv parsed `actual_delivery_time` into a datetime type.
# Positional access (.iloc) is used so the check still works if this cell is
# re-run after rows have been dropped and index label 0 no longer exists.
type(df_csv['actual_delivery_time'].iloc[0])

### Look for NaNs

In [None]:
# Per-column flag: True when the column holds at least one missing value.
df_tmp = df_csv.isnull().any(axis=0)
# Show only the flagged columns (the boolean Series serves as its own mask).
df_tmp.loc[df_tmp]

## Drop rows where the outcome variable cannot be computed.
"The target value to predict here is the total seconds value between `created_at` and `actual_delivery_time`"

In [None]:
# Keep only rows where both timestamps are present; the outcome is their
# difference, so it cannot be computed when either one is missing.
df_csv = df_csv.dropna(subset=['created_at', 'actual_delivery_time'])

In [None]:
# (rows, columns) after dropping rows with a missing timestamp.
df_csv.shape

### Look for outliers

In [None]:
# Columns to box-plot: every column except the two timestamps, store_id and
# store_primary_category. list.remove raises ValueError if a name is absent.
cols_boxplot = list(df_csv.columns)
for col_excluded in ('created_at',
                     'actual_delivery_time',
                     'store_id',
                     'store_primary_category'):
    cols_boxplot.remove(col_excluded)

In [None]:
# Draw one box plot per selected column.
for col in cols_boxplot:
    # Open a new figure first so each column's plot lands on its own figure.
    plt.figure()
    df_csv.boxplot(column=col)

### Fill NaNs
* Use median if the column is continuous
* Use mode if the column is categorical
* Keep track of the medians (continuous columns) and modes (categorical columns) used here, because the same values must be used to fill NaNs in production
* The set of features should be `cols_categorical` + `cols_cont` + one-hot-encoded `store_primary_category`

In [None]:
# Columns treated as categorical: their NaNs are filled with the column mode.
cols_categorical = ['market_id', 'order_protocol']

In [None]:
# Display the mode(s) of each categorical column.
df_csv[cols_categorical].mode()

In [None]:
# Columns treated as continuous: their NaNs are filled with the column median.
cols_cont = [
    'total_items',
    'subtotal',
    'num_distinct_items',
    'min_item_price',
    'max_item_price',
    'total_onshift_dashers',
    'total_busy_dashers',
    'total_outstanding_orders',
    'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
]

In [None]:
# Display the median of each continuous column.
df_csv[cols_cont].median()

In [None]:
# Fill missing categorical values with each column's mode (first mode on ties).
# The modes are kept in `modes_categorical` so the same values can be reused
# to fill NaNs outside this notebook.
# The filled column is assigned back instead of calling
# `df_csv[col].fillna(..., inplace=True)`: that chained in-place call may act
# on a temporary copy and leave `df_csv` unchanged.
modes_categorical = {}
for col in cols_categorical:
    modes_categorical[col] = df_csv[col].mode()[0]
    df_csv[col] = df_csv[col].fillna(modes_categorical[col])

In [None]:
# Fill missing continuous values with each column's median.
# The medians are kept in `medians_cont` so the same values can be reused
# to fill NaNs outside this notebook.
# The filled column is assigned back instead of calling
# `df_csv[col].fillna(..., inplace=True)`: that chained in-place call may act
# on a temporary copy and leave `df_csv` unchanged.
medians_cont = {}
for col in cols_cont:
    medians_cont[col] = df_csv[col].median()
    df_csv[col] = df_csv[col].fillna(medians_cont[col])

In [None]:
# Missing store categories become the explicit label 'unknown'.
df_csv['store_primary_category'] = df_csv['store_primary_category'].fillna('unknown')

In [None]:
# Re-run the missing-value check: flag each column that still holds a NaN
# and show only the flagged ones (the boolean Series serves as its own mask).
df_tmp = df_csv.isnull().any(axis=0)
df_tmp.loc[df_tmp]

### One-hot-encode certain categorical columns

### Compute outcome variable

In [None]:
# Outcome: elapsed time from order creation to delivery, expressed in seconds.
col_outcome = 'outcome_total_time'
delivery_elapsed = df_csv['actual_delivery_time'] - df_csv['created_at']
# Dividing a timedelta by a one-second timedelta yields a plain float count of seconds.
df_csv[col_outcome] = delivery_elapsed / np.timedelta64(1, 's')
df_csv.head()

In [None]:
# Box plot of the outcome column, drawn before outliers are replaced.
df_csv.boxplot(column=[col_outcome])

### Replace outliers with median

In [None]:
# Summary statistics of the outcome, including the upper percentiles (90/95/99).
df_csv[col_outcome].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.90, 0.95, 0.99])

In [None]:
# Treat outcomes above 1e4 seconds as outliers: blank them out first so they
# do not influence the median, then fill the blanks with that median.
df_csv.loc[df_csv[col_outcome] > 1e4, col_outcome] = np.nan
# The filled column is assigned back instead of calling
# `df_csv[col_outcome].fillna(..., inplace=True)`: that chained in-place call
# may act on a temporary copy and leave `df_csv` unchanged.
df_csv[col_outcome] = df_csv[col_outcome].fillna(df_csv[col_outcome].median())

In [None]:
# Box plot of the outcome column again, after outliers were replaced with the median.
df_csv.boxplot(column=[col_outcome])

## Split dataset into training and test

### Train and pickle the model