In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display configuration for the rest of the notebook:
#   - numpy: fixed-point output (no scientific notation), 4 decimals,
#     7 leading/trailing items shown when an array is summarised
#   - pandas: floats rendered with 4 decimals, and no cap on the number
#     of columns shown for wide frames
np.set_printoptions(precision=4, suppress=True, edgeitems=7)
pd.set_option("display.float_format", "{:.4f}".format)
pd.set_option("display.max_columns", None)

In [3]:
# Load the input dataset from a path relative to the notebook's working directory
input_csv_path = "./InputData/full_data.csv"
df = pd.read_csv(input_csv_path)

In [4]:
# Cast the identifier/code columns to strings so they are not treated as numbers.
# NOTE(review): astype(str) turns missing values into the literal string "nan" —
# confirm these columns contain no NaN before relying on missing-value counts.
for id_col in ("market_id", "store_id", "order_protocol"):
    df[id_col] = df[id_col].astype(str)

In [5]:
# Remove the two timestamp columns, which are not used as features
non_feature_cols = ["created_at", "actual_delivery_time"]
df = df.drop(columns=non_feature_cols)

In [6]:
# Display the dtype of every remaining column to verify the string casts applied earlier
df.dtypes

market_id                                        object
store_id                                         object
store_primary_category                           object
order_protocol                                   object
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
weekday_0                                         int64
weekday_1                                         int64
weekday_2                                         int64
weekday_3                                       

# Automated

In [7]:
# Build the automated EDA (profiling) report.

# Columns excluded from the profiled frame: the weekday_* indicators, the
# hour/minute sin-cos columns and the two event flags.
excluded_from_report = [
    "weekday_0", "weekday_1", "weekday_2", "weekday_3",
    "weekday_4", "weekday_5", "weekday_6",
    "hour_sin", "hour_cos",
    "minute_sin", "minute_cos",
    "superbowl", "valentines",
]
df_report = df.drop(columns=excluded_from_report)

# Columns the profiler should treat as categorical regardless of inferred type
categorical_cols = [
    "market_id",
    "store_id",
    "store_primary_category",
    "order_protocol",
]

profile = ProfileReport(
    df_report,
    type_schema={col: "categorical" for col in categorical_cols},
    vars={
        # NOTE(review): a threshold of 0 presumably stops low-cardinality numeric
        # columns from being re-typed as categorical — confirm against the
        # ydata-profiling settings reference.
        "num": {"low_categorical_threshold": 0},
        "cat": {"length": False},
    },
    missing_diagrams=None,
    correlations={"auto": {"threshold": 0.6}},
    # Interaction plots are requested against the "duration" column only
    interactions={"targets": ["duration"]},
)

In [8]:
# Write the profiling report to an HTML file (path relative to the working directory)
report_path = "EDAReport.html"
profile.to_file(report_path)

Summarize dataset: 100%|████████████████████████████████████████████████████| 39/39 [00:02<00:00, 13.35it/s, Completed]
Generate report structure: 100%|█████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.24s/it]
Render HTML: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.47it/s]
Export report to file: 100%|████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 333.44it/s]


# Manual