In [46]:
import ast
import csv

import numpy as np
import pandas as pd

In [47]:
# Read every CSV export into its own module-level DataFrame

# The exports fail to decode with the default codec (a Unicode error is
# thrown), so 'latin-1' is used as the encoding for all of them.
default_encoding = 'latin-1'

# TODO: what about 18f_checkout_items2.csv ?
PARKED_CARTS_DF, PARKED_ITEMS_DF, CHECKOUTS_DF, CHECKOUT_ITEMS_DF = (
    pd.read_csv(csv_path, encoding=default_encoding)
    for csv_path in (
        'data/18f_parked_carts.csv',
        'data/18f_parked_items.csv',
        'data/18f_checkouts.csv',
        'data/18f_checkout_items.csv',
    )
)


In [48]:
# Row count of the parked-items frame (one row per parked item)
PARKED_ITEMS_DF.shape[0]

320747

In [49]:
# Row count of the checkout-items frame (one row per checkout item)
CHECKOUT_ITEMS_DF.shape[0]

419671

In [50]:
# Column labels of CHECKOUT_ITEMS_DF, as a plain list
CHECKOUT_ITEMS_DF.columns.tolist()

['sessionid',
 'user_name',
 'action_time',
 'agency',
 'event_type',
 'order_session_number',
 'bpa_number',
 'ct',
 'mfr',
 'part',
 'product_name',
 'search_query',
 'unit_price',
 'quantity',
 'uom',
 'payment_type']

In [51]:
# Column labels of PARKED_ITEMS_DF, as a plain list
PARKED_ITEMS_DF.columns.tolist()

['park_cart_id',
 'bpa_number',
 'ct',
 'event_type',
 'mfr',
 'part',
 'product_name',
 'search_query',
 'unit_price',
 'quantity',
 'uom']

In [52]:
# How many items were purchased from a direct link or by browsing,
# i.e., with an empty `search_query` field?
missing_query_mask = CHECKOUT_ITEMS_DF['search_query'].isnull()
checkout_items_without_search_query = CHECKOUT_ITEMS_DF.loc[missing_query_mask]
checkout_items_without_search_query.shape[0]

220649

In [53]:
# ...the same count, expressed as a percentage of all checkout items
share_without_query = len(checkout_items_without_search_query) / len(CHECKOUT_ITEMS_DF)
print("{0:.2f}%".format(share_without_query * 100.0))

52.58%


In [60]:
# Annotate item dataframes with "real" Python `list` of search terms

def search_query_to_list(val):
    """Convert a serialized search-query cell into a real Python list of terms.

    `val` is a string that looks like a Python list literal, e.g.
    "['K40', 'CONTACT', 'POSITIONER']".

    The value is parsed as a Python literal first, so terms that contain an
    apostrophe (which are written with double quotes, e.g. ["O'RING", 'X']),
    an embedded comma, or a backslash escape are decoded correctly. If the
    value is not a well-formed list of strings, it falls back to stripping
    the first and last character and splitting the remainder as a
    single-quoted, comma-separated CSV record.

    Parameters
    ----------
    val : str or NaN/None
        Raw cell content of a `search_query` column.

    Returns
    -------
    list of str, or NaN
        The search terms, or `np.nan` when the cell is empty.
    """
    if pd.isnull(val):
        # return NaN for empty cell values
        return np.nan

    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal; use the lenient CSV-style parse below.
        parsed = None

    # Only trust the literal parse when it yields exactly what callers
    # expect: a list whose elements are all strings.
    if isinstance(parsed, list) and all(isinstance(term, str) for term in parsed):
        return parsed

    # Fallback: each val starts and ends with '[' and ']', so strip those
    # off and read what is left as one CSV row quoted with single quotes.
    inner = val[1:-1]
    rows = csv.reader([inner], quotechar="'", skipinitialspace=True)
    return list(rows)[0]

def annotate_with_search_query_list(df):
    """Add a `_search_query_list` column to `df`, modifying it in place.

    Each value of the `search_query` column is passed through
    `search_query_to_list` and the results are stored in the new
    `_search_query_list` column. Nothing is returned.
    """
    parsed_queries = df['search_query'].apply(search_query_to_list)
    df['_search_query_list'] = parsed_queries


# Annotate both item frames, parked items first and then checkout items
for items_df in (PARKED_ITEMS_DF, CHECKOUT_ITEMS_DF):
    annotate_with_search_query_list(items_df)

In [55]:
# TODO: Find repeated search terms

# ...for checkout items


# ...for parked items

In [56]:
# TODO: What's the average price of sold items?


In [57]:
# TODO: What's the average price of parked items?
