In [46]:
import ast
import csv

import numpy as np
import pandas as pd

In [47]:
# Read the four 18F CSV exports into module-level DataFrames.

# These files are read as 'latin-1'; per the original note, a Unicode error
# is thrown when that encoding is not used.
default_encoding = 'latin-1'

PARKED_CARTS_DF, PARKED_ITEMS_DF, CHECKOUTS_DF, CHECKOUT_ITEMS_DF = (
    pd.read_csv(csv_path, encoding=default_encoding)
    for csv_path in (
        'data/18f_parked_carts.csv',
        'data/18f_parked_items.csv',
        'data/18f_checkouts.csv',
        # TODO: what about 18f_checkout_items2.csv ?
        'data/18f_checkout_items.csv',
    )
)


In [48]:
# Row count of the parked-items table
len(PARKED_ITEMS_DF.index)

320747

In [49]:
# Row count of the checkout-items table
len(CHECKOUT_ITEMS_DF.index)

419671

In [50]:
# Column names available on CHECKOUT_ITEMS_DF
CHECKOUT_ITEMS_DF.columns.tolist()

['sessionid',
 'user_name',
 'action_time',
 'agency',
 'event_type',
 'order_session_number',
 'bpa_number',
 'ct',
 'mfr',
 'part',
 'product_name',
 'search_query',
 'unit_price',
 'quantity',
 'uom',
 'payment_type']

In [51]:
# Column names available on PARKED_ITEMS_DF
PARKED_ITEMS_DF.columns.tolist()

['park_cart_id',
 'bpa_number',
 'ct',
 'event_type',
 'mfr',
 'part',
 'product_name',
 'search_query',
 'unit_price',
 'quantity',
 'uom']

In [52]:
# Checkout items whose `search_query` cell is empty, read here as purchases
# made from a direct link or by browsing rather than by searching.
# How many of them are there?
checkout_items_without_search_query = CHECKOUT_ITEMS_DF.loc[
    CHECKOUT_ITEMS_DF['search_query'].isna()
]
len(checkout_items_without_search_query.index)

220649

In [53]:
# ...and the same figure as a percentage of all checkout items
print("{0:.2f}%".format(
    len(checkout_items_without_search_query.index) / len(CHECKOUT_ITEMS_DF.index) * 100.0
))

52.58%


In [71]:
# Annotate item dataframes with a list of search terms for each row

def search_query_to_list(val):
    """Parse one `search_query` cell into a list of search-term strings.

    A cell holds the text form of a list of quoted strings, e.g.
    "['K40', 'CONTACT', 'POSITIONER']".

    Args:
        val: The raw cell value: a string as above, or a null (None / NaN)
            for rows that have no search query.

    Returns:
        list: The terms as strings with surrounding whitespace stripped.
        Null cells and "[]" both give an empty list.
    """
    if pd.isnull(val):
        # return empty list for empty cell values
        return []

    text = val.strip()

    # Preferred path: read the cell as a Python list literal. This handles
    # terms written with double quotes (as happens when a term contains an
    # apostrophe) and backslash escapes, both of which the csv-based parsing
    # below gets wrong.
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, list) and all(isinstance(t, str) for t in parsed):
        return [t.strip() for t in parsed]

    # Fallback for cells that are not a clean list-of-strings literal
    # (unquoted terms, bare numbers, ...): drop the enclosing brackets if
    # they are present and split the remainder as a single CSV row that
    # uses ' as its quote character.
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
    reader = csv.reader([text], quotechar="'", skipinitialspace=True)
    rows = list(reader)
    terms = rows[0] if rows else []
    return [t.strip() for t in terms]

def annotate_with_search_query_list(df):
    """Add a `_search_query_list` column to `df`, modifying it in place.

    Each value is `search_query_to_list` applied to that row's
    `search_query` cell. Nothing is returned.
    """
    parsed_queries = df['search_query'].apply(search_query_to_list)
    df['_search_query_list'] = parsed_queries


# Both item tables are modified in place: each gains a `_search_query_list` column.
annotate_with_search_query_list(PARKED_ITEMS_DF)
annotate_with_search_query_list(CHECKOUT_ITEMS_DF)

In [96]:
# Find repeated search terms

def search_term_counts(df):
    """Tally how often each search term occurs in `df['_search_query_list']`.

    Terms are lower-cased before counting, so 'Pens' and 'PENS' are the
    same term.

    Returns a DataFrame with the columns 'term' and 'count', one row per
    distinct lower-cased term, in the iteration order of the tally dict.
    """
    tally = {}
    lowered_terms = (
        term.lower()
        for term_list in df['_search_query_list']
        for term in term_list
    )
    for lowered in lowered_terms:
        tally[lowered] = tally.get(lowered, 0) + 1
    return pd.DataFrame(list(tally.items()), columns=['term', 'count'])

# Tally the search terms attached to checkout items
checkout_item_search_term_counts = search_term_counts(CHECKOUT_ITEMS_DF)

# Show the 20 most frequent checkout-item search terms
checkout_item_search_term_counts.sort_values('count', ascending=False).head(20)

Unnamed: 0,term,count
584,gsa copier paper hammermill,428
585,gsa print paper,428
269,varidesk,403
1884,aa batteries,289
338,7510-01-545-3763,287
577,7510-01-545-3753,262
227,7510015453763,249
4122,pens,240
2737,copy paper,234
1235,ham86700,234


In [95]:
# Tally the search terms attached to parked items
parked_item_search_term_counts = search_term_counts(PARKED_ITEMS_DF)

# Show the 20 most frequent parked-item search terms
parked_item_search_term_counts.sort_values('count', ascending=False).head(20)

Unnamed: 0,term,count
701,aa batteries,326
1599,7510-01-545-3763,320
783,7510-01-545-3753,301
624,scissors,286
35,pens,285
403,aaa batteries,269
394,stapler,261
122,highlighters,258
238,7510015453763,246
778,black,245


In [None]:
# TODO: Instead of counting exact queries, what about segmenting by words or lexemes?
# i.e., finding counts for all terms that are like "pen" - "red pens", "ballpoint pens", "pen", etc.

In [56]:
# TODO: What's the average price of sold items?


In [57]:
# TODO: What's the average price of parked items?
