This notebook takes a sample of owners and builds their sales by product for the top 1000 products.

In [None]:
import random
import os
from collections import defaultdict
from google.cloud import bigquery
from google.oauth2 import service_account

Set up the Google BigQuery (GBQ) client: point to your service-account key file and name the project and dataset.

In [None]:
# Authentication and project settings for Google BigQuery (GBQ).
# The key-file location will be different on your machine: either edit the
# two defaults below, or set the GBQ_SERVICE_PATH / GBQ_SERVICE_FILE
# environment variables so the notebook itself does not have to change.
service_path = os.environ.get("GBQ_SERVICE_PATH", "C:\\users\\jchan\\dropbox\\teaching\\")
service_file = os.environ.get("GBQ_SERVICE_FILE", 'UMT-MSBA-7b4265df0ca4.json') # this is your authentication information
gbq_proj_id = 'umt-msba'  # change this to your project_id
gbq_dataset_id = 'wedge_transactions' # and change this to your data set ID

# os.path.join works whether or not `service_path` ends with a separator,
# which plain string concatenation silently required.
credentials = service_account.Credentials.from_service_account_file(
    os.path.join(service_path, service_file)
)
client = bigquery.Client(credentials=credentials, project=gbq_proj_id)

In [None]:
# How many products to keep when reading the product list.
num_prods = 1_000

In [None]:
# Read the first `num_prods` product names from a tab-delimited file whose
# first line is a header. Each data line must hold exactly two fields:
# the product name and its sales total.
prods_file = "product_sales.txt"
top_prods = set()  # a set gives fast membership tests

with open(prods_file) as infile:
    next(infile)  # skip the header line
    for line_no, line in enumerate(infile, start=1):
        prod, _sales_total = line.strip().split("\t")
        top_prods.add(prod)

        # Stop once the requested number of data lines has been read.
        if line_no == num_prods:
            break

# Fails if the file had fewer than `num_prods` rows or repeated a product name.
assert len(top_prods) == num_prods

Build up our GBQ query.

In [None]:
# Two halves of the sales query; the quoted product list is spliced in
# between them. The query sums `total` per card_no and lowercased
# description.
#
# The table path is built from gbq_proj_id / gbq_dataset_id so that changing
# those settings actually changes which tables are queried.
#
# trans_status filter: only NULL, ' ', 'V' and 'R' are accepted. The earlier
# extra tests (trans_status != 'M' / 'C' / 'J') were dropped because
#   * they are redundant for the non-NULL values accepted below, and
#   * for a NULL trans_status they evaluate to NULL, which makes the whole
#     WHERE clause NULL and silently discards the rows that the
#     `trans_status IS NULL` branch is meant to keep.
query_start = f"""
SELECT card_no,
       lower(description) as description,
       ROUND(SUM(total),2) AS sales
FROM `{gbq_proj_id}.{gbq_dataset_id}.transArchive_*`
WHERE department != 0 and
      department != 15 and
     (trans_status IS NULL or
      trans_status = ' ' or
      trans_status = 'V' or
      trans_status = 'R') AND
      lower(description) in (
"""

query_end = ") GROUP BY card_no, description"

In [None]:
# Turn the product names into a comma-separated list of SQL string literals
# and splice it between the two query halves.
#   * Backslashes and single quotes are escaped so that a product name
#     containing an apostrophe cannot terminate the literal early and break
#     (or alter) the query.
#   * The names are sorted so the generated SQL text is identical from run to
#     run; iterating the set directly gives an arbitrary order.
quoted_prods = [
    "'" + str(prod).replace("\\", "\\\\").replace("'", "\\'") + "'"
    for prod in sorted(top_prods)
]

query = query_start + ",".join(quoted_prods) + query_end

Now let's run it.

In [None]:
# Submit the assembled SQL through the BigQuery client.
query_job = client.query(query, location="US")

# Fetch the job's result; as the last expression in the cell, the returned
# object is also echoed as the cell output.
query_job.result()

Now we'll copy our results into a default dictionary with two levels of keys: the owner (`card_no`) first, then the product description.

In [None]:
# Nested mapping prod_data[owner][description] -> sales amount.
# Both levels are defaultdicts, so a missing owner or product reads as 0.0.
prod_data = defaultdict(lambda: defaultdict(float))

for owner, desc, amt in query_job:
    # Accumulate instead of assigning. If the result ever contains more than
    # one row for the same (owner, lowercased description) pair -- e.g. if the
    # GROUP BY groups on the raw description rather than the lowercased alias
    # -- the amounts are summed rather than the last row overwriting the
    # earlier ones. With one row per pair this is identical to assignment.
    prod_data[owner][desc] += amt


Now we're going to write out our results. Notice the header trick below. This is a good way to handle files that have a ton of columns. 

In [None]:
# Row labels and column labels for the output table, each as a sorted *list*.
# Iterating a dict yields its keys, so sorting `prod_data` gives the owners.
owners = sorted(prod_data)

# Every product description seen under any owner, de-duplicated via a set.
products = sorted({desc for purchases in prod_data.values() for desc in purchases})

In [None]:
# The first trick: build the header row from the data instead of typing out
# every column name.
headers = ['owner'] + products

with open("owner_level_top_prod_sales.txt", 'w') as ofile:
    print("\t".join(headers), file=ofile)

    for owner in owners:
        purchases = prod_data[owner]

        # The second trick (it's subtle): `purchases` is a defaultdict(float),
        # so looking up a product this owner never bought yields 0.0 rather
        # than raising KeyError.
        amounts = [round(purchases[product], 2) for product in products]

        # One line per owner: the owner id followed by one amount per product.
        fields = [str(owner)] + [str(amount) for amount in amounts]
        print("\t".join(fields), file=ofile)


And that's it!