This notebook takes a random sample of owner files and builds each owner's sales by product for the top 1000 products.

In [None]:
import random
import os
from collections import defaultdict

In [None]:
# Build the set of top product names that the sales tables are restricted to.
# A set gives constant-time membership checks when scanning the owner files.
top_prods = set()
top_1000_file = "top_1000_prods.tsv"

with open(top_1000_file) as infile :
    # Skip the first line (treated as a header). The default keeps an empty
    # file from raising StopIteration.
    next(infile, None)
    # Iterate the file object directly instead of readlines() so the whole
    # file is never held in memory at once.
    for row in infile :
        # Remove only the line terminator. A bare strip() would also eat
        # leading whitespace from the product name, and the name must match
        # the raw description field read from the owner files.
        line = row.rstrip("\r\n")
        if not line.strip() :
            continue # tolerate blank lines, e.g. a trailing newline at EOF
        # The product is the first tab-separated field; whatever follows
        # (the sales total) is not needed here.
        prod = line.partition("\t")[0]
        top_prods.add(prod)


Now we'll point our script at our owner files, take a sample of a desired size, and build our sales data. 

In [None]:
# Directory holding the owner files.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to point it at their own copy of the data.
owner_file_location = "C:/Users/jchan/Dropbox/Teaching/CorporatePartners/Wedge/Data/OwnerFiles/"
#owner_file_location = "C:/Users/jchan/Dropbox/Teaching/CorporatePartners/Wedge/Data/OwnerSample/"

# Upper bound on how many owner files to draw.
num_owner_files_to_sample = 20

# os.listdir returns entries in arbitrary order, so sort them: the seeded
# sample below is only reproducible if the population order is stable.
# Files with "NonOwners" in the name are excluded.
all_owner_files = sorted(f for f in os.listdir(owner_file_location)
                         if "NonOwners" not in f)

random.seed(20171211)
# Cap the sample size at the number of files available; random.sample raises
# ValueError when asked for more items than the population holds.
owner_files = random.sample(all_owner_files,
                            min(num_owner_files_to_sample, len(all_owner_files)))

In [None]:
def _new_product_totals():
    """Return an empty product -> sales mapping whose missing keys read as 0.0."""
    return defaultdict(float)


# Two-level accumulator: sales_holder[owner][product] is a running sales total.
# Touching an unseen owner creates a fresh inner mapping automatically.
sales_holder = defaultdict(_new_product_totals)

In [None]:
# Zero-based positions of the fields we need in each tab-delimited owner row.
desc_col = 5    # product description
sales_col = 14  # sales amount for the row
owner_col = 45  # owner identifier

# Accumulate sales per owner and product, keeping only the top products.
for of in owner_files :
    print("On file " + of)
    with open(os.path.join(owner_file_location, of),'r') as infile :
        # Skip the first line (treated as a header); the default keeps an
        # empty file from raising StopIteration.
        next(infile, None)
        # Stream the file line by line; readlines() would load the whole
        # file into memory first.
        for row in infile :
            if not row.strip() :
                continue # tolerate blank lines
            # Remove only the line terminator. A bare strip() also removes
            # leading and trailing tabs, which drops empty first/last fields
            # and silently shifts every column index.
            fields = row.rstrip("\r\n").split('\t')
            owner = fields[owner_col]
            desc = fields[desc_col]
            sales = fields[sales_col]
            if desc in top_prods :
                sales_holder[owner][desc] += float(sales)
       

Now we're going to write out our results. Notice the header trick below. This is a good way to handle files that have a ton of columns. 

In [None]:
# Fix the column order once. Set iteration order can differ from run to run,
# so sort the products to get the same column layout every time.
prod_list = sorted(top_prods)
headers = ['owner'] + prod_list # the first trick: build the header from the data.

# Write one tab-delimited row per owner: the owner id followed by that owner's
# sales for every product in prod_list, rounded to two decimals.
with open("owner_level_top_prod_sales.txt",'w') as ofile :
    ofile.write("\t".join(headers) + "\n")
    for owner, owner_sales in sales_holder.items() :
        oline = [owner]
        for product in prod_list : # the second trick: walk the same list used for the header.
            # Products the owner never bought are written as 0.0. Using
            # .get() instead of indexing gives the same zero without
            # inserting a new entry into the defaultdict for every
            # (owner, product) pair.
            oline.append(round(owner_sales.get(product, 0.0),2))

        # Getting this at the right level of indentation is important:
        # one write per owner, after all products have been appended.
        ofile.write("\t".join([str(item) for item in oline]) + "\n")


And that's it!