In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from pandas._testing import assert_frame_equal

# Overview

Choose whatever language you're most comfortable with to solve these problems.

# Exercise

The ACME Inc. tool supply company manages its operations with three CSV files:

1. `customers.csv` keeps customer information:
    * `id` is a numeric customer id
    * `firstname` is the customer's first name
    * `lastname` is the customer's last name
2. `products.csv` keeps product info:
    * `id` is a numeric product id
    * `name` is the human-readable name
    * `cost` is the product cost in euros
3. `orders.csv` keeps order information:
    * `id` is a numeric order id
    * `customer` is the numeric id of the customer who created the order
    * `products` is a space-separated list of product ids ordered by the customer

Manually dealing with those files is hard and error-prone, and they've asked for your help writing some code to make their lives easier.



In [11]:
# Load the three CSV files and give every id column an explicit, descriptive name.
customers = pd.read_csv('interview/customers.csv').rename(columns={'id': 'customer_id'})
products = pd.read_csv('interview/products.csv').rename(columns={'id': 'products_id'})
orders = pd.read_csv('interview/orders.csv').rename(
    columns={
        'id': 'order_id',
        'customer': 'customer_id',
        'products': 'products_id',
    }
)
orders.head()

Unnamed: 0,order_id,customer_id,products_id
0,0,0,1 0 1 0
1,1,22,0 5 0 4 5 3 2 1 1
2,2,57,5 2 4
3,3,20,4 3 0 2 4
4,4,28,5 0 5 0 3 4


### Task 2

The marketing department wants to know which customers are interested in each product; they've asked for a `product_customers.csv` file that, for each product, gives the list of customers who have purchased this product:
* `id` numeric product id
* `customer_ids` a space-separated list of customer ids of the customers who have purchased this product



In [3]:
# Hand-made sample of [product id, customer id] pairs used to pin down the expected behaviour.
input_example = pd.DataFrame(
    [[1, 0], [0, 0], [1, 0], [0, 0], [0, 1]],
    columns=['products_id', 'customer_id'],
).values.tolist()
input_example

[[1, 0], [0, 0], [1, 0], [0, 0], [0, 1]]

In [4]:
# Expected result for the sample input: one row per product, with the
# customers listed as a space-separated string.
output_example = pd.DataFrame(
    [(0, '0 1'), (1, '0')],
    columns=['products_id', 'customer_id'],
)
output_example

Unnamed: 0,products_id,customer_id
0,0,0 1
1,1,0


In [5]:
def convert_to_list_spaces(list_of_commas):
    """Join the items of an iterable into one space-separated string.

    Parameters
    ----------
    list_of_commas : iterable
        Items to join. Each item is converted with ``str`` first, so numeric
        ids can be passed directly as well as strings.

    Returns
    -------
    str
        The items separated by single spaces (``''`` for an empty iterable).
    """
    return ' '.join(map(str, list_of_commas))

def get_customers_by_product_id(products_and_customers):
    """Group customers by the product they purchased.

    Parameters
    ----------
    products_and_customers : iterable of (product, customer) pairs
        One pair per purchased item. A customer may appear several times
        for the same product.

    Returns
    -------
    pandas.DataFrame
        One row per product, sorted by ``products_id``, with columns:

        * ``products_id`` - the product id as given in the input
        * ``customer_id`` - space-separated customer ids, each listed once,
          in order of first appearance
    """
    # A dict keeps insertion order and gives O(1) membership, so it serves as
    # an ordered set: duplicates are ignored and first-seen order is preserved.
    customers_seen = defaultdict(dict)
    for product, customer in products_and_customers:
        customers_seen[product][str(customer)] = None

    rows = [
        (product, ' '.join(customers))
        for product, customers in customers_seen.items()
    ]
    return (
        pd.DataFrame(rows, columns=['products_id', 'customer_id'])
        .sort_values(by=['products_id'])
        .reset_index(drop=True)
    )


In [6]:
# Run the function on the hand-made example and display the result.
output = get_customers_by_product_id(input_example)
output

Unnamed: 0,products_id,customer_id
0,0,0 1
1,1,0


In [7]:
# Check the example result against the expected frame using the public
# pandas testing API rather than the private `pandas._testing` module.
pd.testing.assert_frame_equal(output, output_example)

In [15]:
# Denormalize orders: split each space-separated product list and explode it
# so every purchased item becomes its own [products_id, customer_id] pair.
# explode() repeats customer_id for each item, which avoids building a padded
# intermediate frame and relying on stack() to drop the padding.
orders_denormalize = (
    orders[['products_id', 'customer_id']]
    .assign(products_id=lambda frame: frame['products_id'].str.split(' '))
    .explode('products_id')
    .astype({'products_id': 'int'})
    .values.tolist()
)

In [16]:
# Group the flattened [product, customer] pairs into one row per product.
product_customers = get_customers_by_product_id(orders_denormalize)

In [17]:
# Task 2 asks for the columns `id` and `customer_ids`, so rename on the way
# out; the in-memory frame keeps its working column names.
product_customers.rename(
    columns={'products_id': 'id', 'customer_id': 'customer_ids'}
).to_csv('product_customers.csv', index=False)

In [18]:
# Display the final product -> customers table.
product_customers

Unnamed: 0,products_id,customer_id
0,0,0 22 20 28 40 32 5 45 37 38 6 44 50 24 54 59 1...
1,1,0 22 40 32 45 38 51 6 44 34 3 50 24 15 5 41 47...
2,2,22 57 20 40 5 45 37 51 6 44 54 8 15 21 41 48 4...
3,3,22 20 28 32 5 38 51 34 50 24 54 21 41 47 46 44...
4,4,22 57 20 28 51 24 59 36 8 15 21 5 34 19 41 44 ...
5,5,22 57 28 32 5 37 38 6 44 34 3 50 24 54 59 15 2...
