# Ecommerce Data Project
Based on the [ecommerce_data_project](https://github.com/tinybirdco/ecommerce_data_project) example.



If you have opened this notebook in Google Colab, click `Copy to Drive` (see above) before running it.

In [1]:
#@title Mount your Google Drive to save and use local files
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

% cd "/content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples"

Mounted at /content/gdrive
/content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples


In [13]:
#@title Install Tinybird CLI, libraries and your token
!pip install tinybird-cli -q
!sudo apt-get install jq

import os
import re

if not os.path.isfile('.tinyb'):
  !tb auth

if not os.path.isdir('datasources'):
  !tb init

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  libjq1 libonig4
The following NEW packages will be installed:
  jq libjq1 libonig4
0 upgraded, 3 newly installed, 0 to remove and 40 not upgraded.
Need to get 276 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libonig4 amd64 6.7.0-1 [119 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libjq1 amd64 1.5+dfsg-2 [111 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 jq amd64 1.5+dfsg-2 [45.6 kB]
Fetched 276 kB in 1s (491 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 3.)
debconf: falling back to frontend: Readline
debconf: unable to initialize fron

In [3]:

#@title Helper function to write to files
def write_text_to_file(filename, text):
  """Write `text` to `filename`, replacing any existing content.

  Args:
    filename: Destination path. Missing parent folders (for example `pipes/`
      or `endpoints/`) are created first, so the helper does not depend on
      the project scaffold being present.
    text: String to write. It is encoded as UTF-8 regardless of the
      platform's default locale.

  Returns:
    None.
  """
  import os  # local import: the helper works even if the setup cell was skipped

  parent = os.path.dirname(filename)
  if parent:  # a bare file name has no folder to create
    os.makedirs(parent, exist_ok=True)
  with open(filename, 'w', encoding='utf-8') as f:
    f.write(text)

# Create Data Sources


## 1. Events Data Source

In [4]:
# Datafile for the `events` Data Source: four fixed columns plus `extra_data`,
# a String holding the rest of each event as JSON.
# The sorting key must name a column of the schema, so it is `date`
# (there is no `timestamp` column).
filename="datasources/events.datasource"
text='''
DESCRIPTION > # Events from users
    this contains all the events produced by kafka, there are 4 fixed columns
    plus an `extra_data` column which contains the rest of the data for that event as JSON

SCHEMA >
    date DateTime,
    product_id String,
    user_id String,
    event String,
    extra_data String

ENGINE MergeTree
ENGINE_SORTING_KEY date
'''

write_text_to_file(filename, text)

In [5]:
!tb datasource generate datasources/events.datasource --force

[92m** Generated datasources/current_events.datasource
** => Create it on the server running: $ tb push datasources/current_events.datasource
** => Append data using: $ tb datasource append current_events datasources/current_events.datasource`
[0m
[92m** => Generated fixture datasources/fixtures/current_events.csv[0m


In [7]:
# Load the first events CSV (events_50M_1.csv) from its public URL into the `events` Data Source.
!tb datasource append events https://storage.googleapis.com/tinybird-assets/datasets/guides/events_50M_1.csv

[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Total rows in current_events: 50000000[0m
[92m** Data appended to Data Source 'current_events' successfully![0m
[0m** Data pushed to current_events[0m


In [8]:
!tb datasource append current_events https://storage.googleapis.com/tinybird-assets/datasets/guides/events_50M_2.csv

[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Total rows in current_events: 100000000[0m
[92m** Data appended to Data Source 'current_events' successfully![0m
[0m** Data pushed to current_events[0m


In [9]:
# Sanity check: print a single row of `events` to confirm the columns were ingested.
!tb sql "SELECT * FROM events LIMIT 1"

------------------------------------------------
[1;32mdate:[0m 2015-11-22 00:00:00
[1;32mproduct_id:[0m 6acf8c3a-1aaa-11eb-ab5e-acde48001122
[1;32muser_id:[0m 396547
[1;32mevent:[0m remove_item_from_cart
[1;32mextra_data:[0m {"city": "Jackson"}
------------------------------------------------


## 2. Products Data Source

In [49]:
# Datafile for `products_join_sku`: a Join-engine table keyed by `sku`, so other
# queries can look up product attributes with joinGet('products_join_sku', ...).
filename="datasources/products_join_sku.datasource"
text='''
SCHEMA >
    sku String,
    color String,
    section_id String,
    title String

# this creates a join table ready to access by sku
# using joinGet('products_join_sku', 'color', sku)

ENGINE "Join"
#ENGINE_JOIN_STRICTNESS "ANY"
#ENGINE_JOIN_TYPE "LEFT"
ENGINE_KEY_COLUMNS "sku"
'''

write_text_to_file(filename, text)

In [50]:
!tb datasource append products_join_sku https://storage.googleapis.com/tinybird-assets/datasets/guides/products_1.csv

[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Appended 1200000 new rows[0m
[92m** Total rows in products_join_sku: None[0m
[92m** Data appended to Data Source 'products_join_sku' successfully![0m
[0m** Data pushed to products_join_sku[0m


In [52]:
# Load the second products CSV (products_2.csv) into `products_join_sku`.
!tb datasource append products_join_sku https://storage.googleapis.com/tinybird-assets/datasets/guides/products_2.csv

[0m** 🥚 starting import process[0m
[92m** 🐥 done[0m
[92m** Appended 1241156 new rows[0m
[92m** Total rows in products_join_sku: None[0m
[92m** Data appended to Data Source 'products_join_sku' successfully![0m
[0m** Data pushed to products_join_sku[0m


In [54]:
# Count the rows now stored in `products_join_sku` after both appends.
!tb sql "SELECT count() FROM products_join_sku"

-----------
| [1;32mcount()[0m |
-----------
| 2441156 |
-----------


In [53]:
# Sanity check: print a single product row to confirm the column layout.
!tb sql "SELECT * FROM products_join_sku LIMIT 1"

-----------------------------------------
[1;32msku:[0m 5c9ed212-1aaa-11eb-b04a-acde48001122
[1;32mcolor:[0m burlywood4
[1;32msection_id:[0m 8
[1;32mtitle:[0m Famille Nombreuse/Mlah
-----------------------------------------


## 3. Top Products View Data Source

In [25]:
# Datafile for `top_products_view`, the target of the materialized pipe defined later.
# Each row is keyed by `date` and stores aggregate *states* (a topK(10) over strings and
# a Float64 sum), so readers must finalise them with the matching -Merge functions.
filename="datasources/top_products_view.datasource"
text='''
SCHEMA >
    date Date,
    top_10 AggregateFunction(topK(10), String),
    total_sales AggregateFunction(sum, Float64)

ENGINE AggregatingMergeTree
ENGINE_SORTING_KEY date
'''

write_text_to_file(filename, text)

In [33]:
# Create the `top_products_view` Data Source on the server from its datafile.
!tb push datasources/top_products_view.datasource

[0m** Processing datasources/top_products_view.datasource[0m
[0m** Building dependencies[0m
[0m** Running top_products_view [0m
[92m** 'top_products_view' created[0m
[0m** Not pushing fixtures[0m


# Create Pipes

## Top Product Per Day Pipe

In [44]:
# Pipe with two nodes:
#   1. `only_buy_events` keeps the 'buy' events and extracts `price` from the JSON in `extra_data`.
#   2. `top_per_day` groups by day and emits topK/sum aggregate *states*; it is declared
#      `TYPE materialized` and writes into the `top_products_view` Data Source.
filename="pipes/top_product_per_day.pipe"
text='''
NODE only_buy_events
DESCRIPTION >
    filters all the buy events

SQL >
    SELECT
        toDate(date) date,
        product_id,
        JSONExtractFloat(extra_data, 'price') as price
    FROM events
    where event = 'buy'


NODE top_per_day
SQL >
   SELECT date,
          topKState(10)(product_id) top_10,
          sumState(price) total_sales
   from only_buy_events
   group by date

TYPE materialized
DATASOURCE top_products_view
'''

write_text_to_file(filename, text)

In [45]:
# Push the materialized pipe. `--force` overwrites an existing pipe of the same name and
# `--populate` starts a populate job for the target Data Source (see the job URL in the output).
!tb push 'pipes/top_product_per_day.pipe' --force --populate

[0m** Processing pipes/top_product_per_day.pipe[0m
   - /content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples
   - /content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples/datasources
   - /content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples/endpoints[0m
[0m** Building dependencies[0m
[0m** Running top_product_per_day [0m
[0m** Materialized node 'top_per_day' using the Data Source 'top_products_view'[0m
[0m** Populating job url https://api.tinybird.co/v0/jobs/dc201541-4539-44d4-a5fd-5bdbbaed3254[0m
[92m** 'top_product_per_day' created[0m
[0m** Not pushing fixtures[0m


In [46]:
# Read the materialized view: the stored states are finalised with topKMerge / sumMerge,
# grouped by day. The trailing backslashes continue the shell command across lines.
!tb sql "SELECT date, topKMerge(top_10), sumMerge(total_sales) \
FROM top_products_view \
GROUP By date LIMIT 3"

--------------------------------------------------------------------------------
[1;32mdate:[0m 2015-11-22
[1;32mtopKMerge(top_10):[0m ['613b7d5c-1aaa-11eb-be25-acde48001122', '6aa281c6-1aaa-11eb-8191-acde48001122', '6abcda7e-1aaa-11eb-8beb-acde48001122', '5e67cf9a-1aaa-11eb-9ea9-acde48001122', '6ac08980-1aaa-11eb-b57a-acde48001122', '683feaa2-1aaa-11eb-a2ce-acde48001122', '62ff8476-1aaa-11eb-95ad-acde48001122', '6b0fc8e2-1aaa-11eb-8741-acde48001122', '6bfe9152-1aaa-11eb-bdae-acde48001122', '67acde2e-1aaa-11eb-aa6a-acde48001122']
[1;32msumMerge(total_sales):[0m 1338311.1600000334
--------------------------------------------------------------------------------
[1;32mdate:[0m 2015-11-23
[1;32mtopKMerge(top_10):[0m ['637cf622-1aaa-11eb-b10e-acde48001122', '6b926d1a-1aaa-11eb-b29a-acde48001122', '6beb5d1c-1aaa-11eb-9dac-acde48001122', '67be5c9e-1aaa-11eb-8653-acde48001122', '6bc99b50-1aaa-11eb-a9f3-acde48001122', '6688d28c-1aaa-11eb-aef3-acde48001122', '68947c3e-1aaa-11eb-b964-ac

# Create Endpoints

In [60]:
# Endpoint pipe `sales`: daily sales total for the products of one color.
#   - `only_buy_events` keeps 'buy' events, looks up each product's color in the
#     `products_join_sku` join table and extracts `price` from `extra_data`.
#   - `endpoint` is a template node (leading `%`); the color is the request parameter
#     `color`, defaulting to 'dark green', so calls without the parameter return
#     exactly what the previously hard-coded filter returned.
filename="endpoints/sales.pipe"
text='''
DESCRIPTION >
    return sales for a product with color filter

NODE only_buy_events
SQL >
    SELECT
        toDate(date) date,
        product_id,
        joinGet('products_join_sku', 'color', product_id) as color,
        JSONExtractFloat(extra_data, 'price') as price
    FROM events
    WHERE event = 'buy'

NODE endpoint
DESCRIPTION >
    return sales for a product with color filter
SQL >
    %
    select date, sum(price) total_sales
    from only_buy_events
    where color = {{String(color, 'dark green')}}
    group by date
'''

write_text_to_file(filename, text)

In [61]:
# Publish the `sales` endpoint; `--force` overwrites an existing pipe of the same name.
# NOTE(review): `--populate` looks unnecessary here because this pipe has no materialized node — confirm.
!tb push 'endpoints/sales.pipe' --force --populate

[0m** Processing endpoints/sales.pipe[0m
   - /content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples
   - /content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples/datasources
   - /content/gdrive/My Drive/Colab Notebooks/Tinybird/tb_examples/endpoints[0m
[0m** Building dependencies[0m
[0m** Running sales [0m
[92m** => Test endpoint at https://api.tinybird.co/v0/pipes/sales.json[0m
[92m** 'sales' created[0m
[0m** Not pushing fixtures[0m


In [5]:
# Query the published `sales` pipe by name to preview the first rows of its result.
!tb sql "SELECT * FROM sales LIMIT 10"

-----------------------------------
| [1;32mdate[0m       |        [1;32mtotal_sales[0m |
-----------------------------------
| 2015-11-22 | 1180.0000000000002 |
| 2015-11-23 |            1510.16 |
| 2015-11-24 |            1947.52 |
| 2015-11-25 | 1193.6600000000003 |
| 2015-11-26 | 1823.5199999999995 |
| 2015-11-27 |            1056.41 |
| 2015-11-28 | 1474.8500000000006 |
| 2015-11-29 |             1718.7 |
| 2015-11-30 | 1540.1900000000003 |
| 2015-12-01 | 1724.2999999999997 |
-----------------------------------


In [18]:
# Endpoint pipe `top_products_params`: top 10 products per day between two dates.
# It reads the pre-aggregated states from the `top_products_view` Data Source (the
# target of the materialized pipe) and finalises them with topKMerge.
# `start` and `end` are request parameters parsed as dates; neither has a default.
filename="endpoints/top_products_params.pipe"
text='''
NODE endpoint
DESCRIPTION >
    returns top 10 products given start and end dates
SQL >
    %
    select
        date,
        topKMerge(10)(top_10) as top_10
    from top_products_view
    where date between {{Date(start)}} and {{Date(end)}}
    group by date
'''

write_text_to_file(filename, text)

In [19]:
# Publish the `top_products_params` endpoint; `--force` overwrites an existing pipe of the same name.
# NOTE(review): `--populate` looks unnecessary here because this pipe has no materialized node — confirm.
!tb push 'endpoints/top_products_params.pipe' --force --populate

[0m** Processing endpoints/top_products_params.pipe[0m
[0m** Building dependencies[0m
[0m** Running top_products_params [0m
[92m** => Test endpoint at https://api.tinybird.co/v0/pipes/top_products_params.json[0m
[92m** 'top_products_params' created[0m
[0m** Not pushing fixtures[0m


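Example request (the dates fall inside the range shown in the outputs above; add your `token` parameter):

https://api.tinybird.co/v0/pipes/top_products_params.json?start=2015-11-22&end=2015-11-26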
