In [1]:
from functools import lru_cache

import duckdb
import requests
import pandas as pd

pd.set_option('display.max_rows', None)

In [2]:
con = duckdb.connect("/data/duckdb/github_dw.duckdb", read_only=True)

In [3]:
_ = con.execute(
"""
    INSTALL delta;
    LOAD delta;
    
    INSTALL httpfs;
    LOAD httpfs;
    
    CREATE OR REPLACE PERSISTENT SECRET docker_secret (
    TYPE s3,
    PROVIDER config,
    KEY_ID 'datalake',
    SECRET 'datalake',
    URL_STYLE 'path',
    USE_SSL false,
    ENDPOINT 'minio:9000'
    );
"""
)

In [4]:
df = con.sql("FROM duckdb_secrets()")
df

┌───────────────┬─────────┬──────────┬────────────┬────────────┬─────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│     name      │  type   │ provider │ persistent │  storage   │          scope          │                                                                              secret_string                                                                              │
│    varchar    │ varchar │ varchar  │  boolean   │  varchar   │        varchar[]        │                                                                                 varchar                                                                                 │
├───────────────┼─────────┼──────────┼────────────┼────────────┼─────────────────────────┼───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [5]:
_ = con.execute("USE main_cleansed")

In [6]:
con.sql("SELECT * FROM cleansed_location WHERE 1 = 0")

┌─────────┬──────────────┬──────────┬───────────┬────────────────┬────────────────┬───────┬─────────┐
│   id    │ combined_key │ latitude │ longitude │ province_state │ country_region │ fips  │ admin2  │
│ varchar │   varchar    │  double  │  double   │    varchar     │    varchar     │ int32 │ varchar │
├─────────┴──────────────┴──────────┴───────────┴────────────────┴────────────────┴───────┴─────────┤
│                                              0 rows                                               │
└───────────────────────────────────────────────────────────────────────────────────────────────────┘

In [16]:
location_df = con.sql("""
    SELECT COUNT(id),
        COUNT(CASE WHEN combined_key IS NULL THEN 1 ELSE NULL END) AS combined_key_cnt_null , 
        COUNT(CASE WHEN province_state IS NULL THEN 1 ELSE NULL END) AS province_state_cnt_null, 
        COUNT(CASE WHEN country_region IS NULL THEN 1 ELSE NULL END) AS country_region_cnt_null, 
        COUNT(CASE WHEN latitude IS NULL THEN 1 ELSE NULL END) AS latitude_cnt_null, 
        COUNT(CASE WHEN longitude IS NULL THEN 1 ELSE NULL END) AS longitude_cnt_null, 
        COUNT(CASE WHEN fips IS NULL THEN 1 ELSE NULL END) AS fips_cnt_null, 
        COUNT(CASE WHEN admin2 IS NULL THEN 1 ELSE NULL END) AS admin2_cnt_null
    FROM cleansed_location LIMIT 10
""")
location_df

┌───────────┬───────────────────────┬─────────────────────────┬─────────────────────────┬───────────────────┬────────────────────┬───────────────┬─────────────────┐
│ count(id) │ combined_key_cnt_null │ province_state_cnt_null │ country_region_cnt_null │ latitude_cnt_null │ longitude_cnt_null │ fips_cnt_null │ admin2_cnt_null │
│   int64   │         int64         │          int64          │          int64          │       int64       │       int64        │     int64     │      int64      │
├───────────┼───────────────────────┼─────────────────────────┼─────────────────────────┼───────────────────┼────────────────────┼───────────────┼─────────────────┤
│      5322 │                     0 │                       0 │                       0 │                 0 │                  0 │             0 │               0 │
└───────────┴───────────────────────┴─────────────────────────┴─────────────────────────┴───────────────────┴────────────────────┴───────────────┴─────────────────┘

In [66]:
con.sql("SELECT * FROM covid19_raw.raw_github_csse_daily LIMIT 10")

┌──────────────────────────────────┬───────────┬──────────┬───────────┬────────┬────────────────────┬────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────┬───────┬───────┐
│                id                │ confirmed │  deaths  │ recovered │ active │   incident_rate    │ incidence_rate │ case_fatality_ratio │     last_update     │      load_date      │ year  │ month │  day  │
│             varchar              │  double   │  double  │  double   │ int64  │       double       │     double     │       double        │       varchar       │      timestamp      │ int64 │ int64 │ int64 │
├──────────────────────────────────┼───────────┼──────────┼───────────┼────────┼────────────────────┼────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼───────┼───────┼───────┤
│ 6c576b5de057f82f2e0d9fc2d19ff45d │ 9963697.0 │ 130171.0 │      NULL │   NULL │  22045.63713994783 │           NULL │  1.3064528156566784 │ 2023-01-06 04:21:02 │ 2

In [19]:
con.sql("SELECT MIN(last_update), MAX(last_update) FROM covid19_raw.raw_github_csse_daily")

┌──────────────────┬──────────────────┐
│ min(last_update) │ max(last_update) │
│     varchar      │     varchar      │
├──────────────────┼──────────────────┤
│ 1/22/2020 17:00  │ 4/6/20 9:37      │
└──────────────────┴──────────────────┘

In [17]:
con.sql("SELECT * FROM cleansed_location LIMIT 10")

┌──────────────────────────────────┬──────────────────────────────┬─────────────┬──────────────┬────────────────┬────────────────┬───────┬──────────────┐
│                id                │         combined_key         │  latitude   │  longitude   │ province_state │ country_region │ fips  │    admin2    │
│             varchar              │           varchar            │   double    │    double    │    varchar     │    varchar     │ int32 │   varchar    │
├──────────────────────────────────┼──────────────────────────────┼─────────────┼──────────────┼────────────────┼────────────────┼───────┼──────────────┤
│ 3650296a82594e3c2c3a77bf8fee7570 │ Sichuan, China               │     30.6171 │     102.7103 │ Sichuan        │ China          │ -9999 │ Unassigned   │
│ 272c321f2dac912a6dbe1d47234120fd │ Sonora, Mexico               │     29.2972 │    -110.3309 │ Sonora         │ Mexico         │ -9999 │ Unassigned   │
│ 8c0b1ac6cc1acc0bb413aed5d724394d │ Thailand                     │   15.870

In [17]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT len, COUNT(1)
    FROM t1
    GROUP BY len
    ORDER BY len DESC
    ;
""")

┌───────┬──────────┐
│  len  │ count(1) │
│ int64 │  int64   │
├───────┼──────────┤
│    19 │  4119846 │
│    16 │       37 │
│    15 │      178 │
│    14 │       73 │
│    13 │    14494 │
│    12 │     8201 │
│    11 │       68 │
└───────┴──────────┘

In [20]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 19
    LIMIT 10;

    -- Example Value: 2023-01-06 04:21:02
""")

┌─────────────────┬──────────────────────────────────┬───────────┬──────────┬───────────┬────────┬────────────────────┬────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │  deaths  │ recovered │ active │   incident_rate    │ incidence_rate │ case_fatality_ratio │     last_update     │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │  double  │  double   │ int64  │       double       │     double     │       double        │       varchar       │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼──────────┼───────────┼────────┼────────────────────┼────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼───────┼───────┼───────┤
│              19 │ 6c576b5de057f82f2e0d9fc2d19ff45d │ 9963697.0 │ 130171.0 │      NULL │   

In [21]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 16
    LIMIT 10;

    -- Example Value: 2021-01-15 17:22
""")

┌─────────────────┬──────────────────────────────────┬───────────┬─────────┬───────────┬────────┬────────────────────┬────────────────┬─────────────────────┬──────────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │ deaths  │ recovered │ active │   incident_rate    │ incidence_rate │ case_fatality_ratio │   last_update    │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │ double  │  double   │ int64  │       double       │     double     │       double        │     varchar      │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼─────────┼───────────┼────────┼────────────────────┼────────────────┼─────────────────────┼──────────────────┼─────────────────────┼───────┼───────┼───────┤
│              16 │ 039b389ba69595065fb4c9e39bbc5fee │  885616.0 │  7138.0 │  876140.0 │   2338 │ 1642.96893

In [22]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 15
    LIMIT 10;

    -- Example Value: 1/22/2020 17:00
""")

┌─────────────────┬──────────────────────────────────┬───────────┬────────┬───────────┬────────┬───────────────┬────────────────┬─────────────────────┬─────────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │ deaths │ recovered │ active │ incident_rate │ incidence_rate │ case_fatality_ratio │   last_update   │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │ double │  double   │ int64  │    double     │     double     │       double        │     varchar     │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼────────┼───────────┼────────┼───────────────┼────────────────┼─────────────────────┼─────────────────┼─────────────────────┼───────┼───────┼───────┤
│              15 │ 755f36722d7e163e6ce4438aec2f95d9 │       1.0 │   NULL │      NULL │   NULL │          NULL │           NULL │       

In [23]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 14
    LIMIT 10;

    -- Example Value: 2/1/2020 10:33
""")

┌─────────────────┬──────────────────────────────────┬───────────┬────────┬───────────┬────────┬───────────────┬────────────────┬─────────────────────┬────────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │ deaths │ recovered │ active │ incident_rate │ incidence_rate │ case_fatality_ratio │  last_update   │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │ double │  double   │ int64  │    double     │     double     │       double        │    varchar     │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼────────┼───────────┼────────┼───────────────┼────────────────┼─────────────────────┼────────────────┼─────────────────────┼───────┼───────┼───────┤
│              14 │ ddf6ce64bc193e0c4914afc6785948d0 │      80.0 │    2.0 │       2.0 │   NULL │          NULL │           NULL │           

In [24]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 13
    LIMIT 10;

    -- Example Value: 1/23/20 17:00
""")

┌─────────────────┬──────────────────────────────────┬───────────┬────────┬───────────┬────────┬───────────────┬────────────────┬─────────────────────┬───────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │ deaths │ recovered │ active │ incident_rate │ incidence_rate │ case_fatality_ratio │  last_update  │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │ double │  double   │ int64  │    double     │     double     │       double        │    varchar    │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼────────┼───────────┼────────┼───────────────┼────────────────┼─────────────────────┼───────────────┼─────────────────────┼───────┼───────┼───────┤
│              13 │ 16f331ef53a23243b8a501196c4dbabc │      NULL │   NULL │      NULL │   NULL │          NULL │           NULL │               

In [27]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 12
    LIMIT 10;

    -- Example Value: 4/6/20 23:22
""")

┌─────────────────┬──────────────────────────────────┬───────────┬────────┬───────────┬────────┬───────────────┬────────────────┬─────────────────────┬──────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │ deaths │ recovered │ active │ incident_rate │ incidence_rate │ case_fatality_ratio │ last_update  │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │ double │  double   │ int64  │    double     │     double     │       double        │   varchar    │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼────────┼───────────┼────────┼───────────────┼────────────────┼─────────────────────┼──────────────┼─────────────────────┼───────┼───────┼───────┤
│              12 │ 4c7454ac7ae65061c65886d8caf0c9b3 │       6.0 │    0.0 │       0.0 │      6 │          NULL │           NULL │                NUL

In [28]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
    WHERE len_last_update = 11
    LIMIT 10;

    -- Example Value: 3/8/20 5:19
""")

┌─────────────────┬──────────────────────────────────┬───────────┬────────┬───────────┬────────┬───────────────┬────────────────┬─────────────────────┬─────────────┬─────────────────────┬───────┬───────┬───────┐
│ len_last_update │                id                │ confirmed │ deaths │ recovered │ active │ incident_rate │ incidence_rate │ case_fatality_ratio │ last_update │      load_date      │ year  │ month │  day  │
│      int64      │             varchar              │  double   │ double │  double   │ int64  │    double     │     double     │       double        │   varchar   │      timestamp      │ int64 │ int64 │ int64 │
├─────────────────┼──────────────────────────────────┼───────────┼────────┼───────────┼────────┼───────────────┼────────────────┼─────────────────────┼─────────────┼─────────────────────┼───────┼───────┼───────┤
│              11 │ 56d20d41e1cd9cca2769f9a58b381c8d │     990.0 │    6.0 │     984.0 │      0 │          NULL │           NULL │                NULL │ 

## Data Cleansing

Based on the previous queries, we saw that we have different date formats for the last_update field that we need to standardize. This is an important process especially that we are dealing with timeseries dataset.
The following are some of the examples:

- 2023-01-06 04:21:02 => 19 characters
- 2021-01-15 17:22  => 16 characters
- 1/22/2020 17:00 => 15 characters
- 2/1/2020 10:33 => 14 characters
- 1/23/20 17:00 => 13 characters
- 4/6/20 23:22 => 12 characters
- 3/8/20 5:19 => 11 characters

In [37]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT try_strptime(last_update, '%Y-%m-%d %H:%M:%S') AS c_last_update
    FROM t1
    WHERE len_last_update = 19
    LIMIT 20;
""")

┌─────────────────────┐
│    c_last_update    │
│      timestamp      │
├─────────────────────┤
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
│ 2023-01-06 04:21:02 │
├─────────────────────┤
│       20 rows       │
└─────────────────────┘

In [39]:
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT try_strptime(last_update, '%Y-%m-%d %H:%M') AS c_last_update
    FROM t1
    WHERE len_last_update = 16
    LIMIT 20;
""")


┌─────────────────────┐
│    c_last_update    │
│      timestamp      │
├─────────────────────┤
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
│ 2021-01-15 17:22:00 │
├─────────────────────┤
│       20 rows       │
└─────────────────────┘

In [45]:
"""
2/1/2020 10:33 => 14 characters
1/23/20 17:00 => 13 characters
4/6/20 23:22 => 12 characters
3/8/20 5:19 => 11 characters
"""
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT try_strptime(last_update, '%-m/%-d/%Y %H:%M') AS c_last_update
    FROM t1
    WHERE len_last_update = 15
    LIMIT 20;
""")


┌───────────────┐
│ c_last_update │
│   timestamp   │
├───────────────┤
│    0 rows     │
└───────────────┘

In [48]:
"""
2/1/2020 10:33 => 14 characters
1/23/20 17:00 => 13 characters
4/6/20 23:22 => 12 characters
3/8/20 5:19 => 11 characters
"""
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT try_strptime(last_update, '%-m/%-d/%Y %H:%M') AS c_last_update
    FROM t1
    WHERE len_last_update = 14
    LIMIT 20;
""")


┌─────────────────────┐
│    c_last_update    │
│      timestamp      │
├─────────────────────┤
│ 2020-02-01 10:33:00 │
│ 2020-02-01 15:43:00 │
│ 2020-02-01 15:43:00 │
│ 2020-01-31 08:15:00 │
│ 2020-02-01 18:12:00 │
│ 2020-01-31 08:15:00 │
│ 2020-01-31 08:15:00 │
│ 2020-02-01 19:43:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 19:53:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 18:12:00 │
│ 2020-02-01 19:43:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 11:53:00 │
│ 2020-02-01 14:23:00 │
│ 2020-01-31 08:15:00 │
├─────────────────────┤
│       20 rows       │
└─────────────────────┘

In [58]:
"""
1/23/20 17:00 => 13 characters
4/6/20 23:22 => 12 characters
3/8/20 5:19 => 11 characters
"""
con.sql("""
    WITH t1 AS (
        SELECT LENGTH(last_update) AS len_last_update, *
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT try_strptime(last_update, '%-m/%-d/%-y %-H:%-M') AS c_last_update
    FROM t1
    WHERE len_last_update = 13
    LIMIT 20;
""")

┌─────────────────────┐
│    c_last_update    │
│      timestamp      │
├─────────────────────┤
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
│ 2020-01-23 17:00:00 │
├─────────────────────┤
│       20 rows       │
└─────────────────────┘

In [65]:


con.sql("""
    WITH t1 AS (
        SELECT COALESCE(
            try_strptime(last_update, '%Y-%m-%dT%H:%M:%S'),
            try_strptime(last_update, '%Y-%m-%d %H:%M:%S'),
            try_strptime(last_update, '%Y-%m-%d %H:%M'),
            try_strptime(last_update, '%-m/%-d/%Y %H:%M'),
            try_strptime(last_update, '%-m/%-d/%-y %-H:%-M'),
            try_strptime(last_update, '%-m/%-d/%Y %H:%M')
        ) AS c_last_update,
        last_update
        FROM covid19_raw.raw_github_csse_daily
    )
    SELECT *
    FROM t1
""")

┌───────────────┬─────────────┐
│ c_last_update │ last_update │
│   timestamp   │   varchar   │
├───────────────┴─────────────┤
│           0 rows            │
└─────────────────────────────┘

In [90]:
con.sql("SELECT * FROM covid19_raw.raw_github_csse_daily WHERE active > 0 LIMIT 20")

┌──────────────────────────────────┬───────────┬────────┬───────────┬────────┬────────────────────┬────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────┬───────┬───────┐
│                id                │ confirmed │ deaths │ recovered │ active │   incident_rate    │ incidence_rate │ case_fatality_ratio │     last_update     │      load_date      │ year  │ month │  day  │
│             varchar              │  double   │ double │  double   │ int64  │       double       │     double     │       double        │       varchar       │      timestamp      │ int64 │ int64 │ int64 │
├──────────────────────────────────┼───────────┼────────┼───────────┼────────┼────────────────────┼────────────────┼─────────────────────┼─────────────────────┼─────────────────────┼───────┼───────┼───────┤
│ d044dd74e9765cfd76636341047342ae │       1.0 │    0.0 │       0.0 │      1 │ 0.9461006461867414 │           NULL │                 0.0 │ 2021-11-01 04:22:01 │ 2021-10-31 

In [73]:
con.sql("""
    SELECT COUNT(CASE WHEN confirmed IS NULL THEN 1 ELSE NULL END) AS confirmed_null,
        COUNT(CASE WHEN deaths IS NULL THEN 1 ELSE NULL END) AS deaths_null,
        COUNT(CASE WHEN recovered IS NULL THEN 1 ELSE NULL END) AS recovered_null,
        COUNT(CASE WHEN active IS NULL THEN 1 ELSE NULL END) AS active_null,
        COUNT(CASE WHEN incident_rate IS NULL THEN 1 ELSE NULL END) AS incident_rate_null,
        COUNT(CASE WHEN case_fatality_ratio IS NULL THEN 1 ELSE NULL END) AS case_fatality_ratio_null
    FROM covid19_raw.raw_github_csse_daily 
""")

┌────────────────┬─────────────┬────────────────┬─────────────┬────────────────────┬──────────────────────────┐
│ confirmed_null │ deaths_null │ recovered_null │ active_null │ incident_rate_null │ case_fatality_ratio_null │
│     int64      │    int64    │     int64      │    int64    │       int64        │          int64           │
├────────────────┼─────────────┼────────────────┼─────────────┼────────────────────┼──────────────────────────┤
│             28 │         433 │        2683864 │     2693277 │             945434 │                   272460 │
└────────────────┴─────────────┴────────────────┴─────────────┴────────────────────┴──────────────────────────┘

In [75]:
con.sql("""
    SELECT MIN(confirmed),
           AVG(confirmed),
           MAX(confirmed)
    FROM covid19_raw.raw_github_csse_daily 
""")

┌────────────────┬───────────────────┬────────────────┐
│ min(confirmed) │  avg(confirmed)   │ max(confirmed) │
│     double     │      double       │     double     │
├────────────────┼───────────────────┼────────────────┤
│      -302844.0 │ 70638.85375641855 │     38487384.0 │
└────────────────┴───────────────────┴────────────────┘

In [76]:
con.sql("""
    SELECT COUNT(1)
    FROM covid19_raw.raw_github_csse_daily 
    WHERE confirmed < 0
""")

┌──────────┐
│ count(1) │
│  int64   │
├──────────┤
│        4 │
└──────────┘

In [77]:
con.sql("""
    SELECT MIN(deaths),
           AVG(deaths),
           MAX(deaths)
    FROM covid19_raw.raw_github_csse_daily 
""")

┌─────────────┬────────────────────┬─────────────┐
│ min(deaths) │    avg(deaths)     │ max(deaths) │
│   double    │       double       │   double    │
├─────────────┼────────────────────┼─────────────┤
│      -178.0 │ 1007.1828728022742 │    185641.0 │
└─────────────┴────────────────────┴─────────────┘

In [78]:
con.sql("""
    SELECT COUNT(1)
    FROM covid19_raw.raw_github_csse_daily 
    WHERE deaths < 0
""")

┌──────────┐
│ count(1) │
│  int64   │
├──────────┤
│        5 │
└──────────┘

In [79]:
con.sql("""
    SELECT MIN(recovered),
           AVG(recovered),
           MAX(recovered)
    FROM covid19_raw.raw_github_csse_daily 
""")

┌────────────────┬────────────────────┬────────────────┐
│ min(recovered) │   avg(recovered)   │ max(recovered) │
│     double     │       double       │     double     │
├────────────────┼────────────────────┼────────────────┤
│      -854405.0 │ 16152.045607604488 │      6399531.0 │
└────────────────┴────────────────────┴────────────────┘

In [80]:
con.sql("""
    SELECT COUNT(1)
    FROM covid19_raw.raw_github_csse_daily 
    WHERE recovered < 0
""")

┌──────────┐
│ count(1) │
│  int64   │
├──────────┤
│        3 │
└──────────┘

In [81]:
con.sql("""
    SELECT MIN(active),
           AVG(active),
           MAX(active)
    FROM covid19_raw.raw_github_csse_daily 
""")

┌─────────────┬────────────────────┬─────────────┐
│ min(active) │    avg(active)     │ max(active) │
│    int64    │       double       │    int64    │
├─────────────┼────────────────────┼─────────────┤
│           0 │ 7469.3125108649165 │     5658278 │
└─────────────┴────────────────────┴─────────────┘

In [86]:
con.sql("""
    SELECT MIN(incident_rate),
           AVG(incident_rate),
           MAX(incident_rate),
           MEDIAN(incident_rate)
    FROM covid19_raw.raw_github_csse_daily 
""")


┌────────────────────┬────────────────────┬────────────────────┬───────────────────────┐
│ min(incident_rate) │ avg(incident_rate) │ max(incident_rate) │ median(incident_rate) │
│       double       │       double       │       double       │        double         │
├────────────────────┼────────────────────┼────────────────────┼───────────────────────┤
│                0.0 │  16507.17332360432 │   2164021.75951979 │    14572.143730514888 │
└────────────────────┴────────────────────┴────────────────────┴───────────────────────┘

In [85]:
con.sql("""
    SELECT MIN(case_fatality_ratio),
           AVG(case_fatality_ratio),
           MAX(case_fatality_ratio),
           MEDIAN(case_fatality_ratio)
    FROM covid19_raw.raw_github_csse_daily 
""")


┌──────────────────────────┬──────────────────────────┬──────────────────────────┬─────────────────────────────┐
│ min(case_fatality_ratio) │ avg(case_fatality_ratio) │ max(case_fatality_ratio) │ median(case_fatality_ratio) │
│          double          │          double          │          double          │           double            │
├──────────────────────────┼──────────────────────────┼──────────────────────────┼─────────────────────────────┤
│                      0.0 │        2.705073356417832 │                  29600.0 │          1.4961915125136018 │
└──────────────────────────┴──────────────────────────┴──────────────────────────┴─────────────────────────────┘

In [92]:
con.sql("SELECT file_md5, load_date FROM delta_scan('s3://covid-data-pipeline/covid19/github_csse_daily') WHERE active > 0 ORDER BY load_date DESC LIMIT 20")


┌──────────────────────────────────┬─────────────────────┐
│             file_md5             │      load_date      │
│             varchar              │      timestamp      │
├──────────────────────────────────┼─────────────────────┤
│ 1de450a5c1a871be31451bd262f3fcb6 │ 2021-10-31 00:00:00 │
│ a297ffde7bf15463a5799b5cdf5266c7 │ 2021-10-30 00:00:00 │
│ c1a3b944c9417545a6f7de1c90d12059 │ 2021-10-29 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00 │
│ 0f54805f0d28b0f391a4ac20d34953bd │ 2021-09-22 00:00:00

In [None]:


con.sql("SELECT * FROM covid19_raw.raw_github_csse_daily LIMIT 10")

In [129]:
con.sql("""
SELECT id,
       combined_key,
       province_state,
       country_region,
       confirmed * 100000 / coalesce(incident_rate, incidence_rate) AS population,
       confirmed,
       latitude,
       longitude,
       deaths,
       recovered,
       active,
       incident_rate,
       incidence_rate,
       case_fatality_ratio,
       last_update,
       load_date, 
       file_md5
FROM delta_scan('s3://covid-data-pipeline/covid19/github_csse_daily') 
WHERE combined_key = 'Alberta, Canada'
ORDER BY load_date DESC 
LIMIT 50
""")


┌──────────────────────────────────┬─────────────────┬────────────────┬────────────────┬────────────┬───────────┬──────────┬───────────┬────────┬───────────┬────────┬────────────────────┬────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬──────────────────────────────────┐
│                id                │  combined_key   │ province_state │ country_region │ population │ confirmed │ latitude │ longitude │ deaths │ recovered │ active │   incident_rate    │ incidence_rate │ case_fatality_ratio │     last_update     │      load_date      │             file_md5             │
│             varchar              │     varchar     │    varchar     │    varchar     │   double   │  double   │  double  │  double   │ double │  double   │ int64  │       double       │     double     │       double        │       varchar       │      timestamp      │             varchar              │
├──────────────────────────────────┼─────────────────┼────────────────┼───────────

In [102]:
(1000/500000) * 100000

200.0

In [106]:
200/100000 * 500000

2.0