[Reference](https://medium.com/h7w/supercharge-your-sql-analysis-with-python-and-duckdb-d28acaa0f697)

# Database Creation

In [1]:
# Import libraries
import polars as pl
import duckdb as db
import plotly.express as px

In [2]:
# Open the DuckDB database file used by every query below
# (DuckDB creates the file if it does not exist yet)
conn = db.connect(database='retail_db.db')

In [3]:
# Define the retail_sales schema; IF NOT EXISTS keeps an already-created table untouched
create_retail_sales = '''
    create table if not exists retail_sales (
        id INT,
        sale_date DATE,
        sale_time TIME,
        customer_id INT,
        gender VARCHAR(10),
        age INT,
        category VARCHAR(35),
        quantity INT,
        price_per_unit FLOAT,
        cogs FLOAT,
        total_sale FLOAT
        )
'''
conn.sql(create_retail_sales)

# Data Ingestion

In [6]:
# Load sales.csv into retail_sales, but only while the table is still empty.
# The table is created with IF NOT EXISTS inside an on-disk database, so an
# unconditional INSERT would append another full copy of the CSV every time
# the notebook is re-run, inflating every count and sum computed below.
# NOTE(review): SELECT * relies on sales.csv having its columns in the same
# order as the table definition — confirm against the file.
existing_rows = conn.sql('select count(*) from retail_sales').fetchone()[0]
if existing_rows == 0:
    conn.sql('''
        INSERT INTO retail_sales
        SELECT * FROM read_csv('sales.csv')
    ''')

# Data Exploration and Cleaning

In [7]:
# Preview 10 rows of the table, leaving out the cogs column
preview_query = 'select * exclude(cogs) from retail_sales limit 10'
conn.sql(preview_query).pl()

In [8]:
# Total number of rows in retail_sales
row_count_query = 'select count(*) as records from retail_sales'
conn.sql(row_count_query).pl()

In [9]:
# Number of distinct customers that appear in the sales data
customer_count_query = 'select count(distinct customer_id) customers from retail_sales'
conn.sql(customer_count_query).pl()

In [10]:
# List the product categories present in the data
categories_query = 'select distinct category from retail_sales'
conn.sql(categories_query).pl()

In [11]:
# List the product categories present in the data
# NOTE(review): this cell runs exactly the same query as the In [10] cell —
# it may have been meant to check something else (or can be dropped); confirm.
categories_query = 'select distinct category from retail_sales'
conn.sql(categories_query).pl()

# Data Analysis

In [12]:
# Every sale whose sale_date is 2023-11-23 (cogs column left out)
sales_on_date_query = '''
    select *
        exclude(cogs)
    from retail_sales
    where sale_date = '2023-11-23'
'''
conn.sql(sales_on_date_query).pl()

In [13]:
# Clothing sales made in November with a quantity of at least 2
# (cogs column left out).
# extract('month' from ...) produces an integer, so it is compared with the
# integer 11 instead of the string '11', which only worked through an
# implicit cast.
# NOTE(review): this matches November of every year present in the data —
# add a year condition if a single year is intended.
conn.sql('''
    select *
        exclude(cogs)
    from retail_sales
    where category = 'Clothing'
        and extract('month' from sale_date) = 11
        and quantity >= 2
''').pl()

In [14]:
# Per-category totals: summed total_sale rounded to 2 decimals and the number
# of rows, sorted with the most-ordered category first
category_sales_query = '''
    select
        category
        , round(sum(total_sale),2) as net_sale
        , count(*) as total_orders
    from retail_sales
    group by 1
    order by total_orders desc
'''
sales = conn.sql(category_sales_query).pl()

In [15]:
# Horizontal bar chart: one bar per category, bar length = net_sale
fig = px.bar(
    sales,
    x="net_sale",
    y="category",
    orientation='h',
    hover_data=['category', 'net_sale'],
)

# Figure size, title, theme and category ordering, gathered in one place
# NOTE(review): the title says 2023, but the query that builds `sales` has no
# year filter — confirm the data only covers 2023.
chart_layout = {
    'width': 850,
    'height': 500,
    'title_text': '<i>Sales by Category during 2023</i>',
    'title_x': 0.2,
    'template': "ggplot2",
    'yaxis': {'categoryorder': 'total ascending'},
}
fig.update_layout(**chart_layout)

fig.show()