In [2]:
import duckdb as dd
import polars as pl

### Data Analysis Pipeline

In [22]:
# Load the raw transactions; dtypes are inferred from the first 10,000 rows.
retail = pl.read_csv("Online Retail_csv.csv", infer_schema_length=10000)

# Line total per transaction row (Quantity x UnitPrice), rounded to 2 decimals.
# Uses column expressions instead of indexing Series off the frame.
retail = retail.with_columns(
    (pl.col("Quantity") * pl.col("UnitPrice")).round(2).alias("Total")
)

# Per-StockCode summary: summed and mean line totals, plus the smallest and
# largest single-row Quantity.
# maintain_order=True keeps groups in first-appearance order; without it polars
# returns groups in a non-deterministic order, so the displayed/stored table
# would differ between runs.
# NOTE(review): the recorded output shows negative Min_sales values, so Quantity
# can be negative in this dataset (presumably returns/cancellations -- confirm);
# such rows are included in the sum and mean.
fin_data = retail.group_by("StockCode", maintain_order=True).agg(
    pl.sum("Total").round(2).alias("Total_cost_stock_sold($)"),
    pl.mean("Total").round(2).alias("Average_cost_stock_sales($)"),
    pl.min("Quantity").alias("Min_sales"),
    pl.max("Quantity").alias("Max_sales"),
)
fin_data

StockCode,Total_cost_stock_sold($),Average_cost_stock_sales($),Min_sales,Max_sales
str,f64,f64,i64,i64
"""22591""",1353.36,14.55,-50,48
"""20892""",89.22,14.87,-32,2
"""84875D""",63.75,7.08,1,12
"""22625""",13943.65,24.12,-9,48
"""22545""",435.67,5.66,-1,240
…,…,…,…,…
"""72802C""",2112.63,18.06,-288,378
"""71496B""",7.37,1.84,1,6
"""22090""",22594.56,27.83,-40,160
"""90160A""",216.29,13.52,-7,9


### Storing ingested data in DuckDB

In [23]:
# Open (or create) the DuckDB database stored at the path "fin_data".
# The connection is left open so later cells can keep using `conn`;
# call conn.close() once the notebook is done with it.
conn = dd.connect("fin_data")

# Persist the aggregated frame as table `finance_data`.
# The unqualified `fin_data` in the FROM clause is not a database table: this
# relies on DuckDB's replacement scan picking up the in-scope Python variable
# `fin_data` (the polars DataFrame built earlier in the notebook).
# CREATE OR REPLACE keeps the cell safe to re-run.
conn.execute("CREATE OR REPLACE TABLE finance_data AS SELECT * FROM fin_data")

# Read the stored table back as a pandas DataFrame to check the round trip.
# (Previously conn.table(...) was called and its result discarded; the relation
# is now actually used for the read-back.)
financial_data = conn.table("finance_data").df()
financial_data

Unnamed: 0,StockCode,Total_cost_stock_sold($),Average_cost_stock_sales($),Min_sales,Max_sales
0,22591,1353.36,14.55,-50,48
1,20892,89.22,14.87,-32,2
2,84875D,63.75,7.08,1,12
3,22625,13943.65,24.12,-9,48
4,22545,435.67,5.66,-1,240
...,...,...,...,...,...
4065,72802C,2112.63,18.06,-288,378
4066,71496B,7.37,1.84,1,6
4067,22090,22594.56,27.83,-40,160
4068,90160A,216.29,13.52,-7,9
