In [1]:
import pandas as pd
import duckdb

from src.constants import PRODUCT_DETAILS_COLUMNS, PRODUCT_DETAILS_SCHEMA, GERMAN_PYTHON_ENCODING, PICK_DATA_COLUMNS

In [2]:
# Exploratory load of product_data.csv without an explicit dtype mapping, so
# column types are inferred by the parser.
# header=None: the first line of the file is read as data; the column labels
# are taken from PRODUCT_DETAILS_COLUMNS instead.
temp_df = pd.read_csv(
    "../data/product_data.csv",
    names=PRODUCT_DETAILS_COLUMNS,
    header=None,
    encoding=GERMAN_PYTHON_ENCODING,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
# Bare expression: the notebook renders a truncated view of the frame.
temp_df

Unnamed: 0,product_id,description,product_group
0,000052,PUNCH II MSW1 1500mm PUN,35_Leuchten
1,1036628,H05VV-F2X1 5WS 50M,16_Sonderverkäufe
2,1052053,H07RN-F3G1 100M,16_Sonderverkäufe
3,110109,SIEM DELTA Doppelta 2S 5TD2111,32_Schalter_Steckvorrichtg
4,110125,SIEM PLUS Wip Univ ews 5TG7581,32_Schalter_Steckvorrichtg
...,...,...,...
2199588,Z53390,ASJ MD300EB NH-Lasttrennlei,34_Verteiler_Schränke
2199589,Z53392,ASJ MD31B NH-Lasttrennleist,34_Verteiler_Schränke
2199590,Z53394,ASJ MD31HB NH-Lasttrennleis,34_Verteiler_Schränke
2199591,Z50507,ABB LLEG124X560MM24 LED LLEG1,37_Leuchtmittel


In [3]:
# Number of distinct values in each column of the raw product frame.
temp_df.nunique()

product_id       2199593
description      1422888
product_group         18
dtype: int64

In [2]:
# Typed load of product_data.csv: same file and parser settings as the raw
# load, but with the per-column dtypes given by PRODUCT_DETAILS_SCHEMA.
# header=None + names=...: the file's first line is data, labels are supplied.
product_details_df = pd.read_csv(
    "../data/product_data.csv",
    names=PRODUCT_DETAILS_COLUMNS,
    header=None,
    dtype=PRODUCT_DETAILS_SCHEMA,
    encoding=GERMAN_PYTHON_ENCODING,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
# Preview only the first rows rather than the whole frame.
product_details_df.head()

Unnamed: 0,product_id,description,product_group
0,52,PUNCH II MSW1 1500mm PUN,35_Leuchten
1,1036628,H05VV-F2X1 5WS 50M,16_Sonderverkäufe
2,1052053,H07RN-F3G1 100M,16_Sonderverkäufe
3,110109,SIEM DELTA Doppelta 2S 5TD2111,32_Schalter_Steckvorrichtg
4,110125,SIEM PLUS Wip Univ ews 5TG7581,32_Schalter_Steckvorrichtg


In [10]:
# Row count per product group.
# DuckDB resolves `product_details_df` in the SQL text to the pandas DataFrame
# of that name in the Python scope.
# GROUP BY alone does not guarantee any row order, so the result is sorted
# explicitly by the first select column (product_group) to keep the table
# stable across re-runs.
duckdb.sql(
    "select product_group, count(1) from product_details_df GROUP BY 1 ORDER BY 1"
).to_df()

Unnamed: 0,product_group,count(1)
0,16_Sonderverkäufe,5772
1,18_Haustechnik_Hausgeräte,38959
2,19_Werkzeug,79219
3,2000_Alka_Bürobedarf,78177
4,20_C-Artikel,279
5,31_Install.-Befestigungs-Mat.,337184
6,32_Schalter_Steckvorrichtg,105507
7,33_Schaltgeräte,557163
8,34_Verteiler_Schränke,189348
9,35_Leuchten,446854


In [7]:
# All rows of product_details_df whose product_group is '35_Leuchten',
# converted to a pandas DataFrame for display.
# NOTE(review): this materialises every matching row (446k+ in the recorded
# output); consider a LIMIT or .head() if only a preview is needed.
duckdb.sql(
    "select * from product_details_df where product_group = '35_Leuchten'"
).to_df()

Unnamed: 0,product_id,description,product_group
0,000052,PUNCH II MSW1 1500mm PUN,35_Leuchten
1,000073,CW4OEE-830M-L150,35_Leuchten
2,000213,.WD/D-L.LED/23W-3000K 295x29,35_Leuchten
3,000288,.BUSCH 553-325-87,35_Leuchten
4,000289,.BUSCH 553-330-87,35_Leuchten
...,...,...,...
446849,X96828,EASYTEC® II Teleskop Abhäng,35_Leuchten
446850,X97066,SMART LIGHT MOVE FUNKSENSOR,35_Leuchten
446851,X97085,Wandhalterung für Fernbedie,35_Leuchten
446852,Z33777,hokal-HLHX/1500 LED 4200 84,35_Leuchten


In [7]:
# Missing-value count per column of the typed product frame.
product_details_df.isna().sum()

product_id       0
description      0
product_group    0
dtype: int64

In [8]:
# Exploratory load of pick_data.csv with inferred dtypes.
# NOTE(review): this rebinds `temp_df`, which earlier in the notebook held the
# product data; any product-data cell re-run after this one will silently
# operate on the pick data instead. A dedicated name would avoid that.
# header=None + names=...: the file's first line is data, labels are supplied.
temp_df = pd.read_csv(
    "../data/pick_data.csv",
    names=PICK_DATA_COLUMNS,
    header=None,
    encoding=GERMAN_PYTHON_ENCODING,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
# Bare expression: the notebook renders a truncated view of the frame.
temp_df

Unnamed: 0,product_id,warehouse_section,origin,order_number,position_in_order,pick_volume,quantity_unit,date
0,000002,SHL,48,07055448,1,29,St,2017-06-30 11:15:24
1,000002,SHL,48,07055448,1,30,St,2017-06-30 11:22:35
2,000002,SHL,48,07055448,1,30,St,2017-06-30 12:04:50
3,000002,SHL,48,07055448,1,20,St,2017-06-30 12:04:51
4,000002,SHL,48,07055448,1,30,St,2017-06-30 12:05:02
...,...,...,...,...,...,...,...,...
33888985,189976,Kabellager,48,06108542,1,30,Mt,2016-12-30 16:29:35
33888986,V68755,SHL,48,06108742,1,1,St,2016-12-30 16:33:44
33888987,260573,SHL,48,06108743,2,1,St,2016-12-30 16:34:04
33888988,451203,SHL,48,06108743,1,1,St,2016-12-30 16:34:04


In [9]:
# Number of distinct values in each column of the raw pick frame
# (`temp_df` refers to the pick_data.csv load at this point).
temp_df.nunique()

product_id              97317
warehouse_section           5
origin                      2
order_number          6928668
position_in_order         340
pick_volume              1690
quantity_unit               9
date                 22874561
dtype: int64

In [10]:
# Load pick_data.csv, then cast the low-cardinality columns to pandas
# categoricals.
pick_data_df = pd.read_csv(
    "../data/pick_data.csv",
    names=PICK_DATA_COLUMNS,
    header=None,
    encoding=GERMAN_PYTHON_ENCODING,
    engine="pyarrow",
    dtype_backend="pyarrow",
)
# NOTE(review): position_in_order has 340 distinct values in the recorded
# nunique() output and looks numeric; as a category it is left out of the
# recorded describe() summary. Confirm that a categorical is really intended.
pick_data_df = pick_data_df.astype(
    dict.fromkeys(
        ["warehouse_section", "origin", "quantity_unit", "position_in_order"],
        "category",
    )
)
pick_data_df.head()

Unnamed: 0,product_id,warehouse_section,origin,order_number,position_in_order,pick_volume,quantity_unit,date
0,2,SHL,48,7055448,1,29,St,2017-06-30 11:15:24
1,2,SHL,48,7055448,1,30,St,2017-06-30 11:22:35
2,2,SHL,48,7055448,1,30,St,2017-06-30 12:04:50
3,2,SHL,48,7055448,1,20,St,2017-06-30 12:04:51
4,2,SHL,48,7055448,1,30,St,2017-06-30 12:05:02


In [11]:
# Distinct values per column of the typed pick frame; the recorded counts are
# identical to those of the raw load, i.e. the category cast lost no values.
pick_data_df.nunique()

product_id              97317
warehouse_section           5
origin                      2
order_number          6928668
position_in_order         340
pick_volume              1690
quantity_unit               9
date                 22874561
dtype: int64

In [12]:
# Summary statistics; the recorded output covers pick_volume and date only.
# NOTE(review): the recorded minimum pick_volume is negative (-2000) —
# presumably corrections or returns; confirm before aggregating volumes.
pick_data_df.describe()

Unnamed: 0,pick_volume,date
count,33888990.0,33888990
mean,61.585141,2016-06-23 20:14:35.986060288
min,-2000.0,2011-06-23 00:00:01
25%,1.0,2014-04-22 12:26:30.500000
50%,5.0,2016-10-17 05:09:38
75%,23.0,2018-09-25 15:00:33.750000128
max,200000.0,2020-07-14 11:42:01
std,365.728643,


In [13]:
# Missing-value count per column of the typed pick frame.
pick_data_df.isna().sum()

product_id           0
warehouse_section    0
origin               0
order_number         0
position_in_order    0
pick_volume          0
quantity_unit        0
date                 0
dtype: int64

In [14]:
# Resulting column dtypes: the four cast columns are pandas categoricals, the
# remaining columns keep their pyarrow-backed types from the CSV reader.
pick_data_df.dtypes

product_id                  string[pyarrow]
warehouse_section                  category
origin                             category
order_number                string[pyarrow]
position_in_order                  category
pick_volume                  int64[pyarrow]
quantity_unit                      category
date                 timestamp[ns][pyarrow]
dtype: object