## Prepare dimension and fact data for ML parts

### Import libraries

In [26]:
from pathlib import Path
from shutil import get_terminal_size

import pandas as pd

### Define constants and useful functions

In [27]:
# Absolute path of the directory that holds the input CSV files.
DATA_DIR = Path("../data/").resolve()


def read_csv(fn: str, dm: str) -> pd.DataFrame:
    """Read ``<DATA_DIR>/<fn>.csv`` into a DataFrame.

    Args:
        fn: File name without the ``.csv`` extension.
        dm: Field delimiter used in the file.

    Returns:
        The parsed table.
    """
    return pd.read_csv(DATA_DIR / f"{fn}.csv", delimiter=dm)


# Terminal width drives the length of the separator printed between frames.
term_c, term_r = get_terminal_size()
sep: str = "\n" + "+-" * (term_c // 2) + "\n"


def print_dfs(dfs: list[pd.DataFrame]) -> None:
    """Print every item of *dfs*, separated by the ``+-`` ruler line ``sep``."""
    print(*dfs, sep=sep)

## Define a cleanizer function for each dimension that needs preprocessing

In [28]:
def customer_dim_cleanizer() -> pd.DataFrame:
    """Load the customer dimension and one-hot encode its ``Segment`` column.

    Reads the tab-separated ``customer_dim.csv`` from ``DATA_DIR``.

    Returns:
        The customer table with ``Segment`` replaced by boolean indicator
        columns.
    """
    source = DATA_DIR / "customer_dim.csv"
    customers = pd.read_csv(source, sep="\t")
    return pd.get_dummies(customers, columns=["Segment"], dtype=bool)

In [29]:
def geo_dim_cleanizer() -> pd.DataFrame:
    """Load the geo dimension and one-hot encode ``Region`` and ``Country``.

    Reads the tab-separated ``geo_dim.csv`` from ``DATA_DIR``.

    Returns:
        The geo table with both columns replaced by boolean indicator columns.
    """
    categorical = ["Region", "Country"]
    geo = pd.read_csv(DATA_DIR / "geo_dim.csv", sep="\t")
    return pd.get_dummies(geo, columns=categorical, dtype=bool)

In [30]:
def order_dim_cleanizer() -> pd.DataFrame:
    """Load the order dimension and encode its categorical columns.

    Reads the tab-separated ``order_dim.csv`` from ``DATA_DIR``.
    ``Order Priority`` is ordinal-encoded (Low=1 ... Critical=4) and
    ``Ship Mode`` / ``Market`` are one-hot encoded as boolean columns.

    Returns:
        The encoded order table. ``Order Priority`` values that are not one
        of the four known labels are left unchanged.
    """
    df: pd.DataFrame = pd.read_csv(DATA_DIR / "order_dim.csv", delimiter="\t")
    priorities: dict[str, int] = {
        "Low": 1,
        "Medium": 2,
        "High": 3,
        "Critical": 4,
    }
    # One pass over the column instead of one masked assignment per label.
    # Building a fresh Series lets pandas infer an integer dtype when every
    # label is known, rather than writing ints into a string column.
    # Falling back to the value itself keeps unknown labels untouched.
    df["Order Priority"] = df["Order Priority"].map(
        lambda label: priorities.get(label, label)
    )
    df = pd.get_dummies(df, columns=["Ship Mode", "Market"], dtype=bool)
    return df

In [31]:
def product_dim_cleanizer() -> pd.DataFrame:
    """Load the product dimension and one-hot encode its category columns.

    Reads the tab-separated ``product_dim.csv`` from ``DATA_DIR``.

    Returns:
        The product table with ``Sub-Category`` and ``Category`` replaced by
        boolean indicator columns.
    """
    products = pd.read_csv(DATA_DIR / "product_dim.csv", sep="\t")
    encoded = pd.get_dummies(
        products,
        columns=["Sub-Category", "Category"],
        dtype=bool,
    )
    return encoded

In [32]:
# Dimensions with categorical columns are loaded through their cleanizers.
customer_df, geo_df, order_df, product_df = (
    customer_dim_cleanizer(),
    geo_dim_cleanizer(),
    order_dim_cleanizer(),
    product_dim_cleanizer(),
)
# The date dimension and the fact table are read as-is (tab-separated).
date_df, fact_df = (read_csv(table, "\t") for table in ("date_dim", "fact"))

# Order matters: it must line up with the file names used when storing.
dfs: list[pd.DataFrame] = [
    customer_df,
    geo_df,
    order_df,
    product_df,
    date_df,
    fact_df,
]


In [33]:
def store_cleanized_data(dfs: list[pd.DataFrame], filenames: list[str]) -> None:
    """Write each frame to ``<DATA_DIR>/cleanized/<filename>.csv``.

    Frames and file names are paired positionally; if the two lists differ in
    length, the surplus items of the longer one are ignored. The row index is
    not written.

    Args:
        dfs: Frames to store.
        filenames: Target file names without the ``.csv`` extension.
    """
    out_dir = DATA_DIR / "cleanized"
    # to_csv does not create missing directories, so make sure the target
    # exists before the first write.
    out_dir.mkdir(parents=True, exist_ok=True)
    for df, fn in zip(dfs, filenames):
        df.to_csv(out_dir / f"{fn}.csv", index=False)
    

In [34]:
# Show every prepared table, then persist each one under its original name.
print_dfs(dfs)
store_cleanized_data(
    dfs=dfs,
    filenames=[
        "customer_dim",
        "geo_dim",
        "order_dim",
        "product_dim",
        "date_dim",
        "fact",
    ],
)

     Customer ID     Customer Name  Segment_Consumer  Segment_Corporate  \
0       AA-10315        Alex Avila              True              False   
1       AA-10375      Allen Armold              True              False   
2       AA-10480      Andrew Allen              True              False   
3       AA-10645     Anna Andreadi              True              False   
4         AA-315        Alex Avila              True              False   
...          ...               ...               ...                ...   
1584    VM-21685   Valerie Mitchum             False              False   
1585    VP-11730      Victor Preis             False              False   
1586    VP-21730      Victor Preis             False              False   
1587    VT-11700  Valerie Takahito             False              False   
1588    VT-21700  Valerie Takahito             False              False   

      Segment_Home Office  
0                   False  
1                   False  
2              