In [44]:
# autoreload shenanigans: with mode 2, IPython re-imports edited modules before
# each execution, so changes to local helper modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
from plotnine import (
    scale_x_continuous,
    scale_y_continuous,
    scale_x_discrete,
)
import polars as pl

from utils import transform_and_cmp_heights
from utils import plot_bar
from utils import get_percentages_and_concat
from utils import plot_percentage_bars
from utils import get_post_outliers_data
from utils import filter_bad_orders
from utils import pick_one_revision

In [46]:
# Display settings: string cells are shown up to 50 characters, tables up to 10 rows.
# Both options use the prefix-less spelling (`tbl_rows` resolves to `set_tbl_rows`).
pl.Config(fmt_str_lengths=50, tbl_rows=10)

<polars.config.Config at 0x7f0ad0277820>

In [47]:
# Quote flag columns the order-level frame is partitioned on further down.
partition_by_cols = [
    "quote_is_admin",
    "quote_is_freemailer",
]

In [48]:
# Load the working frame through the `utils.get_post_outliers_data` helper
# (its implementation is not part of this notebook).
df = get_post_outliers_data()

Data found, loading...


In [49]:
# Apply the `utils.filter_bad_orders` helper. NOTE: `df` is rebound to the helper's
# own output, so re-running this cell feeds the already-filtered frame back in.
df = filter_bad_orders(df)

Height comparison 
Before: 2652877 
After:  2510458 
Diff:  -142419 (-5.37%)


In [50]:
# One row per (order, quote, revision); each flag takes its first value in the group.
revision_keys = ["order_uuid", "quote_uuid", "quote_revision"]
revision_flags = ["quote_is_freemailer", "quote_is_admin"]

df_order_revs = df.group_by(revision_keys).agg(
    [pl.col(flag).first() for flag in revision_flags]
)

In [51]:
# Revision-level plot over the freemailer flag, written to disk as a PNG.
freemailer_count_plot = plot_bar(df_order_revs, x="quote_is_freemailer", y="len")
freemailer_count_plot.save(
    filename="/home/rnd/projects/demand-data-exploration/big_plots/quote_freemailer_count.png",
    width=8,
    height=5,
)



In [52]:
# Revision-level plot over the admin flag, written to disk as a PNG.
admin_count_plot = plot_bar(df_order_revs, x="quote_is_admin", y="len")
admin_count_plot.save(
    filename="/home/rnd/projects/demand-data-exploration/big_plots/quote_is_admin_count.png",
    width=8,
    height=5,
)



In [53]:
# Reduce `df` with the `utils.pick_one_revision` helper at the (order, quote) level.
df_order_level = pick_one_revision(
    df,
    level=["order_uuid", "quote_uuid"],
    debug=False,
)

Height comparison 
Before: 2510458 
After:  653578 
Diff:  -1856880 (-73.97%)
Height comparison 
Before: 653578 
After:  423461 
Diff:  -230117 (-35.21%)


In [54]:
# Same freemailer plot, now on the one-revision-per-order frame.
freemailer_one_rev_plot = plot_bar(df_order_level, x="quote_is_freemailer", y="len")
freemailer_one_rev_plot.save(
    filename="/home/rnd/projects/demand-data-exploration/big_plots/quote_freemailer_count_one_rev.png",
    width=8,
    height=5,
)



In [55]:
# Same admin plot, now on the one-revision-per-order frame.
admin_one_rev_plot = plot_bar(df_order_level, x="quote_is_admin", y="len")
admin_one_rev_plot.save(
    filename="/home/rnd/projects/demand-data-exploration/big_plots/quote_is_admin_count_one_rev.png",
    width=8,
    height=5,
)



In [56]:
# Locked vs non-locked
# A quote counts as "locked" when `quote_finalized_at` is not null.
is_locked = pl.col("quote_finalized_at").is_not_null()

# For every partition column: the order-level frame (with the `locked` flag added)
# split into a dict of sub-frames by that column's value.
dfs = {
    col: df_order_level.with_columns(locked=is_locked).partition_by(
        by=[col], as_dict=True
    )
    for col in partition_by_cols
}

# Locked share per freemailer flag value.
df_locked_quote_type = get_percentages_and_concat(
    dfs["quote_is_freemailer"],
    count_col="locked",
    group_col="quote_is_freemailer",
)

# Locked share per admin flag value.
df_locked_quote_internal = get_percentages_and_concat(
    dfs["quote_is_admin"],
    count_col="locked",
    group_col="quote_is_admin",
)

In [57]:
# Y-axis breaks at 0.0, 0.1, ..., 0.9 (1.0 is not included).
tenth_breaks = [tick / 10 for tick in range(10)]

# Locked share faceted by the freemailer flag, written to disk as a PNG.
locked_freemailer_plot = plot_percentage_bars(
    df_locked_quote_type,
    x="locked",
    y="percentage_locked",
    facet_col="quote_is_freemailer",
    plot_extras=[scale_y_continuous(breaks=tenth_breaks)],
)
locked_freemailer_plot.save(
    filename="/home/rnd/projects/demand-data-exploration/big_plots/locked_perc_freemailer.png",
    width=8,
    height=5,
)



In [58]:
# Y-axis breaks at 0.0, 0.1, ..., 0.9 (1.0 is not included).
tenth_breaks = [tick / 10 for tick in range(10)]

# Locked share faceted by the admin flag, written to disk as a PNG.
locked_admin_plot = plot_percentage_bars(
    df_locked_quote_internal,
    x="locked",
    y="percentage_locked",
    facet_col="quote_is_admin",
    plot_extras=[scale_y_continuous(breaks=tenth_breaks)],
)
locked_admin_plot.save(
    filename="/home/rnd/projects/demand-data-exploration/big_plots/locked_perc_admin.png",
    width=8,
    height=5,
)



In [16]:
# Display the locked / not-locked counts and shares for each `quote_is_admin` value.
df_locked_quote_internal

locked,count,percentage_locked,quote_is_admin
bool,u32,f64,bool
True,84691,0.237576,False
False,271789,0.762424,False
False,21819,0.325749,True
True,45162,0.674251,True


In [17]:
# Drop the partition dict and the two percentage frames; none of them is
# referenced by the cells that follow.
del dfs, df_locked_quote_type, df_locked_quote_internal

In [26]:
# Flag addresses that belong to one of the Hubs domains:
# hubs.com, 3dhubs.com or pthubs.com.
#
# `str.contains` treats the pattern as a regex, so:
#   * the dot is escaped (an unescaped "." matches any character),
#   * the match is anchored on "@" and on the end of the string, so look-alikes
#     such as "x@techhubs.com" or "hubs.com.foo@gmail.com" are not flagged,
#   * one pattern covers all three domains (a bare "hubs.com" substring test
#     already matched the other two, which made the OR-chain redundant).
# Every value matched here was also matched by the previous substring test, so
# this can only remove false positives.
# NOTE(review): assumes `email` holds a bare, lower-case address — confirm.
is_hubs_email = pl.col("email").str.contains(r"@(?:3d|pt)?hubs\.com$")

df_order_level = df_order_level.with_columns(is_hubs_email=is_hubs_email)

In [22]:
# Share of admin quotes whose e-mail is flagged as a Hubs address.
admin_only = df_order_level.filter(pl.col("quote_is_admin"))

admin_quotes = admin_only.height
hubs_admin_quotes = admin_only.filter(is_hubs_email).height

print(f"{hubs_admin_quotes} / {admin_quotes} = {hubs_admin_quotes / admin_quotes:.2f}")

16271 / 66981 = 0.24


In [25]:
# Same ratio as above, restricted to admin quotes that have a `quote_finalized_at` value.
finalized_admin = df_order_level.filter(
    pl.col("quote_is_admin"),
    pl.col("quote_finalized_at").is_not_null(),
)

admin_quotes = finalized_admin.height
hubs_admin_quotes = finalized_admin.filter(is_hubs_email).height

print(f"{hubs_admin_quotes} / {admin_quotes} = {hubs_admin_quotes / admin_quotes:.2f}")

2014 / 45162 = 0.04


In [20]:
# Ten most frequent e-mail providers among admin quotes.
admin_provider_counts = (
    df_order_level.filter(pl.col("quote_is_admin"))
    .get_column("quote_email_provider")
    .value_counts()
    .sort("count", descending=True)
)
admin_provider_counts.head(10)

quote_email_provider,count
str,u32
"""hubs.com""",12421
"""3dhubs.com""",3850
"""gmail.com""",1170
"""protolabs.de""",631
"""protolabs.com""",582
"""fresenius-kabi.com""",527
"""npcitaly.com""",523
"""rapidobject.com""",479
"""ifm.com""",334
"""smart-solutions.co.il""",247


In [42]:
# Hubs vs non-Hubs e-mail counts among finalized admin quotes.
finalized_admin_rows = df_order_level.filter(
    pl.col("quote_is_admin"),
    pl.col("quote_finalized_at").is_not_null(),
)
(
    finalized_admin_rows.get_column("is_hubs_email")
    .value_counts()
    .sort("count", descending=True)
)

is_hubs_email,count
bool,u32
False,43148
True,2014


In [21]:
# Ten most frequent e-mail providers among admin quotes not flagged as Hubs addresses.
non_hubs_admin_provider_counts = (
    df_order_level.filter(~is_hubs_email, pl.col("quote_is_admin"))
    .get_column("quote_email_provider")
    .value_counts()
    .sort("count", descending=True)
)
non_hubs_admin_provider_counts.head(10)

quote_email_provider,count
str,u32
"""gmail.com""",1170
"""protolabs.de""",631
"""protolabs.com""",582
"""fresenius-kabi.com""",527
"""npcitaly.com""",523
"""rapidobject.com""",479
"""ifm.com""",334
"""smart-solutions.co.il""",247
"""prodrive-technologies.com""",240
"""protolabs.it""",226


In [29]:
# Read the prepared order data (Feather/IPC file) and rename `revision` to
# `quote_revision`, the column name used by the frames above.
column_renames = {"revision": "quote_revision"}

df_after_prep = pl.read_ipc(
    "/home/rnd/projects/demand-data-exploration/data/demand_order_data.feather"
)
df_after_prep = df_after_prep.rename(column_renames)



In [38]:
# Reduce the prepared frame with `utils.pick_one_revision` at the (order, quote) level.
df_after_prep_order_level = pick_one_revision(
    df_after_prep,
    debug=False,
    level=["order_uuid", "quote_uuid"],
)

Height comparison 
Before: 614038 
After:  291364 
Diff:  -322674 (-52.55%)
Height comparison 
Before: 291364 
After:  291357 
Diff:  -7 (-0.00%)


In [40]:
# Add the Hubs e-mail flag to the prepared order-level frame
# (`is_hubs_email` is the expression defined in an earlier cell).
df_after_prep_order_level = df_after_prep_order_level.with_columns(
    is_hubs_email=is_hubs_email
)

# Flag distribution among rows where `is_admin` is true.
admin_rows_after_prep = df_after_prep_order_level.filter(pl.col("is_admin"))
admin_rows_after_prep.get_column("is_hubs_email").value_counts()

is_hubs_email,count
bool,u32
,6
False,32561
