### libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

In [78]:
def make_bins(df: pd.DataFrame, col, bins: list) -> pd.Series:

    res = pd.Series(index=df.index, dtype=object)

    for i, bin_edge in enumerate(bins):
        if i == 0:
            res.loc[df[col] < bin_edge] = f"[-Inf, {bin_edge})"
        else:
            res.loc[(df[col] >= bins[i - 1]) & (df[col] < bin_edge)] = (
                f"[{bins[i-1]},{bin_edge})"
            )
    res.loc[df[col] >= bins[-1]] = f"[{bins[-1]},Inf)"
    return res

In [94]:
def cumcount_in_group_by_order(
    df: pd.DataFrame, group_cols: list, order_cols: list, na_position: str = "last"
) -> pd.Series:
    srs = (
        df.groupby(group_cols, dropna=False)
        .apply(
            lambda x: pd.Series(
                range(1, len(x) + 1),
                index=x.sort_values(order_cols, na_position=na_position).index,
            ),
            include_groups=False,
        )
        .reset_index(level=0, drop=True)
    )
    return srs

### data

In [8]:
import seaborn as sns

df = sns.load_dataset("titanic")[["survived", "who", "age", "fare"]]
print(f"{df.shape = }")
print(df.head().to_string())

df.shape = (891, 4)
   survived    who   age     fare
0         0    man  22.0   7.2500
1         1  woman  38.0  71.2833
2         1  woman  26.0   7.9250
3         1  woman  35.0  53.1000
4         0    man  35.0   8.0500


### items

In [82]:
df2 = df.copy()

In [83]:
df2["id"] = range(1, len(df2) + 1)

In [84]:
df2["avg_age_in_who"] = df2.groupby("who", dropna=False)["age"].transform("mean")

In [85]:
df2["cumcount_in_who_by_fare"] = (
    df2.groupby(["who"], dropna=False)
    .apply(
        lambda x: pd.Series(
            range(1, len(x) + 1),
            index=x.sort_values(["fare", "id"], na_position="last").index,
        ),
        include_groups=False,
    )
    .reset_index(level=0, drop=True)
)

In [86]:
src_col = "fare"
col = "fare_bin"

df2.loc[lambda x: x[src_col] < 0, col] = "[-Inf, 0)"
df2.loc[lambda x: (x[src_col] >= 0) & (x[src_col] < 10), col] = "[0,10)"
df2.loc[lambda x: (x[src_col] >= 10) & (x[src_col] < 20), col] = "[10,20)"
df2.loc[lambda x: (x[src_col] >= 20) & (x[src_col] < 30), col] = "[20,30)"
df2.loc[lambda x: (x[src_col] >= 30) & (x[src_col] < 100), col] = "[30,100)"
df2.loc[lambda x: x[src_col] >= 100, col] = "[100,Inf)"

In [90]:
import scipy.stats as stats

df2["fare_adj"] = np.clip(
    df2["fare"] + stats.norm.rvs(loc=0, scale=1, size=len(df2.index)),
    a_min=0,
    a_max=None,
)

In [91]:
df2["fare_adj_bin"] = make_bins(df2, "fare_adj", bins=[0, 10, 20, 30, 100])

In [92]:
df2 = df2.loc[
    lambda x: x["fare"] > 5,
    [
        "id",
        "who",
        "age",
        "fare",
        "fare_adj",
        "fare_bin",
        "fare_adj_bin",
        "avg_age_in_who",
        "cumcount_in_who_by_fare",
    ],
]

### in one go

In [113]:
bins = [0, 10, 20, 30, 100]

df2_2 = df.assign(
    id=lambda x: range(1, len(x) + 1),
    avg_age_in_who=lambda x: x.groupby("who", dropna=False)["age"].transform("mean"),
    cumcount_in_who_by_fare=lambda x: cumcount_in_group_by_order(
        x, group_cols=["who"], order_cols=["fare", "id"], na_position="last"
    ),
    fare_bin=lambda x: make_bins(x, "fare", bins=bins),
    fare_adj=lambda x: np.clip(
        x["fare"] + stats.norm.rvs(loc=0, scale=1, size=len(x.index)),
        a_min=0,
        a_max=None,
    ),
    fare_adj_bin=lambda x: make_bins(x, "fare_adj", bins=bins),
).loc[
    lambda x: x["fare"] > 5,
    [
        "id",
        "who",
        "age",
        "fare",
        "fare_adj",
        "fare_bin",
        "fare_adj_bin",
        "avg_age_in_who",
        "cumcount_in_who_by_fare",
    ],
]
df2_2

Unnamed: 0,id,who,age,fare,fare_adj,fare_bin,fare_adj_bin,avg_age_in_who,cumcount_in_who_by_fare
0,1,man,22.0,7.2500,8.094079,"[0,10)","[0,10)",33.173123,65
1,2,woman,38.0,71.2833,69.478206,"[30,100)","[30,100)",32.000000,212
2,3,woman,26.0,7.9250,8.971495,"[0,10)","[0,10)",32.000000,41
3,4,woman,35.0,53.1000,52.376101,"[30,100)","[30,100)",32.000000,193
4,5,man,35.0,8.0500,6.062056,"[0,10)","[0,10)",33.173123,193
...,...,...,...,...,...,...,...,...,...
886,887,man,27.0,13.0000,13.783107,"[10,20)","[10,20)",33.173123,323
887,888,woman,19.0,30.0000,28.761113,"[30,100)","[20,30)",32.000000,171
888,889,woman,,23.4500,23.632788,"[20,30)","[20,30)",32.000000,137
889,890,man,26.0,30.0000,30.864439,"[30,100)","[30,100)",33.173123,433


In [93]:
print(df2.head().to_string())

   id    who   age     fare   fare_adj  fare_bin fare_adj_bin  avg_age_in_who  cumcount_in_who_by_fare
0   1    man  22.0   7.2500   7.653517    [0,10)       [0,10)       33.173123                       65
1   2  woman  38.0  71.2833  70.589449  [30,100)     [30,100)       32.000000                      212
2   3  woman  26.0   7.9250   7.351425    [0,10)       [0,10)       32.000000                       41
3   4  woman  35.0  53.1000  53.861728  [30,100)     [30,100)       32.000000                      193
4   5    man  35.0   8.0500   9.050229    [0,10)       [0,10)       33.173123                      193


### results

In [117]:
df2.groupby(["who"], dropna=False).agg(
    **{
        "n": ("who", "size"),
        "avg_age": ("age", "mean"),
        "max_avg_age_in_who": ("avg_age_in_who", "max"),
    }
).assign(
    equal = lambda x: np.where(np.abs(x["avg_age"] - x["max_avg_age_in_who"]) < 0.01, True, False)
)

Unnamed: 0_level_0,n,avg_age,max_avg_age_in_who,equal
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
child,83,6.369518,6.369518,True
man,520,33.17203,33.173123,True
woman,271,32.0,32.0,True


In [118]:
df2.loc[lambda x:x['who'] == 'child', ['id','who','fare','cumcount_in_who_by_fare']].sort_values(['fare','id'])

Unnamed: 0,id,who,fare,cumcount_in_who_by_fare
875,876,child,7.2250,1
352,353,child,7.2292,2
780,781,child,7.2292,3
14,15,child,7.8542,4
22,23,child,8.0292,5
...,...,...,...,...
435,436,child,120.0000,79
802,803,child,120.0000,80
297,298,child,151.5500,81
305,306,child,151.5500,82


In [119]:
df2.loc[lambda x:x['fare_bin'] != x['fare_adj_bin'],['fare','fare_bin','fare_adj','fare_adj_bin']]

Unnamed: 0,fare,fare_bin,fare_adj,fare_adj_bin
7,21.075,"[20,30)",19.075083,"[10,20)"
9,30.0708,"[30,100)",29.246012,"[20,30)"
33,10.5,"[10,20)",9.228761,"[0,10)"
78,29.0,"[20,30)",31.60557,"[30,100)"
101,7.8958,"[0,10)",10.010543,"[10,20)"
113,9.825,"[0,10)",10.95541,"[10,20)"
122,30.0708,"[30,100)",28.399486,"[20,30)"
172,11.1333,"[10,20)",9.999703,"[0,10)"
207,18.7875,"[10,20)",20.242437,"[20,30)"
209,31.0,"[30,100)",29.613599,"[20,30)"
