### libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
def make_bins(df: pd.DataFrame, col, bins: list) -> pd.Series:
    """Label each row of ``df`` with the half-open interval containing ``df[col]``.

    Args:
        df: Frame holding the values to bin.
        col: Name of the column whose values are binned.
        bins: Interval edges, used in the order given. They are not validated;
            presumably they are meant to be sorted ascending. Must contain at
            least one edge (an empty list raises ``IndexError``).

    Returns:
        An object-dtype Series on ``df``'s index. Values below the first edge
        get ``"[-Inf, <first>)"``, values at or above the last edge get
        ``"[<last>,Inf)"``, and values between two consecutive edges get
        ``"[<lo>,<hi>)"``. Rows matching no interval (e.g. NaN) stay missing.
    """
    values = df[col]
    labels = pd.Series(index=df.index, dtype=object)

    # Open-ended bucket below the first edge.
    labels.loc[values < bins[0]] = f"[-Inf, {bins[0]})"

    # One closed-left / open-right bucket per pair of neighbouring edges.
    for lower, upper in zip(bins[:-1], bins[1:]):
        in_bucket = (values >= lower) & (values < upper)
        labels.loc[in_bucket] = f"[{lower},{upper})"

    # Open-ended bucket from the last edge upwards.
    labels.loc[values >= bins[-1]] = f"[{bins[-1]},Inf)"
    return labels

In [3]:
def cumcount_in_group_by_order(
    df: pd.DataFrame, group_cols: list, order_cols: list, na_position: str = "last"
) -> pd.Series:
    """Number the rows of every group 1..n following the ``order_cols`` ordering.

    Args:
        df: Source frame.
        group_cols: Columns defining the groups. Rows whose key is missing
            form their own group (``dropna=False``).
        order_cols: Columns that define the order inside each group.
        na_position: Where missing ``order_cols`` values are placed
            (``"first"`` or ``"last"``), forwarded to ``sort_values``.

    Returns:
        An integer Series holding the 1-based position of each row inside its
        group. It is indexed by the row labels of ``df`` (listed in
        ``order_cols`` order), so assigning it to a column of ``df`` aligns on
        the index. Rows that tie on ``order_cols`` keep their original
        relative order.
    """
    # Sort the whole frame once instead of once per group. A groupby.apply that
    # returns per-group Series prepends one index level per grouping column
    # (and yields a wide DataFrame when there is a single group), which made
    # the result impossible to align back to `df` in those cases.
    ordered = df.sort_values(order_cols, na_position=na_position, kind="stable")

    # cumcount() is zero-based and keeps the row labels of `ordered`.
    return ordered.groupby(group_cols, dropna=False).cumcount() + 1

In [4]:
def value_of_extreme_in_group_by_order(
    df: pd.DataFrame,
    group_cols: list,
    order_cols: list,
    target_col: str,
    order_func: callable = None,
    na_position: str = "last",
) -> pd.Series:
    """Broadcast, to every row of a group, one value picked from that group.

    By default the picked value is ``target_col`` of the group's first row
    after sorting by ``order_cols`` (missing values included, i.e. a NaN in
    that first row is propagated).

    Args:
        df: Source frame.
        group_cols: Columns defining the groups. Rows whose key is missing
            form their own group (``dropna=False``).
        order_cols: Columns defining the order inside each group (used by the
            default picker only).
        target_col: Column the value is taken from (used by the default
            picker only).
        order_func: Optional replacement for the default picker, with
            signature ``func(x: pd.DataFrame) -> pd.Series``. It receives one
            group at a time, without the grouping columns, and must return a
            Series indexed by that group's row labels.
        na_position: Where missing ``order_cols`` values are placed
            (``"first"`` or ``"last"``) by the default picker.

    Returns:
        A Series indexed by the row labels of ``df`` (grouped by key), so
        assigning it to a column of ``df`` aligns on the index.
    """

    def first_row_value(x: pd.DataFrame) -> pd.Series:
        sorted_x = x.sort_values(order_cols, na_position=na_position)
        return pd.Series(
            np.repeat(sorted_x.head(1)[target_col].values, len(x)),
            index=sorted_x.index,
        )

    if order_func is None:
        order_func = first_row_value

    # Iterate over the groups explicitly instead of using groupby.apply: apply
    # prepends one index level per grouping column and returns a wide
    # DataFrame when there is only one group, both of which break alignment
    # with `df`. Grouping columns are dropped so the callable sees the same
    # columns it would with include_groups=False.
    pieces = [
        order_func(part.drop(columns=group_cols, errors="ignore"))
        for _, part in df.groupby(group_cols, dropna=False)
    ]

    if not pieces:
        # No groups means an empty frame; pd.concat rejects an empty list.
        return pd.Series(index=df.index, dtype=df[target_col].dtype)

    return pd.concat(pieces)

### data

In [5]:
import seaborn as sns

# Only these four columns are used in the rest of the notebook.
used_cols = ["survived", "who", "age", "fare"]
df = sns.load_dataset("titanic")[used_cols]
print(f"{df.shape = }", df.head().to_string(), sep="\n")

df.shape = (891, 4)
   survived    who   age     fare
0         0    man  22.0   7.2500
1         1  woman  38.0  71.2833
2         1  woman  26.0   7.9250
3         1  woman  35.0  53.1000
4         0    man  35.0   8.0500


### step by step

In [6]:
# Work on a deep copy so the loaded frame `df` is left untouched by the steps below.
df2 = df.copy(deep=True)

In [7]:
# 1-based row number following the frame's current row order.
df2["id"] = range(1, df2.shape[0] + 1)

In [8]:
# Mean age of each `who` group, repeated on every row of that group
# (rows with a missing `who` are kept as their own group).
age_by_who = df2.groupby("who", dropna=False)["age"]
df2["avg_age_in_who"] = age_by_who.transform("mean")

In [9]:
def _rank_by_fare_then_id(group: pd.DataFrame) -> pd.Series:
    """Return 1..n keyed by the group's row labels taken in (fare, id) order."""
    ordered_labels = group.sort_values(["fare", "id"], na_position="last").index
    return pd.Series(range(1, len(group) + 1), index=ordered_labels)


ranks_by_who = df2.groupby(["who"], dropna=False).apply(
    _rank_by_fare_then_id, include_groups=False
)
# Drop the `who` level added by apply so the values align on df2's own index.
df2["cumcount_in_who_by_fare"] = ranks_by_who.reset_index(level=0, drop=True)

In [10]:
def func(x: pd.DataFrame) -> pd.Series:
    """Repeat, for every row of ``x``, the age of its first row in (fare, id) order."""
    by_fare = x.sort_values(["fare", "id"], na_position="last")
    # Length-1 array (empty if the group is empty) holding the first row's age.
    lowest_fare_age = by_fare["age"].iloc[:1].to_numpy()
    return pd.Series(np.repeat(lowest_fare_age, len(x)), index=by_fare.index)


grouped_by_who = df2.groupby(["who"], dropna=False)
age_of_lowest_fare = grouped_by_who.apply(func, include_groups=False)
# Drop the `who` level added by apply so the values align on df2's own index.
df2["age_of_lowest_fare_in_who"] = age_of_lowest_fare.reset_index(level=0, drop=True)

In [11]:
src_col = "fare"
col = "fare_bin"

# (row selector, label) pairs applied in order. Intervals are closed on the
# left and open on the right; rows with a missing fare match none of them.
fare_bin_rules = [
    (lambda x: x[src_col] < 0, "[-Inf, 0)"),
    (lambda x: x[src_col].between(0, 10, inclusive="left"), "[0,10)"),
    (lambda x: x[src_col].between(10, 20, inclusive="left"), "[10,20)"),
    (lambda x: x[src_col].between(20, 30, inclusive="left"), "[20,30)"),
    (lambda x: x[src_col].between(30, 100, inclusive="left"), "[30,100)"),
    (lambda x: x[src_col] >= 100, "[100,Inf)"),
]
for selector, label in fare_bin_rules:
    df2.loc[selector, col] = label

In [12]:
import scipy.stats as stats

np.random.seed(42)

# Add normal noise (loc=2, scale=5) to every fare, then floor the result at 0.
fare_noise = stats.norm.rvs(loc=2, scale=5, size=len(df2.index))
df2["fare_adj"] = (df2["fare"] + fare_noise).clip(lower=0)

In [13]:
# Same edges as the hand-written `fare_bin` rules, applied to the adjusted fare.
df2["fare_adj_bin"] = make_bins(df=df2, col="fare_adj", bins=[0, 10, 20, 30, 100])

In [14]:
# Final column selection and order.
output_cols = [
    "id",
    "who",
    "age",
    "fare",
    "fare_adj",
    "fare_bin",
    "fare_adj_bin",
    "avg_age_in_who",
    "cumcount_in_who_by_fare",
    "age_of_lowest_fare_in_who",
]
# Keep fares below 50 only; the per-group columns above were computed before this filter.
df2 = df2.loc[df2["fare"] < 50, output_cols]

In [15]:
# Shape on the first line, then the first rows as plain text.
print(f"{df2.shape = }", df2.head().to_string(), sep="\n")

df2.shape = (730, 10)
   id    who   age     fare   fare_adj fare_bin fare_adj_bin  avg_age_in_who  cumcount_in_who_by_fare  age_of_lowest_fare_in_who
0   1    man  22.0   7.2500  11.733571   [0,10)      [10,20)       33.173123                       65                       36.0
2   3  woman  26.0   7.9250  13.163443   [0,10)      [10,20)       32.000000                       41                       18.0
4   5    man  35.0   8.0500   8.879233   [0,10)       [0,10)       33.173123                      193                       36.0
5   6    man   NaN   8.4583   9.287615   [0,10)       [0,10)       33.173123                      240                       36.0
7   8  child   2.0  21.0750  26.912174  [20,30)      [20,30)        6.369518                       33                       15.0


### in one go with funcs

In [None]:
bins = [0, 10, 20, 30, 100]
# Same seed as the step-by-step section so `fare_adj` gets identical noise.
np.random.seed(42)

# Rebuild the step-by-step result in a single chain. `assign` evaluates its
# keyword arguments in order, so later lambdas can use columns created by
# earlier ones (`id` for tie-breaking, `fare_adj` for `fare_adj_bin`).
df2_2 = df.assign(
    id=lambda x: range(1, len(x) + 1),
    avg_age_in_who=lambda x: x.groupby("who", dropna=False)["age"].transform("mean"),
    cumcount_in_who_by_fare=lambda x: cumcount_in_group_by_order(
        x, group_cols=["who"], order_cols=["fare", "id"], na_position="last"
    ),
    age_of_lowest_fare_in_who=lambda x: value_of_extreme_in_group_by_order(
        x,
        group_cols=["who"],
        order_cols=["fare", "id"],
        target_col="age",
        # Passed explicitly: None selects the built-in "first row after
        # sorting" picker, and the argument is required by the signature.
        order_func=None,
    ),
    fare_bin=lambda x: make_bins(x, "fare", bins=bins),
    fare_adj=lambda x: np.clip(
        x["fare"] + stats.norm.rvs(loc=2, scale=5, size=len(x.index)),
        a_min=0,
        a_max=None,
    ),
    fare_adj_bin=lambda x: make_bins(x, "fare_adj", bins=bins),
).loc[
    # Filter and column order match the step-by-step section.
    lambda x: x["fare"] < 50,
    [
        "id",
        "who",
        "age",
        "fare",
        "fare_adj",
        "fare_bin",
        "fare_adj_bin",
        "avg_age_in_who",
        "cumcount_in_who_by_fare",
        "age_of_lowest_fare_in_who",
    ],
]
print(f"{df2_2.shape = }")
print(df2_2.head().to_string())

df2_2.shape = (730, 10)
   id    who   age     fare   fare_adj fare_bin fare_adj_bin  avg_age_in_who  cumcount_in_who_by_fare  age_of_lowest_fare_in_who
0   1    man  22.0   7.2500  11.733571   [0,10)      [10,20)       33.173123                       65                       36.0
2   3  woman  26.0   7.9250  13.163443   [0,10)      [10,20)       32.000000                       41                       18.0
4   5    man  35.0   8.0500   8.879233   [0,10)       [0,10)       33.173123                      193                       36.0
5   6    man   NaN   8.4583   9.287615   [0,10)       [0,10)       33.173123                      240                       36.0
7   8  child   2.0  21.0750  26.912174  [20,30)      [20,30)        6.369518                       33                       15.0


In [25]:
# `compare` returns only the differing cells; an empty result means both pipelines agree.
frame_diff = df2.compare(df2_2)
assert frame_diff.empty, "DataFrames are not equal!"

### check results while using aggs

In [17]:
# NOTE: `avg_age_in_who` was computed on all rows, before df2 was restricted to
# fare < 50, while `avg_age` is recomputed here on the filtered rows, so the
# two columns are not expected to match exactly.
df2.groupby(["who"], dropna=False).agg(
    n=("who", "size"),
    avg_age=("age", "mean"),
    max_avg_age_in_who=("avg_age_in_who", "max"),
).assign(
    equal=lambda x: (x["avg_age"] - x["max_avg_age_in_who"]).abs().lt(0.01).to_numpy()
)

Unnamed: 0_level_0,n,avg_age,max_avg_age_in_who,equal
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
child,77,6.256494,6.369518,False
man,466,32.5,33.173123,False
woman,187,30.273973,32.0,False


In [18]:
# Within `child`, the counter should increase along the (fare, id) ordering.
df2.loc[
    df2["who"].eq("child"), ["id", "who", "fare", "cumcount_in_who_by_fare"]
].sort_values(by=["fare", "id"])

Unnamed: 0,id,who,fare,cumcount_in_who_by_fare
875,876,child,7.2250,1
352,353,child,7.2292,2
780,781,child,7.2292,3
14,15,child,7.8542,4
22,23,child,8.0292,5
...,...,...,...,...
43,44,child,41.5792,73
59,60,child,46.9000,74
386,387,child,46.9000,75
480,481,child,46.9000,76


In [19]:
# Within `man`, the counter should increase along the (fare, id) ordering.
df2.loc[
    df2["who"].eq("man"), ["id", "who", "fare", "cumcount_in_who_by_fare"]
].sort_values(by=["fare", "id"])

Unnamed: 0,id,who,fare,cumcount_in_who_by_fare
179,180,man,0.0000,1
263,264,man,0.0000,2
271,272,man,0.0000,3
277,278,man,0.0000,4
302,303,man,0.0000,5
...,...,...,...,...
583,584,man,40.1250,462
685,686,man,41.5792,463
602,603,man,42.4000,464
83,84,man,47.1000,465


In [20]:
# Rows whose bin label differs between the raw and the adjusted fare.
df2.loc[
    df2["fare_bin"].ne(df2["fare_adj_bin"]),
    ["fare", "fare_bin", "fare_adj", "fare_adj_bin"],
]

Unnamed: 0,fare,fare_bin,fare_adj,fare_adj_bin
0,7.2500,"[0,10)",11.733571,"[10,20)"
2,7.9250,"[0,10)",13.163443,"[10,20)"
12,8.0500,"[0,10)",11.259811,"[10,20)"
13,31.2750,"[30,100)",23.708599,"[20,30)"
20,26.0000,"[20,30)",35.328244,"[30,100)"
...,...,...,...,...
877,7.8958,"[0,10)",11.014740,"[10,20)"
880,26.0000,"[20,30)",40.634662,"[30,100)"
884,7.0500,"[0,10)",12.459457,"[10,20)"
885,29.1250,"[20,30)",40.358537,"[30,100)"


In [23]:
# The row ranked first in each `who` group: its own `age` should equal
# `age_of_lowest_fare_in_who`.
_ = df2.loc[
    df2["cumcount_in_who_by_fare"].eq(1),
    [
        "id",
        "who",
        "fare",
        "age",
        "cumcount_in_who_by_fare",
        "age_of_lowest_fare_in_who",
    ],
]
print(_.head().to_string())

      id    who   fare   age  cumcount_in_who_by_fare  age_of_lowest_fare_in_who
179  180    man  0.000  36.0                        1                       36.0
654  655  woman  6.750  18.0                        1                       18.0
875  876  child  7.225  15.0                        1                       15.0
