### libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

In [2]:
def make_bins(df: pd.DataFrame, col, bins: list) -> pd.Series:
    """Label every value of ``df[col]`` with the half-open interval it falls into.

    The edges in ``bins`` split the value range into ``len(bins) + 1``
    intervals: ``[-Inf, bins[0])``, ``[bins[i-1],bins[i])`` for each pair of
    consecutive edges, and ``[bins[-1],Inf)``. A value equal to an edge
    belongs to the interval that starts at that edge.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column to bin.
    col : hashable
        Label of the column to bin.
    bins : list
        Interval edges in ascending order; must contain at least one edge.

    Returns
    -------
    pd.Series
        Object-dtype Series on ``df.index`` with the interval label of each
        row. Rows matched by no interval (e.g. NaN values, for which every
        comparison is False) stay NaN.

    Raises
    ------
    ValueError
        If ``bins`` is empty or not sorted in ascending order.
    """
    edges = list(bins)
    if not edges:
        raise ValueError("bins must contain at least one edge")
    # Unsorted edges would silently yield empty or overwritten intervals.
    if any(lower > upper for lower, upper in zip(edges, edges[1:])):
        raise ValueError(f"bins must be sorted in ascending order, got {edges}")

    values = df[col]
    res = pd.Series(index=df.index, dtype=object)

    # The open-ended lower label keeps its historical "[-Inf, x)" spelling
    # (with a space) so labels stay identical to previously produced ones.
    res.loc[values < edges[0]] = f"[-Inf, {edges[0]})"
    for lower, upper in zip(edges, edges[1:]):
        res.loc[(values >= lower) & (values < upper)] = f"[{lower},{upper})"
    res.loc[values >= edges[-1]] = f"[{edges[-1]},Inf)"
    return res

### data

In [23]:
import seaborn as sns

# Load seaborn's "titanic" example dataset and keep only the columns used below.
df = sns.load_dataset("titanic").loc[:, ["survived", "who", "age", "fare"]]
print(f"{df.shape = }")
print(df.head(5).to_string())

df.shape = (891, 4)
   survived    who   age     fare
0         0    man  22.0   7.2500
1         1  woman  38.0  71.2833
2         1  woman  26.0   7.9250
3         1  woman  35.0  53.1000
4         0    man  35.0   8.0500


In [24]:
# Replicate the frame 10000 times and shuffle the rows with a fixed seed to get
# a large, reproducible test frame.
# NOTE: this rebinds `df`, so re-running the cell multiplies the frame again.
df = (
    pd.concat([df] * 10000, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(f"{df.shape = }")

df.shape = (8910000, 4)


### step by step

In [25]:
# Work on a copy so the step-by-step columns are not added to `df`.
df2 = df.copy(deep=True)

In [26]:
# Sequential 1-based row id; later used as the tie-breaker when sorting by fare.
df2["id"] = range(1, df2.shape[0] + 1)

In [27]:
# Window-style mean: every row receives the mean age of its `who` group.
df2["avg_age_in_who"] = (
    df2["age"].groupby(df2["who"], dropna=False).transform("mean")
)

In [28]:
# Running 1-based row number within each `who` group, ordered by fare with id
# breaking ties. The result is assigned back by index, so df2 keeps its row order.
df2["cumcount_in_who_by_fare"] = (
    df2.sort_values(by=["fare", "id"], na_position="last")
    .groupby("who", dropna=False)
    .cumcount()
    .add(1)
)

In [29]:
# First age within each `who` group when rows are ordered by fare (id breaks ties).
# NOTE: groupby "first" skips NaN, so this is the first non-null age in that
# order, which is not necessarily the age on the lowest-fare row itself.
df2["age_of_lowest_fare_in_who"] = (
    df2.sort_values(by=["fare", "id"], na_position="last")
    .groupby(by="who", dropna=False)["age"]
    .transform("first")
)

In [30]:
src_col = "fare"
col = "fare_bin"

# Hand-written half-open bins [low, high). Rows whose fare matches none of the
# masks (e.g. NaN) are left unset.
df2.loc[lambda x: x[src_col].lt(0), col] = "[-Inf, 0)"
df2.loc[lambda x: x[src_col].ge(0) & x[src_col].lt(10), col] = "[0,10)"
df2.loc[lambda x: x[src_col].ge(10) & x[src_col].lt(20), col] = "[10,20)"
df2.loc[lambda x: x[src_col].ge(20) & x[src_col].lt(30), col] = "[20,30)"
df2.loc[lambda x: x[src_col].ge(30) & x[src_col].lt(100), col] = "[30,100)"
df2.loc[lambda x: x[src_col].ge(100), col] = "[100,Inf)"

In [31]:
import scipy.stats as stats

np.random.seed(42)

# Shift every fare by normal noise (mean 2, standard deviation 5), then floor
# the result at 0 so an adjusted fare is never negative.
df2["fare_adj"] = (
    df2["fare"]
    .add(stats.norm.rvs(loc=2, scale=5, size=len(df2)))
    .clip(lower=0)
)

In [32]:
# Bin the adjusted fare with the same edges as the hand-written fare_bin.
df2["fare_adj_bin"] = make_bins(df=df2, col="fare_adj", bins=[0, 10, 20, 30, 100])

In [33]:
# Keep rows with fare below 50 and put the columns in report order.
# The window columns were computed before this filter, so they still describe
# the unfiltered frame.
df2 = df2.loc[
    df2["fare"].lt(50),
    [
        "id",
        "who",
        "age",
        "fare",
        "fare_adj",
        "fare_bin",
        "fare_adj_bin",
        "avg_age_in_who",
        "cumcount_in_who_by_fare",
        "age_of_lowest_fare_in_who",
    ],
]

In [34]:
# Quick look at the filtered step-by-step result.
print(f"{df2.shape = }")
print(df2.head(5).to_string())

df2.shape = (7300000, 10)
   id    who   age     fare   fare_adj fare_bin fare_adj_bin  avg_age_in_who  cumcount_in_who_by_fare  age_of_lowest_fare_in_who
1   2    man  48.0   7.8542   9.162878   [0,10)       [0,10)       33.173123                  1310001                       36.0
2   3  woman  26.0   7.9250  13.163443   [0,10)      [10,20)       32.000000                   400001                       18.0
4   5    man  30.0  24.0000  24.829233  [20,30)      [20,30)       33.173123                  3730001                       36.0
5   6  woman  28.0   7.8958   8.725115   [0,10)       [0,10)       32.000000                   390001                       18.0
6   7    man  31.0  13.0000  22.896064  [10,20)      [20,30)       33.173123                  2950001                       36.0


### in one go with funcs

In [35]:
bins = [0, 10, 20, 30, 100]
np.random.seed(42)  # same seed as the step-by-step run so fare_adj matches

# Same pipeline as the step-by-step section, written as a single chain.
# Later columns read earlier ones (`id`, `fare_adj`), so the keyword order matters.
df2_2 = df.assign(
    id=lambda x: range(1, x.shape[0] + 1),
    avg_age_in_who=lambda x: (
        x["age"].groupby(x["who"], dropna=False).transform("mean")
    ),
    cumcount_in_who_by_fare=lambda x: (
        x.sort_values(by=["fare", "id"], na_position="last")
        .groupby("who", dropna=False)
        .cumcount()
        .add(1)
    ),
    # NOTE: groupby "first" skips NaN, so this is the first non-null age in fare
    # order, which is not necessarily the age on the lowest-fare row itself.
    age_of_lowest_fare_in_who=lambda x: (
        x.sort_values(by=["fare", "id"], na_position="last")
        .groupby(by="who", dropna=False)["age"]
        .transform("first")
    ),
    fare_bin=lambda x: make_bins(x, "fare", bins=bins),
    fare_adj=lambda x: (
        x["fare"]
        .add(stats.norm.rvs(loc=2, scale=5, size=len(x)))
        .clip(lower=0)
    ),
    fare_adj_bin=lambda x: make_bins(x, "fare_adj", bins=bins),
).loc[
    lambda x: x["fare"].lt(50),
    [
        "id",
        "who",
        "age",
        "fare",
        "fare_adj",
        "fare_bin",
        "fare_adj_bin",
        "avg_age_in_who",
        "cumcount_in_who_by_fare",
        "age_of_lowest_fare_in_who",
    ],
]
print(f"{df2_2.shape = }")
print(df2_2.head(5).to_string())

df2_2.shape = (7300000, 10)
   id    who   age     fare   fare_adj fare_bin fare_adj_bin  avg_age_in_who  cumcount_in_who_by_fare  age_of_lowest_fare_in_who
1   2    man  48.0   7.8542   9.162878   [0,10)       [0,10)       33.173123                  1310001                       36.0
2   3  woman  26.0   7.9250  13.163443   [0,10)      [10,20)       32.000000                   400001                       18.0
4   5    man  30.0  24.0000  24.829233  [20,30)      [20,30)       33.173123                  3730001                       36.0
5   6  woman  28.0   7.8958   8.725115   [0,10)       [0,10)       32.000000                   390001                       18.0
6   7    man  31.0  13.0000  22.896064  [10,20)      [20,30)       33.173123                  2950001                       36.0


In [36]:
# Both pipelines must agree: compare() keeps only the differing cells, so an
# empty result means the step-by-step and one-go frames match.
assert df2.compare(other=df2_2).empty, "DataFrames are not equal!"

### check results while using aggs

In [37]:
# Sanity check: recompute the mean age per `who` on the filtered frame and
# compare it with the windowed value stored on the rows.
# The windowed mean was computed before the fare < 50 filter, so the two
# values can legitimately differ.
df2.groupby(["who"], dropna=False).agg(
    n=("who", "size"),
    avg_age=("age", "mean"),
    max_avg_age_in_who=("avg_age_in_who", "max"),
).assign(
    equal=lambda x: (x["avg_age"] - x["max_avg_age_in_who"]).abs().lt(0.01)
)

Unnamed: 0_level_0,n,avg_age,max_avg_age_in_who,equal
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
child,770000,6.256494,6.369518,False
man,4660000,32.5,33.173123,False
woman,1870000,30.273973,32.0,False


In [40]:
# First rows of the "child" group in fare order, to eyeball the running count.
(
    df2.loc[df2["who"].eq("child"), ["id", "who", "fare", "cumcount_in_who_by_fare"]]
    .sort_values(by=["fare", "id"])
    .head(20)
)

Unnamed: 0,id,who,fare,cumcount_in_who_by_fare
116,117,child,7.225,1
331,332,child,7.225,2
1631,1632,child,7.225,3
2317,2318,child,7.225,4
2673,2674,child,7.225,5
3223,3224,child,7.225,6
6341,6342,child,7.225,7
7972,7973,child,7.225,8
8565,8566,child,7.225,9
10343,10344,child,7.225,10


In [41]:
# First rows of the "man" group in fare order, to eyeball the running count.
(
    df2.loc[df2["who"].eq("man"), ["id", "who", "fare", "cumcount_in_who_by_fare"]]
    .sort_values(by=["fare", "id"])
    .head(20)
)

Unnamed: 0,id,who,fare,cumcount_in_who_by_fare
227,228,man,0.0,1
252,253,man,0.0,2
309,310,man,0.0,3
324,325,man,0.0,4
344,345,man,0.0,5
352,353,man,0.0,6
556,557,man,0.0,7
565,566,man,0.0,8
603,604,man,0.0,9
664,665,man,0.0,10


In [43]:
# Rows whose bin label changed after the fare adjustment.
df2.loc[
    df2["fare_bin"].ne(df2["fare_adj_bin"]),
    ["fare", "fare_bin", "fare_adj", "fare_adj_bin"],
].head(20)

Unnamed: 0,fare,fare_bin,fare_adj,fare_adj_bin
2,7.925,"[0,10)",13.163443,"[10,20)"
6,13.0,"[10,20)",22.896064,"[20,30)"
7,18.0,"[10,20)",23.837174,"[20,30)"
9,7.8958,"[0,10)",12.6086,"[10,20)"
24,30.0708,"[30,100)",29.348886,"[20,30)"
27,9.825,"[0,10)",13.70349,"[10,20)"
31,7.2292,"[0,10)",18.490591,"[10,20)"
34,27.0,"[20,30)",33.112725,"[30,100)"
36,7.925,"[0,10)",10.969318,"[10,20)"
39,7.125,"[0,10)",10.109306,"[10,20)"


In [44]:
# Rows ranked first by fare within each `who` group, to compare their own age
# with the stored age_of_lowest_fare_in_who. The two differ when the first-ranked
# row has a NaN age, because transform("first") skips NaN.
# NOTE: `_` is also IPython's last-output variable; assigning to it shadows that.
_ = df2.loc[
    df2["cumcount_in_who_by_fare"].eq(1),
    [
        "id",
        "who",
        "fare",
        "age",
        "cumcount_in_who_by_fare",
        "age_of_lowest_fare_in_who",
    ],
]
print(_.head(5).to_string())

        id    who   fare   age  cumcount_in_who_by_fare  age_of_lowest_fare_in_who
116    117  child  7.225  15.0                        1                       15.0
227    228    man  0.000   NaN                        1                       36.0
2194  2195  woman  6.750  18.0                        1                       18.0
