In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [2]:
def calculate_signal(
    df,
    col,
    target_col,
    exposure_col=None,
):

    # aggregate
    _ = df.groupby(col, dropna=False, observed=0)
    if exposure_col is not None:
        _ = _.agg(
            count=(target_col, np.size),
            exposure=(exposure_col, "sum"),
            signal=(target_col, "sum"),
        )
        _.rename(columns={"signal": target_col, "exposure": exposure_col}, inplace=True)
    else:
        _ = _.agg(
            count=(target_col, np.size),
            signal=(target_col, "sum"),
        )
        _.rename(
            columns={
                "signal": target_col,
            },
            inplace=True,
        )

    # count based pct & rates
    _["count_pct"] = _["count"].div(_["count"].sum()).mul(100)
    _[f"{target_col}_pct"] = _[f"{target_col}"].div(_[f"{target_col}"].sum()).mul(100)
    _[f"{target_col}_rate"] = _[f"{target_col}"].div(_["count"]).mul(100)

    # exposure based pct & rates
    if exposure_col is not None:
        _[f"{exposure_col}_pct"] = _[exposure_col].div(_[exposure_col].sum()).mul(100)
        _[f"{target_col}_{exposure_col}_rate"] = (
            _[f"{target_col}"].div(_[exposure_col]).mul(100)
        )

    return _

### calcs

In [3]:
df = pd.read_csv("../toydata/titanic.csv")
print(f"{df.shape}")
df.head(1)

(891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False


In [4]:
edges = [0, 20, 40, 100]
df["age_binned"] = pd.cut(df["age"], bins=edges, right=False)
edges = [0, 10, 50, 100, 600]
df["fare_binned"] = pd.cut(df["fare"], bins=edges, right=False)

df["age_fare_binned"] = (
    df["age_binned"].astype(str) + "_" + df["fare_binned"].astype(str)
)

_ = calculate_signal(
    df, target_col="survived", col="age_fare_binned", exposure_col="fare"
)
_["age_binned"] = [i for i, j in _.index.str.split("_")]
_["fare_binned"] = [j for i, j in _.index.str.split("_")]

In [5]:
_.head(5)

Unnamed: 0_level_0,count,fare,survived,count_pct,survived_pct,survived_rate,fare_pct,survived_fare_rate,age_binned,fare_binned
age_fare_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"[0, 20)_[0, 10)",44,340.5668,15,4.938272,4.385965,34.090909,1.186894,4.404422,"[0, 20)","[0, 10)"
"[0, 20)_[10, 50)",101,2457.0833,50,11.335578,14.619883,49.50495,8.563071,2.034933,"[0, 20)","[10, 50)"
"[0, 20)_[100, 600)",11,1836.0208,8,1.234568,2.339181,72.727273,6.398634,0.435725,"[0, 20)","[100, 600)"
"[0, 20)_[50, 100)",8,580.6667,6,0.897868,1.754386,75.0,2.023656,1.033295,"[0, 20)","[50, 100)"
"[20, 40)_[0, 10)",158,1221.4286,29,17.732884,8.479532,18.35443,4.256746,2.374269,"[20, 40)","[0, 10)"


In [13]:
_.pivot(index="age_binned", columns="fare_binned", values="count").round(2).style.map(
    lambda v: "background-color:darkred;" if v < _["count"].sum() / len(_) else None
)

fare_binned,"[0, 10)","[10, 50)","[100, 600)","[50, 100)"
age_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 20)",44,101,11,8
"[20, 40)",158,157,23,49
"[40, 100)",34,82,14,33
,100,54,5,18


In [15]:
_2 = _.pivot(index="age_binned", columns="fare_binned", values="survived_rate").round(2)
_2.style.background_gradient(cmap="RdYlGn")

fare_binned,"[0, 10)","[10, 50)","[100, 600)","[50, 100)"
age_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 20)",34.09,49.5,72.73,75.0
"[20, 40)",18.35,42.68,78.26,73.47
"[40, 100)",8.82,34.15,71.43,60.61
,20.0,38.89,60.0,44.44


In [None]:
_.pivot(index="age_binned", columns="fare_binned", values="fare").round(2).style.map(
    lambda v: "background-color:darkred;" if v < _["fare"].sum() / len(_) else None
)

fare_binned,"[0, 10)","[10, 50)","[100, 600)","[50, 100)"
age_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 20)",340.57,2457.08,1836.02,580.67
"[20, 40)",1221.43,3161.01,4935.43,3426.5
"[40, 100)",249.67,1971.65,2300.93,2290.93
,713.35,1200.68,840.36,1167.67


In [None]:
_2 = _.pivot(
    index="age_binned", columns="fare_binned", values="survived_fare_rate"
).round(2)
_2.style.background_gradient(cmap="RdYlGn")

fare_binned,"[0, 10)","[10, 50)","[100, 600)","[50, 100)"
age_binned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"[0, 20)",4.4,2.03,0.44,1.03
"[20, 40)",2.37,2.12,0.36,1.05
"[40, 100)",1.2,1.42,0.43,0.87
,2.8,1.75,0.36,0.69
