# Mixed Models

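The so-called "Simpson's Paradox" is just what a mixed-effects model describes: the trend within each group can point the opposite way from the trend in the pooled data.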

In [1]:
import altair as alt
import numpy as np
import pandas as pd

from utils import title

# Seed for the notebook's random generator, so the figures are reproducible.
SEED = 0
# Default number of rows generated per group.
SIZE = 500


def make_regression_df(
    rng: np.random.Generator,
    x_center: float,
    y_center: float,
    x_variance: float = 0.4,
    y_variance: float = 0.8,
    size: int = SIZE,
) -> pd.DataFrame:
    """Simulate one cluster of points with a within-cluster slope of 0.5.

    Args:
        rng: Generator the samples are drawn from; it is advanced by two
            ``normal`` draws of ``size`` values each (``x`` first, then noise).
        x_center: Mean of the simulated ``x`` values.
        y_center: Value of the noise-free line at ``x == x_center``.
        x_variance: Spread of ``x``. NOTE: despite the name, this is handed to
            ``rng.normal`` as its ``scale`` argument, i.e. a standard deviation.
        y_variance: Spread of the additive noise on ``y``; likewise used as the
            ``scale`` of ``rng.normal``.
        size: Number of rows to generate.

    Returns:
        A frame with two columns, ``"x"`` and ``"y"``, and ``size`` rows.
    """
    xs = rng.normal(x_center, x_variance, size=size)
    noise = rng.normal(0, y_variance, size=size)

    # Line through (x_center, y_center) with slope 0.5, then perturbed by noise.
    trend = y_center + 0.5 * (xs - x_center)
    ys = trend + noise

    return pd.DataFrame({"x": xs, "y": ys})


def plot_simpsons_paradox(
    df: pd.DataFrame,
    x: str = "x",
    y: str = "y",
    group: str = "group",
    x_title: str = "X Variable",
    y_title: str = "Y Variable",
    group_title: str = "Group",
    chart_title: str = "",
) -> alt.HConcatChart:
    """Build two side-by-side panels contrasting pooled and per-group fits.

    The left panel layers an uncoloured scatter of every row with a dashed
    black regression line fitted to all rows. The right panel layers the same
    points coloured by ``group``, one regression line per group, and the same
    dashed overall line on top.

    Args:
        df: Data to plot; must contain the columns named by ``x``, ``y`` and
            ``group``.
        x: Name of the column encoded (as quantitative) on the horizontal axis.
        y: Name of the column encoded (as quantitative) on the vertical axis.
        group: Name of the column encoded (as nominal) by colour and used to
            split the per-group regressions.
        x_title: Axis title for ``x``.
        y_title: Axis title for ``y``.
        group_title: Legend title for the colour encoding.
        chart_title: Text passed through the ``title`` helper imported from
            ``utils`` and set as the title of the combined chart.

    Returns:
        The horizontally concatenated chart holding both panels.
    """
    colour_by_group = alt.Color(f"{group}:N", title=group_title)

    # Shared canvas: both panels use the same axes and the same panel size.
    canvas = alt.Chart(df).encode(
        x=alt.X(f"{x}:Q", title=x_title),
        y=alt.Y(f"{y}:Q", title=y_title),
    ).properties(width=400, height=300)

    # Pooled view: every row treated as one population.
    pooled_points = canvas.mark_circle(size=30, opacity=0.7)
    pooled_fit = canvas.mark_line(
        strokeWidth=3,
        strokeDash=[5, 5],
        color="black",
    ).transform_regression(x, y)

    # Grouped view: fainter points so the per-group lines stay readable.
    grouped_points = canvas.mark_circle(size=30, opacity=0.3).encode(color=colour_by_group)
    grouped_fits = (
        canvas.mark_line(strokeWidth=2)
        .transform_regression(x, y, groupby=[group])
        .encode(color=colour_by_group)
    )

    pooled_panel = alt.layer(pooled_points, pooled_fit)
    # The pooled line is repeated here so both trends can be compared directly.
    grouped_panel = alt.layer(grouped_points, grouped_fits, pooled_fit)

    combined = alt.hconcat(pooled_panel, grouped_panel)
    return combined.properties(title=title(chart_title))


rng = np.random.default_rng(SEED)

# One entry per simulated group. The group centres move down and to the right
# (x rises while y falls), whereas make_regression_df gives every group a
# positive slope of its own.
group_configs = [
    {"x_center": x_center, "y_center": y_center, "label": label}
    for label, x_center, y_center in (
        ("Group A", 1.0, 4.0),
        ("Group B", 2.0, 3.0),
        ("Group C", 3.0, 2.0),
        ("Group D", 4.0, 1.0),
    )
]

# Simulate each group in order from the shared generator and tag its rows
# with the group label.
groups = [
    make_regression_df(
        rng,
        x_center=cfg["x_center"],
        y_center=cfg["y_center"],
    ).assign(group=cfg["label"])
    for cfg in group_configs
]

# Stack all groups into one frame with a fresh 0..n-1 index.
df = pd.concat(groups, ignore_index=True)


# Last expression of the cell: the notebook renders the returned chart.
plot_simpsons_paradox(df, chart_title="Simpson's Paradox")

In [2]:
# Per-group means of x and y: one row per group.
df_fixed_effects = (
    df.groupby("group", as_index=False)
    .mean()
    .rename(columns={"x": "x_mean", "y": "y_mean"})
)

# Attach each row's group means, subtract them, and keep only the centred
# columns plus the group label.
df_transformed = (
    df.merge(df_fixed_effects, on="group")
    .assign(
        x_transformed=lambda merged: merged["x"] - merged["x_mean"],
        y_transformed=lambda merged: merged["y"] - merged["y_mean"],
    )
    .filter(items=["x_transformed", "y_transformed", "group"])
)

# Same two-panel plot as before, now on the group-centred data.
plot_simpsons_paradox(
    df_transformed,
    x="x_transformed",
    y="y_transformed",
    chart_title="No paradox when accounting for fixed effects",
)