## ARPU & ARPPU

In [12]:
from IPython.display import display
import matplotlib.pyplot as plt

In [47]:
from scipy.stats import permutation_test
import pandas as pd
import numpy as np

# Significance threshold: a p-value below it is reported as "red"/"green",
# anything else as "gray".
alpha = 0.05

# Ids searched for in the "testids" column to select the two samples.
exp_testid = 32412
control_testid = 32413


def revenues_by_user(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse event rows into one row per user with the summed revenue.

    Returns a frame with the columns ``userid`` and ``value`` (the total of
    ``value`` over all rows of that user), ordered by ``userid`` and carrying
    a fresh 0..n-1 index.
    """
    revenue_per_user = df.groupby("userid")["value"].sum()
    # reset_index() turns the group key back into an ordinary "userid" column.
    return revenue_per_user.reset_index()


def statistic_arpu(a, b):
    """Return the difference of the sample means, ``mean(a) - mean(b)``.

    Every observation counts, including zeros, so with per-user revenue as
    input this is the difference in average revenue per user.
    """
    mean_a = np.mean(a)
    mean_b = np.mean(b)
    return mean_a - mean_b


def statistic_arppu(a, b):
    """Return the difference of the means taken over non-zero entries only.

    Zero observations are dropped from each sample before averaging, so with
    per-user revenue as input only users who paid something contribute.
    Both arguments must support boolean-mask indexing (e.g. NumPy arrays).
    """
    nonzero_a = a[a != 0]
    nonzero_b = b[b != 0]
    return np.mean(nonzero_a) - np.mean(nonzero_b)


def _in_test_group(testids: pd.Series, testid: int) -> pd.Series:
    """Return a boolean mask of rows whose ';'-separated id list holds *testid*.

    Ids are compared as whole tokens, so 32412 does not match a row tagged
    with 132412 or 324120 (a plain substring search would).
    """
    wanted = str(testid)
    mask = testids.astype(str).map(
        lambda ids: wanted in (token.strip() for token in ids.split(";"))
    )
    # map() on an empty Series yields object dtype; force a real boolean mask.
    return mask.astype(bool)


def process(fname: str):
    """Run ARPU and ARPPU permutation tests on a tab-separated event log.

    The file must provide the columns ``userid``, ``action``, ``value``,
    ``timestamp`` and ``testids``. Only rows whose action is "confirmation"
    contribute revenue; all other rows are kept with a value of 0 so that
    their users still count in the ARPU denominator.

    For each statistic one line is printed: the p-value rounded to three
    decimals, followed by "green" (experiment higher) or "red" (experiment
    lower) when the p-value is below ``alpha``, and "gray" otherwise.

    Args:
        fname: Path of the TSV file to analyse.
    """
    df = pd.read_csv(fname, delimiter="\t")

    # Zero out the revenue of every non-confirmation event.
    is_confirmed = df["action"] == "confirmation"
    df["value"] = df["value"] * is_confirmed.astype(int)
    df = df.drop(["timestamp", "action"], axis=1)

    exp_rows = df[_in_test_group(df["testids"], exp_testid)]
    control_rows = df[_in_test_group(df["testids"], control_testid)]

    df_exp = revenues_by_user(exp_rows.drop(["testids"], axis=1))
    df_control = revenues_by_user(control_rows.drop(["testids"], axis=1))

    for method in [statistic_arpu, statistic_arppu]:
        # Re-seed before each test so both runs are reproducible and use the
        # same sequence of permutations.
        np.random.seed(177)
        result = permutation_test(
            (df_exp["value"], df_control["value"]),
            method,
            permutation_type='independent',
        )

        c = "red" if result.statistic < 0 else "green"
        print(round(result.pvalue, 3), c if result.pvalue < alpha else "gray")


# Script entry point: the path of the TSV file is read from standard input.
if __name__ == '__main__':
    process(input())


0.077 gray
0.006 green


In [31]:
df = pd.read_csv("exp_arppu.tsv", sep="\t")

# Non-confirmation rows are kept with their revenue zeroed (rather than being
# filtered out), so users without a purchase remain in the data.
is_confirmation = df["action"] == "confirmation"
df["action"] = is_confirmation.astype(int)
df["value"] = df["value"] * df["action"]
df.drop(columns=["timestamp", "action"], inplace=True)

df.info()
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49757 entries, 0 to 49756
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userid   49757 non-null  object 
 1   value    49757 non-null  float64
 2   testids  49757 non-null  object 
dtypes: float64(1), object(2)
memory usage: 1.1+ MB


Unnamed: 0,userid,value,testids
0,user_10000,0.0,13535;23346;23464;25661
1,user_10000,0.0,13535;23346;23464;25661
2,user_10000,0.0,13535;23346;23464;25661
3,user_10000,0.0,13535;23346;23464;25661
4,user_10000,0.0,13535;23346;23464;25661
5,user_10000,0.0,13535;23346;23464;25661
6,user_10000,0.0,13535;23346;23464;25661
7,user_10000,0.0,13535;23346;23464;25661
8,user_10000,0.0,13535;23346;23464;25661
9,user_10000,0.0,13535;23346;23464;25661


In [37]:
# Match each id as a whole ';'-separated token: a bare substring search for
# 32412 would also select rows tagged with e.g. 132412 or 324120.
exp_mask = df["testids"].str.contains(rf"(?:^|;){exp_testid}(?:;|$)")
control_mask = df["testids"].str.contains(rf"(?:^|;){control_testid}(?:;|$)")

df_exp = df[exp_mask].drop(["testids"], axis=1)
df_control = df[control_mask].drop(["testids"], axis=1)

display(df_exp.head(4))
display(df_control.head(4))

Unnamed: 0,userid,value
145,user_10001,0.0
146,user_10001,0.0
147,user_10001,0.0
148,user_10001,0.0


Unnamed: 0,userid,value
318,user_10003,0.0
319,user_10003,0.0
320,user_10003,0.0
321,user_10003,0.0


In [33]:
# Users present in both groups; an empty set means the split is clean.
set(df_exp["userid"]).intersection(df_control["userid"])

set()

In [39]:
# Replace the event-level frames with one row of total revenue per user.
df_exp, df_control = map(revenues_by_user, (df_exp, df_control))

df_exp.head(10)

Unnamed: 0,userid,value
0,user_10001,32400.0
1,user_10005,0.0
2,user_10012,24700.0
3,user_10017,115900.0
4,user_10030,0.0
5,user_10037,0.0
6,user_10039,0.0
7,user_10042,142000.0
8,user_10045,0.0
9,user_10046,0.0


In [41]:
# Permutation test of the ARPU difference between experiment and control.
res_arpu = permutation_test(
    (df_exp["value"], df_control["value"]),
    statistic=statistic_arpu,
    permutation_type="independent",
)

print(res_arpu.pvalue)

0.0002
[-201.35104211 -360.14797199  332.43622821 ... -360.14797199 -246.31161864
 -360.14797199]
17190.35656862525
