# 0.0 Imports

In [1]:
import pandas as pd
import numpy as np
import math

from statsmodels.stats import api as sms
from scipy.stats import chi2_contingency

## 0.1 Loading

In [2]:
path = 'C:/Users/edils/repos/teste_ab/data/'

df_raw = pd.read_csv(path + 'ab_data.csv')

In [3]:
df_raw.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


# 1.0 Design do Experimento

## 1.1 Formulação das Hipóteses

In [4]:
# H0 : A conversão da nova página é de 13%
# H1 : A conversão da nova página é diferente de 15%

# 2.0 Parametros do Experimento

In [9]:
df2 = df_raw.copy()

In [10]:
#nivel de confiança
confidence_level = 0.95

#nivel de significancia
significance_level = 0.05

#conversoes da pagina atual e nova
p1 = 0.13
p2 = 0.15

#tamanho do efeito
effect_size = sms.proportion_effectsize(p1, p2)

#poder Estatistico
power = 0.80

In [11]:
#Tamanho da amostra
sample_n = sms.NormalIndPower().solve_power(
            effect_size,
            power=power,
            alpha=significance_level
)

sample_n = math.ceil(sample_n)

In [12]:
print(f"O tamanho total da amostra é: {2*sample_n}")
print(f"O tamanho da amostra do grupo de controle é: {sample_n}")
print(f"O tamanho da amostra do grupo de tratamento é: {sample_n}")

O tamanho total da amostra é: 9440
O tamanho da amostra do grupo de controle é: 4720
O tamanho da amostra do grupo de tratamento é: 4720


In [13]:
# Preparar dos dados
# Divide duplicate users based on group and landing_page
df_query1 = df2.loc[:, ['user_id', 'group', 'landing_page']].groupby('user_id').nunique().query('landing_page > 1 or group > 1').reset_index()

# Filter only non_duplicate_users
df3 = df2[~df2['user_id'].isin(df_query1['user_id'])]

#Amostragem
df_treatment = df3.loc[df3['group'] == 'treatment',:].sample(sample_n, random_state=42)
df_control = df3.loc[df3['group'] == 'control',:].sample(sample_n, random_state=42)

print(f'Size of control Group: {df_control.shape[0]}')
print(f'Size of treatment Group: {df_treatment.shape[0]}')

df_ab = pd.concat([df_control, df_treatment])

#Taxa de conversão
converted = df_control['converted'].sum()
converted_ratio_control = converted/len(df_control)

print(f'Control Convertion Rate: {converted_ratio_control}')

converted = df_treatment['converted'].sum()
converted_ratio_treatment = converted/len(df_treatment)

print(f'Treatment Convertion Rate: {converted_ratio_treatment}')

#Teste de Hipótese
df_table = df_ab.loc[:,['group','converted']].groupby('group').agg({'converted':['sum','count']})
df_table.columns = ['converted','non_converted']


chi_val, pval, dof, expected = chi2_contingency(df_table)

print(f'p-value: {pval:.2f}')


#Conclusão
if pval < significance_level:
    print('Rejeita a Hipótese Nula')
else:
    print('Falha em Rejeitar a Hipótese Nula')

Size of control Group: 4720
Size of treatment Group: 4720
Control Convertion Rate: 0.11546610169491525
Treatment Convertion Rate: 0.11313559322033899
p-value: 0.78
Falha em Rejeitar a Hipótese Nula


In [14]:
#Conversão de Resultado para R$
pagina_atual = 13%
pagina_nova = 15%

SyntaxError: invalid syntax (1006606746.py, line 2)

In [15]:
compradores = numero_visitante * 0.13
GMV = compradores*4500

NameError: name 'numero_visitante' is not defined

In [19]:
df4.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21,control,old_page,0
1,804228,2017-01-12,control,old_page,0
2,661590,2017-01-11,treatment,new_page,0
3,853541,2017-01-08,treatment,new_page,0
4,864975,2017-01-21,control,old_page,1


In [21]:
df4 = df3.copy()

df4['timestamp'] = pd.to_datetime(df4['timestamp']).apply(lambda x: x.strftime('%Y-%m-%d')) 

df5 = df4[['timestamp','user_id']].groupby('timestamp').count().reset_index()

In [29]:
# Current GMV
df5['current_purchases'] = (df5['user_id']*0.13).astype(int)

df5['current_GMV'] = df5['current_purchases'] * 4500

current_gmv = df5['current_GMV'].sum()

print(f"Current GMV: {current_gmv}")

#Expected GMV
df5['new_purchases'] = (df5['user_id']*0.15).astype(int)

df5['new_GMV'] = df5['new_purchases'] * 4500

new_gmv = df5['new_GMV'].sum()

print(f"New GMV: {new_gmv}")

lift = (new_gmv - current_gmv)/current_gmv
lift_abs = new_gmv - current_gmv

print(f"Lift Difference: {lift_abs}")

print(f"Expected Lift: {100*lift:.2f}%")

Current GMV: 167656500
New GMV: 193459500
Lift Difference: 25803000
Expected Lift: 15.39%
