# 0.0 Imports

In [26]:
import pandas as pd
import numpy as np
import math

from statsmodels.stats import api as sms
from scipy.stats import chi2_contingency

## 0.1 Loading

In [2]:
path = 'C:/Users/edils/repos/teste_ab/data/'

df_raw = pd.read_csv(path + 'ab_data.csv')

In [3]:
df_raw.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


# 1.0 Design do Experimento

## 1.1 Formulação das Hipóteses

In [None]:
# H0 : A conversão da nova página é de 13%
# H1 : A conversão da nova página é diferente de 15%

# 2.0 Parametros do Experimento

In [4]:
#nivel de confiança
confidence_level = 0.95

#nivel de significancia
significance_level = 0.05

#conversoes da pagina atual e nova
p1 = 0.13
p2 = 0.15

#tamanho do efeito
effect_size = sms.proportion_effectsize(p1, p2)

#poder Estatistico
power = 0.80

In [5]:
#Tamanho da amostra
sample_n = sms.NormalIndPower().solve_power(
            effect_size,
            power=power,
            alpha=significance_level
)

sample_n = math.ceil(sample_n)

In [6]:
print(f"O tamanho total da amostra é: {2*sample_n}")
print(f"O tamanho da amostra do grupo de controle é: {sample_n}")
print(f"O tamanho da amostra do grupo de tratamento é: {sample_n}")

O tamanho total da amostra é: 9440
O tamanho da amostra do grupo de controle é: 4720
O tamanho da amostra do grupo de tratamento é: 4720


In [30]:
# Preparar dos dados
# Divide duplicate users based on group and landing_page
df_query1 = df2.loc[:, ['user_id', 'group', 'landing_page']].groupby('user_id').nunique().query('landing_page > 1 or group > 1').reset_index()

# Filter only non_duplicate_users
df3 = df2[~df2['user_id'].isin(df_query1['user_id'])]

#Amostragem
df_treatment = df3.loc[df3['group'] == 'treatment',:].sample(sample_n, random_state=42)
df_control = df3.loc[df3['group'] == 'control',:].sample(sample_n, random_state=42)

print(f'Size of control Group: {df_control.shape[0]}')
print(f'Size of treatment Group: {df_treatment.shape[0]}')

df_ab = pd.concat([df_control, df_treatment])

#Taxa de conversão
converted = df_control['converted'].sum()
converted_ratio_control = converted/len(df_control)

print(f'Control Convertion Rate: {converted_ratio_control}')

converted = df_treatment['converted'].sum()
converted_ratio_treatment = converted/len(df_treatment)

print(f'Treatment Convertion Rate: {converted_ratio_treatment}')

#Teste de Hipótese
df_table = df_ab.loc[:,['group','converted']].groupby('group').agg({'converted':['sum','count']})
df_table.columns = ['converted','non_converted']


chi_val, pval, dof, expected = chi2_contingency(df_table)

print(f'p-value: {pval:.2f}')


#Conclusão
if pval < significance_level:
    print('Rejeita a Hipótese Nula')
else:
    print('Falha em Rejeitar a Hipótese Nula')

Size of control Group: 4720
Size of treatment Group: 4720
Control Convertion Rate: 0.11546610169491525
Treatment Convertion Rate: 0.11313559322033899
p-value: 0.78
Falha em Rejeitar a Hipótese Nula


In [None]:
#Conversão de Resultado para R$
pagina_atual = 13%
pagina_nova = 15%

In [None]:
compradores = numero_visitante * 0.13
GMV = compradores*4500

In [36]:
df4.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [35]:
df4 = df3.copy()

df4['timestamp'] = pd.to_datetime(df4['timestamp']) 



# 2.0 Análise Descritiva

In [7]:
df2 = df_raw.copy()

## 2.1 Tamanho

In [8]:
df2.shape

(294478, 5)

## 2.2 Check Na

In [9]:
df2.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

## 2.3 Flags

In [10]:
#Check for duplicates
df2.loc[:,['user_id','landing_page','group']].groupby(['user_id']).nunique().query('landing_page > 1 or group > 1').reset_index()

Unnamed: 0,user_id,landing_page,group
0,630052,2,1
1,630126,2,1
2,630137,2,1
3,630320,1,2
4,630471,2,1
...,...,...,...
3888,945627,1,2
3889,945645,2,1
3890,945703,2,1
3891,945797,2,1


In [12]:
df3.shape

(286692, 5)

# 3.0 Amostragem Aleatoria dos grupos de Controle e Tratamento

In [13]:
df_control = df3.loc[df3['group'] == 'control',:].sample(sample_n, random_state=42)

df_treatment = df3.loc[df3['group'] == 'treatment',:].sample(sample_n, random_state=42)

print(f'Size of Control: {df_control.shape[0]}')
print(f'Size of Treatment: {df_control.shape[0]}')

Size of Control: 4720
Size of Treatment: 4720


## 3.1 Cálculo da métrica de interesse entre os grupos(Conversão de cada página)

In [14]:
#Control Group
control_buyers = df_control['converted'].sum()

control_visitors = len(df_control)

print(f"Conversion rate - Control Group: {control_buyers/control_visitors}" )
  
#Treatment Group      
treatment_buyers = df_treatment['converted'].sum()

treatment_visitors = len(df_treatment)

print(f"Conversion rate - Treament Group: {treatment_buyers/treatment_visitors}" )      

Conversion rate - Control Group: 0.11546610169491525
Conversion rate - Treament Group: 0.11313559322033899
