In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

In [2]:
file = r"..\data\clean\complete_clean.csv"
file_test = r"..\data\clean\test_group_clean.csv"
file_control = r"..\data\clean\control_group_clean.csv"

df_all = pd.read_csv(file)
df_test = pd.read_csv(file_test)
df_control =  pd.read_csv(file_control)
df_demo = pd.read_csv('../data/clean/final_demo_clean.csv')

In [3]:
#cross tab des variations par step 

crosstab_all = pd.crosstab(df_all["process_step"], df_all["variation"])

#percentage per step -> Completion Rate
control_counts = crosstab_all['control']
test_counts = crosstab_all['test']

control_start = control_counts.iloc[0]
test_start = test_counts.iloc[0]

print(control_start)
print(test_start)

control_percentages = [count / control_start * 100 for count in control_counts]
test_percentages = [count / test_start * 100 for count in test_counts]

print("Control :", control_percentages)
print("Test :", test_percentages)

47608
56474
Control : [100.0, 62.31725760376408, 54.3732145857839, 47.88060830112586, 36.76903041505629]
Test : [100.0, 68.64751921238091, 54.87303892056522, 45.851188157382154, 45.54662322484683]


## Taux de complétion

Hypothèse : Comparer le taux de complétion entre le groupe Test (nouveau design) et le groupe Contrôle (ancien design).

Objectif : Vérifier si la différence est statistiquement significative.

Actions : Définir les hypothèses nulle et alternative, tester avec le niveau de signification approprié, analyser le p-value et d'autres mesures statistiques.

Hypothèse nulle (H0) : Il y a une différence significative dans le taux de complétion entre le groupe Test (nouveau design) et le groupe Contrôle (ancien design).

Hypothèse alternative (H1) : Il n'y a pas de différence significative dans le taux de complétion entre le groupe Test (nouveau design) et le groupe Contrôle (ancien design).

In [4]:
# Example data
n_test = test_start  # Sample size for the test group
n_control = control_start  # Sample size for the control group

# Completion rates
completion_rate_test = test_percentages[-1] / 100
completion_rate_control = control_percentages[-1] / 100

# Number of successes (completions)
successes_test = n_test * completion_rate_test
successes_control = n_control * completion_rate_control

# Perform the z-test for proportions
count = [successes_test, successes_control]
nobs = [n_test, n_control]

stat, p_value = proportions_ztest(count, nobs, alternative='larger')

# Print results
print(f"z-statistic: {stat}")
print(f"p-value: {p_value}")

# Interpretation of p-value
if p_value < 0.05:
    print("Reject the null hypothesis: The test group has a significantly higher completion rate than the control group.")
else:
    print("Fail to reject the null hypothesis: No significant difference in completion rates.")


z-statistic: 28.628711797088698
p-value: 1.4757732634476226e-180
Reject the null hypothesis: The test group has a significantly higher completion rate than the control group.


## Temps de complétion 

Vérifier si le groupe de test prend en moyenne moins de temps à compléter le processus que le groupe de control 

In [6]:
from datetime import time, timedelta 
untouched_df_all = df_all.copy()
df_all[df_all['process_step'] == 'start'] = df_all[df_all['process_step'] == 'start'].drop_duplicates(subset=['client_id', 'visit_id', 'process_step'], keep='last')
df_all[df_all['process_step'] == 'step_1'] = df_all[df_all['process_step'] == 'step_1'].drop_duplicates(subset=['client_id', 'visit_id', 'process_step'], keep='last')
df_all[df_all['process_step'] == 'step_2'] = df_all[df_all['process_step'] == 'step_2'].drop_duplicates(subset=['client_id', 'visit_id', 'process_step'], keep='last')
df_all[df_all['process_step'] == 'step_3'] = df_all[df_all['process_step'] == 'step_3'].drop_duplicates(subset=['client_id', 'visit_id', 'process_step'], keep='last')
df_all[df_all['process_step'] == 'validate'] = df_all[df_all['process_step'] == 'validate'].drop_duplicates(subset=['client_id', 'visit_id', 'process_step'], keep='last')
df_all = df_all.reset_index(drop=True)
df_all['date_time'] = pd.to_datetime(df_all['date_time'], format="ISO8601")
df_all = df_all.sort_values(by=['client_id', 'visit_id', 'process_step', 'date_time'])


for i in range(1, df_all.shape[0]):
    if df_all.loc[i-1, 'visit_id'] == df_all.loc[i, 'visit_id']:
        df_all.loc[i, 'time_delta'] = df_all.loc[i, 'date_time'] - df_all.loc[i-1, 'date_time']
    else:
        df_all.loc[i, 'time_delta'] = None

#df_all = df_all[df_all['time_diff'].notnull()]

df_all = df_all.reset_index(drop=True)

# df_all['date_time'] = pd.to_datetime(df_all['date'] + ' ' + df_all['time'])

# df_all['time_diff_seconds'] = df_all['time_diff'].dt.total_seconds()
# df_all['time_diff_seconds'].describe()

In [50]:
test_group = df_all[df_all['variation'] == 'test']
control_group = df_all[df_all['variation'] == 'control']

In [22]:
test_group.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,date,time,variation,time_delta
0,555.0,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,2017-04-15,12:57:56,test,NaT
1,555.0,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,2017-04-15,12:58:03,test,0 days 00:00:07
2,555.0,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,2017-04-15,12:58:35,test,0 days 00:00:32
3,555.0,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,2017-04-15,13:00:14,test,0 days 00:01:39
4,555.0,402506806_56087378777,637149525_38041617439_716659,validate,2017-04-15 13:00:34,2017-04-15,13:00:34,test,0 days 00:00:20


In [51]:
test_group = test_group.drop(columns=['client_id', 'visitor_id', 'visit_id', 'variation', 'date_time', 'date', 'time'])
test_group['time_delta'] = test_group['time_delta'].dt.total_seconds()

In [52]:
control_group = control_group.drop(columns=['client_id', 'visitor_id', 'visit_id', 'variation', 'date_time', 'date', 'time'])
control_group['time_delta'] = control_group['time_delta'].dt.total_seconds()

In [58]:
test_group.groupby(by='process_step').mean()

Unnamed: 0_level_0,time_delta
process_step,Unnamed: 1_level_1
start,
step_1,3.526475
step_2,37.325393
step_3,92.851024
validate,143.254736


In [31]:
control_group.groupby(by='process_step').mean()

Unnamed: 0_level_0,time_delta
process_step,Unnamed: 1_level_1
start,
step_1,18.491384
step_2,34.409333
step_3,95.511871
validate,136.016508


In [60]:
control_group.shape

(109172, 2)

In [56]:
test_group.shape

(130159, 2)

## Hypothesis 
H0 Mean time spent for step 1 for control group is greater or equal than time spent for test 1 for test group

In [62]:
import math

# Given summary statistics
n1 = control_group.shape[0]  # Sample size for Process A
mean1 = control_group.groupby(by='process_step').mean().values[1]  # Mean for Process A
std1 = control_group.groupby(by='process_step').std().values[1]  # Standard deviation for Process A

n2 = test_group.shape[0]  # Sample size for Process B
mean2 = test_group.groupby(by='process_step').mean().values[1]  # Mean for Process B
std2 = test_group.groupby(by='process_step').std().values[1]  # Standard deviation for Process B

# Calculate the t-statistic
t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))

# Calculate the degrees of freedom (using the formula above)
df = ((std1**2 / n1 + std2**2 / n2) ** 2) / ((std1**2 / n1) ** 2 / (n1 - 1) + (std2**2 / n2) ** 2 / (n2 - 1))

# Calculate the p-value for a one-tailed test (right-tailed)
p_value = st.t.sf(t_stat, df)  # One-tailed test, survival function (right tail)

# Output results
print("T-statistic:", t_stat)
print("Degrees of freedom:", df)
print("P-value:", p_value)

# Decision based on the p-value (assuming a 0.05 significance level)
alpha = 0.05
if p_value > alpha:
    print("Reject the null hypothesis: Mean of Process A is significantly less than the mean of Process B.")
else:
    print("Fail to reject the null hypothesis: There is insufficient evidence to conclude Process A mean is less than Process B.")



T-statistic: [13.15910311]
Degrees of freedom: [222003.28253803]
P-value: [7.8052604e-40]
Fail to reject the null hypothesis: There is insufficient evidence to conclude Process A mean is less than Process B.


  t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))


In [63]:
# Given summary statistics
n1 = control_group.shape[0]  # Sample size for Process A
mean1 = control_group.groupby(by='process_step').mean().values[2]  # Mean for Process A
std1 = control_group.groupby(by='process_step').std().values[2]  # Standard deviation for Process A

n2 = test_group.shape[0]  # Sample size for Process B
mean2 = test_group.groupby(by='process_step').mean().values[2]  # Mean for Process B
std2 = test_group.groupby(by='process_step').std().values[2]  # Standard deviation for Process B

# Calculate the t-statistic
t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))

# Calculate the degrees of freedom (using the formula above)
df = ((std1**2 / n1 + std2**2 / n2) ** 2) / ((std1**2 / n1) ** 2 / (n1 - 1) + (std2**2 / n2) ** 2 / (n2 - 1))

# Calculate the p-value for a one-tailed test (right-tailed)
p_value = st.t.sf(t_stat, df)  # One-tailed test, survival function (right tail)

# Output results
print("T-statistic:", t_stat)
print("Degrees of freedom:", df)
print("P-value:", p_value)

# Decision based on the p-value (assuming a 0.05 significance level)
alpha = 0.05
if p_value > alpha:
    print("Reject the null hypothesis: Mean of Process A is significantly less than the mean of Process B.")
else:
    print("Fail to reject the null hypothesis: There is insufficient evidence to conclude Process A mean is less than Process B.")


T-statistic: [-3.77448937]
Degrees of freedom: [236522.31077546]
P-value: [0.99991981]
Reject the null hypothesis: Mean of Process A is significantly less than the mean of Process B.


  t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))


In [64]:
# Given summary statistics
n1 = control_group.shape[0]  # Sample size for Process A
mean1 = control_group.groupby(by='process_step').mean().values[3]  # Mean for Process A
std1 = control_group.groupby(by='process_step').std().values[3]  # Standard deviation for Process A

n2 = test_group.shape[0]  # Sample size for Process B
mean2 = test_group.groupby(by='process_step').mean().values[3]  # Mean for Process B
std2 = test_group.groupby(by='process_step').std().values[3]  # Standard deviation for Process B

# Calculate the t-statistic
t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))

# Calculate the degrees of freedom (using the formula above)
df = ((std1**2 / n1 + std2**2 / n2) ** 2) / ((std1**2 / n1) ** 2 / (n1 - 1) + (std2**2 / n2) ** 2 / (n2 - 1))

# Calculate the p-value for a one-tailed test (right-tailed)
p_value = st.t.sf(t_stat, df)  # One-tailed test, survival function (right tail)

# Output results
print("T-statistic:", t_stat)
print("Degrees of freedom:", df)
print("P-value:", p_value)

# Decision based on the p-value (assuming a 0.05 significance level)
alpha = 0.05
if p_value > alpha:
    print("Reject the null hypothesis: Mean of Process A is significantly less than the mean of Process B.")
else:
    print("Fail to reject the null hypothesis: There is insufficient evidence to conclude Process A mean is less than Process B.")


T-statistic: [3.52570342]
Degrees of freedom: [232853.64649453]
P-value: [0.00021122]
Fail to reject the null hypothesis: There is insufficient evidence to conclude Process A mean is less than Process B.


  t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))


In [65]:
# Given summary statistics
n1 = control_group.shape[0]  # Sample size for Process A
mean1 = control_group.groupby(by='process_step').mean().values[4]  # Mean for Process A
std1 = control_group.groupby(by='process_step').std().values[4]  # Standard deviation for Process A

n2 = test_group.shape[0]  # Sample size for Process B
mean2 = test_group.groupby(by='process_step').mean().values[4]  # Mean for Process B
std2 = test_group.groupby(by='process_step').std().values[4]  # Standard deviation for Process B

# Calculate the t-statistic
t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))

# Calculate the degrees of freedom (using the formula above)
df = ((std1**2 / n1 + std2**2 / n2) ** 2) / ((std1**2 / n1) ** 2 / (n1 - 1) + (std2**2 / n2) ** 2 / (n2 - 1))

# Calculate the p-value for a one-tailed test (right-tailed)
p_value = st.t.sf(t_stat, df)  # One-tailed test, survival function (right tail)

# Output results
print("T-statistic:", t_stat)
print("Degrees of freedom:", df)
print("P-value:", p_value)

# Decision based on the p-value (assuming a 0.05 significance level)
alpha = 0.05
if p_value > alpha:
    print("Reject the null hypothesis: Mean of Process A is significantly less than the mean of Process B.")
else:
    print("Fail to reject the null hypothesis: There is insufficient evidence to conclude Process A mean is less than Process B.")


T-statistic: [-6.03532505]
Degrees of freedom: [221322.4774546]
P-value: [1.]
Reject the null hypothesis: Mean of Process A is significantly less than the mean of Process B.


  t_stat = (mean1 - mean2) / math.sqrt((std1**2 / n1) + (std2**2 / n2))
