# VANGUARD AB TEST


## METADATA HELP

This comprehensive set of fields will guide your analysis, helping you unravel the intricacies of client behavior and preferences.

- **client_id**: Every client’s unique ID.
- **variation**: Indicates if a client was part of the experiment.
- **visitor_id**: A unique ID for each client-device combination.
- **visit_id**: A unique ID for each web visit/session.
- **process_step**: Marks each step in the digital process.
- **date_time**: Timestamp of each web activity.
- **clnt_tenure_yr**: Represents how long the client has been with Vanguard, measured in years.
- **clnt_tenure_mnth**: Further breaks down the client’s tenure with Vanguard in months.
- **clnt_age**: Indicates the age of the client.
- **gendr**: Specifies the client’s gender.
- **num_accts**: Denotes the number of accounts the client holds with Vanguard.
- **bal**: Gives the total balance spread across all accounts for a particular client.
- **calls_6_mnth**: Records the number of times the client reached out over a call in the past six months.
- **logons_6_mnth**: Reflects the frequency with which the client logged onto Vanguard’s platform over the last six months.


In [1]:
%load_ext autoreload
%autoreload 2 

In [2]:
from cleaning import *
from mining import *
from db_handling import *
import pandas as pd
from dotenv import load_dotenv
import os
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# Load environment variables
load_dotenv()

True

### Load Configuration

In [4]:
# Load config.yaml
config = parse_config()

{'database_name': 'vanguard_ab', 'refresh_db': False, 'tables': {'clients': {'sources': [{'path': 'data/df_final_demo.txt', 'url': 'https://raw.githubusercontent.com/data-bootcamp-v4/lessons/refs/heads/main/5_6_eda_inf_stats_tableau/project/files_for_project/df_final_demo.txt'}], 'separator': ',', 'dropna': True, 'columns': {'client_id': {'original_name': 'client_id', 'data_type': 'INTEGER', 'primary_key': True, 'pandas_dtype': 'int64'}, 'client_since_month': {'original_name': 'clnt_tenure_mnth', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'client_age': {'original_name': 'clnt_age', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'gender': {'original_name': 'gendr', 'data_type': 'CHAR(1)', 'pandas_dtype': 'category', 'valid_categories': ['U', 'M', 'F'], 'fallback_category': 'U'}, 'number_of_accounts': {'original_name': 'num_accts', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'balance': {'original_name': 'bal', 'data_type': 'FLOAT', 'pandas_dtype': 'float64'}, 'calls_6_months': 

## Data Mining

In [5]:
# Creates a dictionary of all imported dataframes
dataframes = { name:import_data_from_config(config, name) for name in config['tables']}

## Data Cleaning

In [6]:
#TODO: don't impose categories?

In [7]:
# Rename columns
dataframes = rename_columns(dataframes, config)

In [8]:
# Select columns
dataframes = select_columns(dataframes, config)

In [9]:
display_dataFrames(dataframes,'head')

clients:
clients - Head:


Unnamed: 0,client_id,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,145.0,33.0,F,2.0,103671.75,0.0,3.0


experiment:
experiment - Head:


Unnamed: 0,client_id,variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


visits:
visits - Head:


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


## Separation

In [10]:
client_df = dataframes['clients']
experiment_df = dataframes['experiment']
visits_df = dataframes['visits']
display(experiment_df['variation'].isna().sum())
display (client_df, experiment_df, visits_df)

20109

Unnamed: 0,client_id,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...
70604,7993686,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,255.0,68.0,M,3.0,475114.69,4.0,4.0


Unnamed: 0,client_id,variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [11]:
variation_visits = visits_df.merge(experiment_df, on='client_id')
display(variation_visits['variation'].value_counts())
display(variation_visits)
confirmed_steps = variation_visits[variation_visits['process_step'] == 'confirm']
display(confirmed_steps)
unique_visit_ids = confirmed_steps.drop_duplicates(subset='visit_id')
display(unique_visit_ids)
""" visits = variation_visits.groupby(['variation','process_step']).agg({'process_step':'count'})
visits """


variation
Test       177847
Control    143462
Name: count, dtype: int64

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test
...,...,...,...,...,...,...
449826,9895983,473024645_56027518531,498981662_93503779869_272484,step_3,2017-06-15 19:52:09,
449827,9895983,473024645_56027518531,498981662_93503779869_272484,step_2,2017-06-15 19:50:37,
449828,9895983,473024645_56027518531,498981662_93503779869_272484,step_1,2017-06-15 19:50:05,
449829,9895983,473024645_56027518531,498981662_93503779869_272484,start,2017-06-15 19:50:00,


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
15,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:05,Test
20,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:03,Control
21,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:01,Control
22,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:28:52,Control
35,1982004,618934751_10082078917,132494692_73738638511_70823,confirm,2017-04-17 12:02:42,Test
...,...,...,...,...,...,...
449803,104557,964403920_33362996723,130036090_50724060621_44337,confirm,2017-05-12 09:26:38,
449804,104557,964403920_33362996723,130036090_50724060621_44337,confirm,2017-05-12 09:25:48,
449812,1672289,658797407_91003097342,430110748_90347503221_597628,confirm,2017-05-01 19:31:40,
449818,1020453,433909361_21130313633,310496685_94413045985_207142,confirm,2017-05-03 12:41:20,


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
15,8320017,39393514_33118319366,960651974_70596002104_312201,confirm,2017-04-05 13:10:05,Test
20,4033851,762728880_76361333336,949661017_22392791362_127391,confirm,2017-04-05 12:29:03,Control
35,1982004,618934751_10082078917,132494692_73738638511_70823,confirm,2017-04-17 12:02:42,Test
48,3170143,837774974_91015515080,758001412_25057963943_219559,confirm,2017-04-04 18:41:44,Test
56,6651403,387438258_49845138949,754106292_61370647593_160929,confirm,2017-04-04 17:38:39,Control
...,...,...,...,...,...,...
449785,9979751,29109363_8704796230,872818621_23600592651_169070,confirm,2017-05-30 10:54:13,
449803,104557,964403920_33362996723,130036090_50724060621_44337,confirm,2017-05-12 09:26:38,
449812,1672289,658797407_91003097342,430110748_90347503221_597628,confirm,2017-05-01 19:31:40,
449818,1020453,433909361_21130313633,310496685_94413045985_207142,confirm,2017-05-03 12:41:20,


" visits = variation_visits.groupby(['variation','process_step']).agg({'process_step':'count'})\nvisits "

In [12]:

display(unique_visit_ids['variation'].value_counts())

variation
Test       21675
Control    16002
Name: count, dtype: int64

In [13]:
# drop the nulls from clients, but keep the list of the drops

nulls_client_id = client_df[client_df.isna().any(axis=1)]['client_id']
nulls_client_id

4164     7402828
8316      355337
8677     8412164
9583     4666211
13444    2222915
18066    4876926
25961    5277910
28432    7616759
35323    8191345
43518    1227228
46076    8611797
47189    5144725
49846    1037867
63701    1618749
66219    6021001
Name: client_id, dtype: int64

In [14]:
client_df = client_df.dropna(axis=0)
client_df

Unnamed: 0,client_id,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...
70604,7993686,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,255.0,68.0,M,3.0,475114.69,4.0,4.0


In [15]:
display(client_df['gender'].value_counts(dropna = False))
# x->u, keep 'U's for everything except the gender statistics

gender
U    24122
M    23724
F    22745
X        3
Name: count, dtype: int64

In [16]:
#client_df['gender'] = client_df['gender'].replace(to_replace=r'.*X.*', value ="U", regex=True)

In [17]:
display(experiment_df['variation'].value_counts(dropna = False))
# keep NaN for general analysis of clients, but drop them from everywhere for test analysis

variation
Test       26968
Control    23532
NaN        20109
Name: count, dtype: int64

In [18]:
# client_df, experiment_df, visit_df -> for general analysis
# new_client_df, new_experiment_df, new_visit_dfn -> for test/control analysis   experiment_df_null = 
nulls_in_experiment = experiment_df[experiment_df.isna().any(axis=1)]['client_id']
nulls_in_experiment

50500    5459747
50501    8031000
50502    1847030
50503    9713157
50504    7775828
          ...   
70604    2443347
70605    8788427
70606     266828
70607    1266421
70608    9895983
Name: client_id, Length: 20109, dtype: int64

In [19]:
# new df removing client ID that are null in experiment

new_experiment_df = experiment_df[~experiment_df['client_id'].isin(nulls_in_experiment)]
display(new_experiment_df.count())

new_experiment_df = new_experiment_df[~new_experiment_df['client_id'].isin(nulls_client_id)]
display(new_experiment_df.count())



client_id    50500
variation    50500
dtype: int64

client_id    50487
variation    50487
dtype: int64

In [20]:
new_visits_df = visits_df[~visits_df['client_id'].isin(nulls_in_experiment)]
new_visits_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [21]:
new_client_df = client_df[~client_df['client_id'].isin(nulls_in_experiment)]
new_client_df

Unnamed: 0,client_id,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...
50495,1780858,262.0,68.5,M,3.0,372100.59,6.0,9.0
50496,6967120,260.0,68.5,M,3.0,4279873.38,6.0,9.0
50497,5826160,249.0,56.5,F,2.0,44837.16,2.0,5.0
50498,8739285,229.0,69.5,F,2.0,44994.24,1.0,4.0


## End separation

In [22]:
dataframes['clients'] = new_client_df.copy()
dataframes['experiment'] = new_experiment_df.copy()
dataframes['visits'] = new_visits_df.copy()

In [23]:
# Data Categorizing
dataframes = clean_categorical_data(dataframes, config)

In [24]:
#Convert types
dataframes = convert_types(dataframes, config)

In [25]:
display_dataFrames(dataframes, 'head', 'dtypes', 'cat_count')

clients:
clients - Head:


Unnamed: 0,client_id,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,73,60,U,2,45105.3,6,9
1,2304905,94,58,U,2,110860.3,6,9
2,1439522,64,32,U,2,52467.79,6,9
3,1562045,198,49,M,2,67454.65,3,6
4,5126305,145,33,F,2,103671.75,0,3


clients - Dtypes:


client_id                int64
client_since_month       int64
client_age               int64
gender                category
number_of_accounts       int64
balance                float64
calls_6_months           int64
logons_6_month           int64
dtype: object

clients - Cat_count:


Unnamed: 0_level_0,gender
gender,Unnamed: 1_level_1
U,17282
M,16947
F,16258


experiment:
experiment - Head:


Unnamed: 0,client_id,variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


experiment - Dtypes:


client_id       int64
variation    category
dtype: object

experiment - Cat_count:


Unnamed: 0_level_0,variation
variation,Unnamed: 1_level_1
Test,26961
Control,23526


visits:
visits - Head:


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


visits - Dtypes:


client_id                int64
visitor_id              object
visit_id                object
process_step          category
date_time       datetime64[ns]
dtype: object

visits - Cat_count:


Unnamed: 0_level_0,process_step
process_step,Unnamed: 1_level_1
start,202439
step_1,135733
step_2,110366
step_3,92714
confirm,85631


In [26]:
client_df = dataframes['clients']
experiment_df = dataframes['experiment']
visits_df = dataframes['visits']


### SQL EXPORT

In [27]:
if config['refresh_db']:

    db_password = os.getenv('SQL_PASSWORD')

    # Create database if it doesn't exist
    engine = create_db(db_password, config)

    # Export tables to database if refresh is set to true
    export_dataframes_to_sql(engine, dataframes)

    # Import data from database
    dataframes = import_all_tables_from_sql(engine)

### Local Caching

In [28]:
""" # Save files locally in an untracked folder
export_dataframes_to_csv(dataframes) """

' # Save files locally in an untracked folder\nexport_dataframes_to_csv(dataframes) '

In [29]:
#TODO CAREFUL DATA WONT BE PROPERLY CATEGORIZED / TYPED run after : convert_types(dataframes, config)
""" clients_df = pd.read_csv('data/cleaned/clients.csv')
experiment_df = pd.read_csv('data/cleaned/experiment.csv')
visits_df = pd.read_csv('data/cleaned/visits.csv') """

" clients_df = pd.read_csv('data/cleaned/clients.csv')\nexperiment_df = pd.read_csv('data/cleaned/experiment.csv')\nvisits_df = pd.read_csv('data/cleaned/visits.csv') "

## CLEAN FRAMES

In [30]:
display('clients :',client_df, 'experiment :',experiment_df, 'visits :',visits_df)

experiment_df['variation'].value_counts()


'clients :'

Unnamed: 0,client_id,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,73,60,U,2,45105.30,6,9
1,2304905,94,58,U,2,110860.30,6,9
2,1439522,64,32,U,2,52467.79,6,9
3,1562045,198,49,M,2,67454.65,3,6
4,5126305,145,33,F,2,103671.75,0,3
...,...,...,...,...,...,...,...,...
50495,1780858,262,68,M,3,372100.59,6,9
50496,6967120,260,68,M,3,4279873.38,6,9
50497,5826160,249,56,F,2,44837.16,2,5
50498,8739285,229,69,F,2,44994.24,1,4


'experiment :'

Unnamed: 0,client_id,variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
50495,393005,Control
50496,2908510,Control
50497,7230446,Test
50498,5230357,Test


'visits :'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


variation
Test       26961
Control    23526
Name: count, dtype: int64

In [31]:
# client_since_year : redundant : drop
# client_since_month: hypothesis : the longer they are client, the more valuable to us
# client_since_month: hypothesis : the older the client is, the more valuable to us
# gender: hypothesis : the men have more balance
# number_of_accounts: hypothesis : the clients with more accounts have more balance
# calls + logons : hypothesis : active clients are more valuable to us

# process steps + time : 
    # - SUCCESS : All the steps, in order, in a reasonable amount of time for each step
    
    # - ERROR : path do not start with start : drop
    # - ERROR : path do not complete : analyse
    # - ERROR : path do not complete in order: analyse
    # - ERROR : All the steps in order but took very long
    # - ERROR : Unusual amount of time between steps

## Data Exploration

In [32]:
# Added variation column to visits for easier analysis
visits_variations = visits_df.merge(experiment_df, on='client_id', how='right')
display(visits_variations)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07,Test
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51,Test
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22,Test
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13,Test
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04,Test
...,...,...,...,...,...,...
321190,6334360,629124187_65258232847,586047816_14599436811_389185,step_2,2017-04-11 08:45:38,Test
321191,6334360,629124187_65258232847,586047816_14599436811_389185,step_1,2017-04-11 08:44:43,Test
321192,6334360,629124187_65258232847,586047816_14599436811_389185,step_2,2017-04-11 08:44:30,Test
321193,6334360,629124187_65258232847,586047816_14599436811_389185,step_1,2017-04-11 08:42:36,Test


In [33]:
# Total number of visits per variation to calculate the success rate later
number_of_visits = visits_variations.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_visits)


variation
Test       37122
Control    32181
Name: visit_id, dtype: int64

In [34]:
# sort by visit_id and date_time to see the process steps in order
visits_variations = visits_variations.sort_values(by=['visit_id', 'date_time'], ascending=[True, True])
display(visits_variations)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
142146,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,Test
142145,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,Test
315049,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test
315048,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test
315047,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test
...,...,...,...,...,...,...
127104,6627522,730634087_44272418812,999988789_76411676596_272843,start,2017-04-21 23:49:11,Test
127103,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:49:22,Test
127102,6627522,730634087_44272418812,999988789_76411676596_272843,step_2,2017-04-21 23:50:16,Test
127101,6627522,730634087_44272418812,999988789_76411676596_272843,step_1,2017-04-21 23:51:00,Test


In [35]:
# function to filter visits with our happy path (thanks chatGPT)
def filter_visits_with_happy_path(df):
    def check_sequence(group):
        steps = list(group['process_step'])
        return steps == ['start', 'step_1', 'step_2', 'step_3', 'confirm']

    df_filtered = df.groupby('visit_id').filter(check_sequence)
    return df_filtered

happy_paths = filter_visits_with_happy_path(visits_variations)

display(happy_paths)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
10201,2478628,754122351_18568832435,100022086_87870757897_149620,start,2017-05-23 20:44:01,Test
10200,2478628,754122351_18568832435,100022086_87870757897_149620,step_1,2017-05-23 20:44:23,Test
10199,2478628,754122351_18568832435,100022086_87870757897_149620,step_2,2017-05-23 20:45:08,Test
10198,2478628,754122351_18568832435,100022086_87870757897_149620,step_3,2017-05-23 20:46:01,Test
10197,2478628,754122351_18568832435,100022086_87870757897_149620,confirm,2017-05-23 20:47:01,Test
...,...,...,...,...,...,...
121023,4064969,110735946_42614968419,999985675_64610694964_443659,start,2017-04-20 09:45:18,Control
121022,4064969,110735946_42614968419,999985675_64610694964_443659,step_1,2017-04-20 09:46:26,Control
121021,4064969,110735946_42614968419,999985675_64610694964_443659,step_2,2017-04-20 09:47:04,Control
121020,4064969,110735946_42614968419,999985675_64610694964_443659,step_3,2017-04-20 09:48:45,Control


In [36]:
# calculate the time difference between each step within the same visit_id in seconds
happy_paths['time_taken'] = happy_paths.groupby('visit_id')['date_time'].diff().dt.total_seconds()
happy_paths['time_taken'] = happy_paths['time_taken'].fillna(0)

display(happy_paths)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,time_taken
10201,2478628,754122351_18568832435,100022086_87870757897_149620,start,2017-05-23 20:44:01,Test,0.0
10200,2478628,754122351_18568832435,100022086_87870757897_149620,step_1,2017-05-23 20:44:23,Test,22.0
10199,2478628,754122351_18568832435,100022086_87870757897_149620,step_2,2017-05-23 20:45:08,Test,45.0
10198,2478628,754122351_18568832435,100022086_87870757897_149620,step_3,2017-05-23 20:46:01,Test,53.0
10197,2478628,754122351_18568832435,100022086_87870757897_149620,confirm,2017-05-23 20:47:01,Test,60.0
...,...,...,...,...,...,...,...
121023,4064969,110735946_42614968419,999985675_64610694964_443659,start,2017-04-20 09:45:18,Control,0.0
121022,4064969,110735946_42614968419,999985675_64610694964_443659,step_1,2017-04-20 09:46:26,Control,68.0
121021,4064969,110735946_42614968419,999985675_64610694964_443659,step_2,2017-04-20 09:47:04,Control,38.0
121020,4064969,110735946_42614968419,999985675_64610694964_443659,step_3,2017-04-20 09:48:45,Control,101.0


In [37]:
# add a column to show the total time taken for each visit_id
happy_paths['total_time_taken'] = happy_paths.groupby('visit_id')['time_taken'].transform('sum')
display(happy_paths)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,time_taken,total_time_taken
10201,2478628,754122351_18568832435,100022086_87870757897_149620,start,2017-05-23 20:44:01,Test,0.0,180.0
10200,2478628,754122351_18568832435,100022086_87870757897_149620,step_1,2017-05-23 20:44:23,Test,22.0,180.0
10199,2478628,754122351_18568832435,100022086_87870757897_149620,step_2,2017-05-23 20:45:08,Test,45.0,180.0
10198,2478628,754122351_18568832435,100022086_87870757897_149620,step_3,2017-05-23 20:46:01,Test,53.0,180.0
10197,2478628,754122351_18568832435,100022086_87870757897_149620,confirm,2017-05-23 20:47:01,Test,60.0,180.0
...,...,...,...,...,...,...,...,...
121023,4064969,110735946_42614968419,999985675_64610694964_443659,start,2017-04-20 09:45:18,Control,0.0,1050.0
121022,4064969,110735946_42614968419,999985675_64610694964_443659,step_1,2017-04-20 09:46:26,Control,68.0,1050.0
121021,4064969,110735946_42614968419,999985675_64610694964_443659,step_2,2017-04-20 09:47:04,Control,38.0,1050.0
121020,4064969,110735946_42614968419,999985675_64610694964_443659,step_3,2017-04-20 09:48:45,Control,101.0,1050.0


In [38]:
# get the average time taken for each step
avg_time_taken = happy_paths.groupby('process_step', observed=False).agg({'time_taken':'mean'})
display(avg_time_taken)

# get the average total time taken 
avg_total_time_taken = happy_paths['total_time_taken'].mean()
display(avg_total_time_taken)

Unnamed: 0_level_0,time_taken
process_step,Unnamed: 1_level_1
start,0.0
step_1,31.447449
step_2,34.534905
step_3,93.414323
confirm,107.519375


266.91605180331254

In [39]:
# filter the happy paths without the outliers for the total time taken (1.25 times the average) 
happy_paths_filtered = happy_paths[happy_paths['total_time_taken'] <= avg_total_time_taken * 1.25]
display(happy_paths_filtered)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation,time_taken,total_time_taken
10201,2478628,754122351_18568832435,100022086_87870757897_149620,start,2017-05-23 20:44:01,Test,0.0,180.0
10200,2478628,754122351_18568832435,100022086_87870757897_149620,step_1,2017-05-23 20:44:23,Test,22.0,180.0
10199,2478628,754122351_18568832435,100022086_87870757897_149620,step_2,2017-05-23 20:45:08,Test,45.0,180.0
10198,2478628,754122351_18568832435,100022086_87870757897_149620,step_3,2017-05-23 20:46:01,Test,53.0,180.0
10197,2478628,754122351_18568832435,100022086_87870757897_149620,confirm,2017-05-23 20:47:01,Test,60.0,180.0
...,...,...,...,...,...,...,...,...
106782,829911,648229874_89449279372,999984454_18731538378_781808,start,2017-03-29 11:18:33,Test,0.0,154.0
106781,829911,648229874_89449279372,999984454_18731538378_781808,step_1,2017-03-29 11:18:51,Test,18.0,154.0
106780,829911,648229874_89449279372,999984454_18731538378_781808,step_2,2017-03-29 11:19:32,Test,41.0,154.0
106779,829911,648229874_89449279372,999984454_18731538378_781808,step_3,2017-03-29 11:20:23,Test,51.0,154.0


In [40]:
# Total number of success per variation
number_of_successes = happy_paths_filtered.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_successes)


variation
Test       8452
Control    6506
Name: visit_id, dtype: int64

In [41]:
# calculate the success rate per variation
success_rate = number_of_successes / number_of_visits
display(success_rate)

variation
Test       0.227682
Control    0.202169
Name: visit_id, dtype: float64

In [42]:
#frequency tables

## Analysis

In [46]:
# PROPORTION Z TEST ---- Proportion of successes is greater in Test group than in Control group
from statsmodels.stats.proportion import proportions_ztest
# H0: Proportion of complete steps in reasonable time in TEST group <= Proportion of complete steps in reasonable time in CONTROL group
# H1: Pt > Pc
alpha = 0.05
successes = [number_of_successes['Test'], number_of_successes['Control']]
just_visits = [number_of_visits['Test'], number_of_visits['Control']]
proportions_ztest(successes, just_visits, alternative = "larger")

(8.142009602251122, 1.9438520541870409e-16)

In [59]:
# TWO SAMPLE T TEST ---- Average total_time of success is smaller in Test group than in Control group
import scipy.stats as st
# H0: average time it took to complete steps in test group is greater or equal to one in control group 
#       (Mean_time_test>=Mean_time_control)
# H1: average time it took to complete steps is less in test group than in control group 
#       (Mean_time_test<Mean_time_control)
alpha=0.05
df_test = happy_paths_filtered[happy_paths_filtered['variation']=='Test']['total_time_taken']
df_control = happy_paths_filtered[happy_paths_filtered['variation']=='Control']['total_time_taken']
st.ttest_ind(df_test,df_control, equal_var=False, alternative = 'less')

TtestResult(statistic=-34.85801559519369, pvalue=3.182231652718646e-264, df=69122.95123661168)

In [53]:
average_time_per_variation = happy_paths_filtered.groupby('variation').agg({'total_time_taken': 'mean'})
average_time_per_variation

Unnamed: 0_level_0,total_time_taken
variation,Unnamed: 1_level_1
Test,170.967937
Control,188.676606


In [60]:
# average total time without outliers
avg_total_time = happy_paths_filtered['total_time_taken'].mean()
display(avg_total_time)

178.6703436288274

In [61]:
# function to filter out visits with start->confirm but no happy path in between them
def filter_non_happy_path_visits(df):
    def check_non_consecutive(group):
        steps = list(group['process_step'])
        if steps[0] == 'start' and steps[-1] == 'confirm':
            if steps != ['start', 'step_1', 'step_2', 'step_3', 'confirm']:
                return True
        return False

    df_filtered = df.groupby('visit_id').filter(check_non_consecutive)
    return df_filtered

non_happy_paths = filter_non_happy_path_visits(visits_variations)

display(non_happy_paths)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,variation
315049,7338123,612065484_94198474375,100019538_17884295066_43909,start,2017-04-09 16:20:56,Test
315048,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:12,Test
315047,7338123,612065484_94198474375,100019538_17884295066_43909,step_2,2017-04-09 16:21:21,Test
315046,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:35,Test
315045,7338123,612065484_94198474375,100019538_17884295066_43909,step_1,2017-04-09 16:21:41,Test
...,...,...,...,...,...,...
278943,4449968,842902495_57580498240,999976049_95772503197_182554,step_1,2017-04-04 12:52:26,Test
278942,4449968,842902495_57580498240,999976049_95772503197_182554,step_2,2017-04-04 12:53:01,Test
278941,4449968,842902495_57580498240,999976049_95772503197_182554,step_3,2017-04-04 12:54:54,Test
278940,4449968,842902495_57580498240,999976049_95772503197_182554,step_3,2017-04-04 12:59:35,Test


In [62]:
number_of_visits = visits_variations.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_visits)

variation
Test       37122
Control    32181
Name: visit_id, dtype: int64

In [63]:
number_of_visits_non_happy = non_happy_paths.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_visits_non_happy)

variation
Test       7133
Control    5233
Name: visit_id, dtype: int64

In [64]:
success_rate_non_happy = number_of_visits_non_happy / number_of_visits
display(success_rate_non_happy)
# proportion of lost people is greater in test group.

variation
Test       0.192150
Control    0.162611
Name: visit_id, dtype: float64

In [44]:
#TODO: consider binning / pd.cut / qcut for numerical data
#TODO: correlation matrix
#TODO: tukeys_test_outliers

In [45]:
# check back and forth between steps, lost?
# 

## Visualizations

## Conclusions