# VANGUARD AB TEST


## METADATA HELP

This comprehensive set of fields will guide your analysis, helping you unravel the intricacies of client behavior and preferences.

- **client_id**: Every client’s unique ID.
- **variation**: The experiment group the client was assigned to (`Test` or `Control`); empty for clients who were not part of the experiment.
- **visitor_id**: A unique ID for each client-device combination.
- **visit_id**: A unique ID for each web visit/session.
- **process_step**: Marks each step in the digital process.
- **date_time**: Timestamp of each web activity.
- **clnt_tenure_yr**: Represents how long the client has been with Vanguard, measured in years.
- **clnt_tenure_mnth**: Represents how long the client has been with Vanguard, measured in months.
- **clnt_age**: Indicates the age of the client.
- **gendr**: Specifies the client’s gender.
- **num_accts**: Denotes the number of accounts the client holds with Vanguard.
- **bal**: Gives the total balance spread across all accounts for a particular client.
- **calls_6_mnth**: Records the number of times the client reached out over a call in the past six months.
- **logons_6_mnth**: Reflects the frequency with which the client logged onto Vanguard’s platform over the last six months.


In [None]:
%load_ext autoreload
%autoreload 2 

In [1009]:
from cleaning import *
from mining import *
from db_handling import *
from analysis import *
import pandas as pd
from dotenv import load_dotenv
import os
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# Load environment variables
load_dotenv()

### Load Configuration

In [None]:
# Load config.yaml
config = parse_config()

## Data Mining

In [1012]:
# Creates a dictionary of all imported dataframes
dataframes = { name:import_data_from_config(config, name) for name in config['tables']}

## Data Cleaning

In [1013]:
#TODO: don't impose categories?

In [1014]:
# Rename columns
dataframes = rename_columns(dataframes, config)

In [1015]:
# Select columns
dataframes = select_columns(dataframes, config)

In [None]:
display_dataFrames(dataframes,'head','shape','describe')

### Separation

In [None]:
client_df = dataframes['clients']
experiment_df = dataframes['experiment']
visits_df = dataframes['visits']
display (client_df, experiment_df, visits_df)

In [None]:
display(visits_df.describe(include='all'))

In [None]:
# drop the nulls from clients, but keep the list of the drops

nulls_client_id = client_df[client_df.isna().any(axis=1)]['client_id']
nulls_client_id

In [None]:
client_df = client_df.dropna(axis=0)
client_df

In [None]:
display(experiment_df['variation'].value_counts(dropna = False))
# keep NaN for general analysis of clients, but drop them from everywhere for test analysis

In [None]:
# client_df, experiment_df, visits_df -> for general analysis
# new_client_df, new_experiment_df, new_visits_df -> for test/control analysis
# Collect the client_id of every experiment row that has at least one missing value,
# so those clients can be excluded from the test/control frames below.
nulls_in_experiment = experiment_df[experiment_df.isna().any(axis=1)]['client_id']
nulls_in_experiment

In [None]:
# new df removing client ID that are null in experiment

display(experiment_df.count())
new_experiment_df = experiment_df[~experiment_df['client_id'].isin(nulls_in_experiment)]
display(new_experiment_df.count())

new_experiment_df = new_experiment_df[~new_experiment_df['client_id'].isin(nulls_client_id)]
display(new_experiment_df.count())
display(new_experiment_df.isna().sum())



In [None]:
display(visits_df)
# Exclude visits from clients without an experiment assignment AND from clients
# whose rows were dropped from client_df for missing values, so the visits frame
# is filtered by the same two id lists as new_experiment_df.
excluded_from_test = (
    visits_df['client_id'].isin(nulls_in_experiment)
    | visits_df['client_id'].isin(nulls_client_id)
)
new_visits_df = visits_df[~excluded_from_test]
display(new_visits_df)
display(new_visits_df.isna().sum())

In [None]:
display(client_df)
new_client_df = client_df[~client_df['client_id'].isin(nulls_in_experiment)]
display(new_client_df)

### End separation

In [1026]:
dataframes['clients'] = new_client_df.copy()
dataframes['experiment'] = new_experiment_df.copy()
dataframes['visits'] = new_visits_df.copy()

In [1027]:
# Data Categorizing
dataframes = clean_categorical_data(dataframes, config)

In [1028]:
#Convert types
dataframes = convert_types(dataframes, config)

In [None]:
display_dataFrames(dataframes, 'head', 'shape', 'cat_count')

In [1030]:
client_df = dataframes['clients']
experiment_df = dataframes['experiment']
visits_df = dataframes['visits']

In [None]:
display(client_df.describe(include='all'))
display(client_df.dtypes)

### SQL EXPORT

In [1032]:
# Rebuild the SQL copy of the cleaned tables only when the config asks for it.
# NOTE(review): client_df / experiment_df / visits_df were bound before this cell and
# are not re-read from the reloaded `dataframes` in the visible code — confirm whether
# the later analysis is meant to use the SQL round-tripped frames.
if config['refresh_db']:

    # Password is read from the environment (populated by load_dotenv()); os.getenv returns None when unset
    db_password = os.getenv('SQL_PASSWORD')

    # Create database if it doesn't exist
    engine = create_db(db_password, config)

    # Export tables to database if refresh is set to true
    export_dataframes_to_sql(engine, dataframes)

    # Import data from database
    dataframes = import_all_tables_from_sql(engine)

### Local Caching

In [None]:
""" # Save files locally in an untracked folder
export_dataframes_to_csv(dataframes) """

In [None]:
#TODO CAREFUL DATA WONT BE PROPERLY CATEGORIZED / TYPED run after : convert_types(dataframes, config)
""" clients_df = pd.read_csv('data/cleaned/clients.csv')
experiment_df = pd.read_csv('data/cleaned/experiment.csv')
visits_df = pd.read_csv('data/cleaned/visits.csv') """

## CLEAN FRAMES

In [None]:
display('clients :',client_df, 'experiment :',experiment_df, 'visits :',visits_df)
display('clients :',client_df.describe(include='all'), 'experiment :',experiment_df.describe(include='all'), 'visits :',visits_df.describe(include='all'))
display('clients :',client_df.dtypes, 'experiment :',experiment_df.dtypes, 'visits :',visits_df.dtypes)

experiment_df['variation'].value_counts()


## Data Exploration

### Merge frames

In [None]:
# Added variation column to visits for easier analysis
visits_variations = visits_df.merge(experiment_df, on='client_id', how='inner')
display(visits_variations)
display(visits_variations.dtypes)

In [None]:
# Merge clients and visits for demographic analysis
visits_variations = visits_variations.merge(client_df, on='client_id', how='inner')
display(visits_variations)
display(visits_variations.dtypes)


### Sort visits by time

In [None]:
# sort by visit_id and date_time to see the process steps in order
visits_variations = visits_variations.sort_values(by=['visit_id', 'date_time'], ascending=[True, True])
display(visits_variations)


### Remove repeat and shared visits


In [None]:
# Keep exactly one visit per client: the visit that contains the client's earliest
# recorded step. The frame is ordered by visit_id, so taking the first row per
# client without re-sorting would pick the smallest visit_id, not the earliest visit.
first_visit = (
    visits_variations
    .sort_values(by=['client_id', 'date_time'], ascending=[True, True])
    .drop_duplicates(subset='client_id', keep='first')
)
# A visit_id that is the first visit of more than one client is a shared visit: drop it entirely
first_visit = first_visit.drop_duplicates(subset='visit_id', keep=False)
selected_visits = first_visit['visit_id']
display(first_visit)

# Row order of visits_variations (visit_id, date_time) is preserved by the boolean filter
visits_variations = visits_variations[visits_variations['visit_id'].isin(selected_visits)]
display(visits_variations)

### Remove non-starters

In [None]:
# First recorded row of every client (relies on the frame already being ordered by visit_id, date_time)
first_visit = visits_variations.drop_duplicates(subset='client_id', keep='first')
# Clients whose first recorded step is anything other than 'start'
non_starters = first_visit.loc[first_visit['process_step'].ne('start'), 'client_id']
# Remove every row belonging to those clients
visits_variations = visits_variations.loc[~visits_variations['client_id'].isin(non_starters)]

display(visits_variations)

## OUTLIERS

### Time taken

In [None]:
# Seconds elapsed between each row and the previous row of the same visit_id
# (rows are expected to be ordered by date_time within a visit at this point).
visits_variations['time_taken'] = visits_variations.groupby('visit_id')['date_time'].diff().dt.total_seconds()
# The first row of a visit has no predecessor, so diff() leaves it missing; count it as 0 seconds
visits_variations['time_taken'] = visits_variations['time_taken'].fillna(0)

display(visits_variations)

In [None]:
# add a column to show the total time taken for each visit_id
visits_variations['total_time_taken'] = visits_variations.groupby('visit_id')['time_taken'].transform('sum')
display(visits_variations)

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='variation', y='total_time_taken', data=visits_variations)
plt.xlabel('Variation')
plt.ylabel('Total Time Taken (seconds)')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(data=visits_variations, x='total_time_taken', kde=True, bins=300)
plt.xlabel('Total Time Taken (seconds)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Detect outlying total_time_taken values with the project helper.
# NOTE(review): assumes tukeys_test_outliers(..., method="show") returns the outlying
# values themselves (not a mask or index) — confirm against the helper's definition.
# NOTE(review): the series passed in has one entry per step row, so visits with more
# rows weigh more when the fences are computed — confirm this is intended.
variations_outliers = tukeys_test_outliers(visits_variations['total_time_taken'], method="show")
display(variations_outliers)
display(variations_outliers.describe())

# Drop every row whose total_time_taken equals one of the flagged values
visits_variations = visits_variations[~visits_variations['total_time_taken'].isin(variations_outliers)]
display(visits_variations)
display(visits_variations.describe(include='all'))
display(visits_variations.dtypes)


In [None]:
plt.figure(figsize=(10, 6))
sns.histplot(data=visits_variations, x='total_time_taken', kde=True, bins=50)
plt.xlabel('Total Time Taken (seconds)')
plt.ylabel('Frequency')
plt.show()

In [None]:
plt.figure(figsize=(10, 6))
sns.boxplot(x='variation', y='total_time_taken', data=visits_variations)
plt.xlabel('Variation')
plt.ylabel('Total Time Taken (seconds)')
plt.show()

visits_variations['total_time_taken'].describe()

### Correlation matrix

In [None]:
# One row per client, so each client contributes a single observation
visits_by_client_ids = visits_variations.drop_duplicates(subset='client_id', keep='first')
display(visits_by_client_ids)

# Numeric columns that enter the correlation analysis
numerical_data = visits_by_client_ids[[
    'client_age',
    'total_time_taken',
    'client_since_month',
    'number_of_accounts',
    'balance',
    'calls_6_months',
    'logons_6_month',
]]

# Pairwise correlations between the selected columns
correlation_matrix = numerical_data.corr()
display(correlation_matrix)

# Heatmap on a transparent figure and a transparent axes patch
plt.figure(figsize=(10, 8)).set_facecolor('none')
sns.heatmap(correlation_matrix, annot=True, cmap='Spectral_r', fmt='.2f').patch.set_alpha(0)

plt.show()

### Count visits

In [None]:
# Total number of visits per variation to calculate the success rate later
number_of_visits = visits_variations.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_visits)


### Time taken

In [None]:
# Average total time taken per variation, computed over visits.
# total_time_taken is repeated on every step row of a visit, so collapse to one row
# per visit first; otherwise visits with more recorded steps weigh more in the mean.
average_total_time_ = (
    visits_variations
    .drop_duplicates(subset='visit_id')
    .groupby('variation', observed=False)['total_time_taken']
    .mean()
)
display(average_total_time_)

## Happy Path

In [1051]:
def filter_visits_with_happy_path(
    df, happy_path=('start', 'step_1', 'step_2', 'step_3', 'confirm')
):
    """Keep only the visits whose recorded steps match the happy path exactly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'visit_id' and 'process_step'. Steps are
        compared in row order, so rows must already be ordered
        chronologically within each visit.
    happy_path : sequence of str, optional
        The exact step sequence that counts as a happy path. Defaults to
        start -> step_1 -> step_2 -> step_3 -> confirm.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` that belongs to a visit whose full step list
        equals ``happy_path`` (no repeats, no missing or extra steps).
    """
    expected_steps = list(happy_path)

    def check_sequence(group):
        # A visit qualifies only when its steps are exactly the expected sequence
        return list(group['process_step']) == expected_steps

    return df.groupby('visit_id').filter(check_sequence)


In [None]:
happy_paths = filter_visits_with_happy_path(visits_variations)
display(happy_paths)

In [None]:
# Total number of success per variation
number_of_successes = happy_paths.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_successes)

In [None]:
# calculate the success rate per variation
success_rate = number_of_successes / number_of_visits
display(success_rate)

## Confused Path

In [1055]:
def filter_non_happy_path_visits(
    df, happy_path=('start', 'step_1', 'step_2', 'step_3', 'confirm')
):
    """Keep visits that begin and end like the happy path but deviate in between.

    A visit is kept when its first step equals the first step of
    ``happy_path``, its last step equals the last step of ``happy_path``,
    and its full step list is not exactly ``happy_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'visit_id' and 'process_step'. Steps are
        read in row order, so rows must already be ordered chronologically
        within each visit.
    happy_path : sequence of str, optional
        The reference step sequence. Defaults to
        start -> step_1 -> step_2 -> step_3 -> confirm.

    Returns
    -------
    pandas.DataFrame
        Every row of ``df`` that belongs to a qualifying visit.

    Raises
    ------
    ValueError
        If ``happy_path`` is empty, since it then has no first or last step.
    """
    expected_steps = list(happy_path)
    if not expected_steps:
        raise ValueError("happy_path must contain at least one step")
    first_step, last_step = expected_steps[0], expected_steps[-1]

    def check_non_consecutive(group):
        steps = list(group['process_step'])
        # Same entry and exit as the happy path, but a different journey
        return (
            steps[0] == first_step
            and steps[-1] == last_step
            and steps != expected_steps
        )

    return df.groupby('visit_id').filter(check_non_consecutive)

In [None]:
confused_paths = filter_non_happy_path_visits(visits_variations)
display(confused_paths)

In [None]:
number_of_visits_non_happy = confused_paths.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_visits_non_happy)

In [None]:
success_rate_non_happy = number_of_visits_non_happy / number_of_visits
display(success_rate_non_happy)
# proportion of lost people is greater in test group.

## Dropped Path

In [None]:
without_happy_path = visits_variations[~visits_variations['visit_id'].isin(happy_paths['visit_id'])]

dropped_paths = without_happy_path[~without_happy_path['visit_id'].isin(confused_paths['visit_id'])]
display(dropped_paths)

In [None]:
number_of_visits_error_path = dropped_paths.groupby('variation', observed=False)['visit_id'].nunique()
display(number_of_visits_error_path)

In [None]:
success_rate_error_paths = number_of_visits_error_path / number_of_visits
display(success_rate_error_paths)

## EXPORT DATAFRAME FOR VISUALISATION

### Select unique ids

In [None]:
# One row per client. Take an explicit copy because the following cells add columns
# to this frame; assigning onto the un-copied result of a filter can raise
# SettingWithCopyWarning.
unique_clients_visits = visits_variations.drop_duplicates(subset='client_id').copy()
display(unique_clients_visits)

### Add path column

In [None]:
# Every client starts out labelled as 'dropped' ...
unique_clients_visits.loc[:, 'path'] = 'dropped'

# ... then clients whose visit is among the happy-path visits are relabelled ...
unique_clients_visits.loc[
    lambda frame: frame['visit_id'].isin(happy_paths['visit_id']), 'path'
] = 'happy'

# ... and clients whose visit is among the confused-path visits are relabelled last
unique_clients_visits.loc[
    lambda frame: frame['visit_id'].isin(confused_paths['visit_id']), 'path'
] = 'confused'

display(unique_clients_visits)


### Add helper columns

In [None]:
# Add a column to separate ages into quantile-based groups with similar sizes
# (three quantile bins, labelled from lowest to highest age)
unique_clients_visits['age_group'] = pd.qcut(unique_clients_visits['client_age'], q=3, labels=['Younger', 'Middle aged', 'Seniors'])
display(unique_clients_visits.groupby('age_group', observed=False)['client_age'].describe())

# Add a column to separate balance into quantile-based groups with similar sizes
# (three quantile bins, labelled from lowest to highest balance)
unique_clients_visits['balance_group'] = pd.qcut(unique_clients_visits['balance'], q=3, labels=['Low Balance', 'Medium Balance', 'High Balance'])
display(unique_clients_visits.groupby('balance_group', observed=False)['balance'].describe())

# Add a column 'is_active': True when calls_6_months >= 6 or logons_6_month >= 6
# NOTE(review): this comment previously read "called more than thrice or logged on more
# than six times", which does not match the thresholds in the code — confirm the intended cut-offs.
unique_clients_visits['is_active'] = (unique_clients_visits['calls_6_months'] >= 6) | (unique_clients_visits['logons_6_month'] >= 6)
display(unique_clients_visits['is_active'].value_counts())

### Export csv

In [1065]:
#export to csv
unique_clients_visits.to_csv('data/unique_clients_visits.csv', index=False)

## Analysis

In [None]:
# One-sided two-proportion z-test on happy-path visits
# H0: share of happy-path visits in TEST <= share of happy-path visits in CONTROL
# H1: Pt > Pc
alpha = 0.05

# Counts are ordered (Test, Control), so alternative="larger" tests Pt > Pc
successes = [number_of_successes[group] for group in ('Test', 'Control')]
just_visits = [number_of_visits[group] for group in ('Test', 'Control')]

happy_results = proportions_ztest(successes, just_visits, alternative="larger")
# z statistic first, then the p-value
display(*happy_results[:2])


In [None]:
# PROPORTION Z TEST ----Non Happy Paths---- (visits counted in number_of_visits_non_happy, i.e. the confused paths)
# H0: Proportion of confused-path visits in TEST group <= Proportion of confused-path visits in CONTROL group
# H1: Pt > Pc
alpha = 0.05

# Counts are ordered (Test, Control), so alternative = "larger" tests Pt > Pc
successes = [number_of_visits_non_happy['Test'], number_of_visits_non_happy['Control']]
just_visits = [number_of_visits['Test'], number_of_visits['Control']]

confused_results = proportions_ztest(successes, just_visits, alternative = "larger")
# Shows the first two elements of the result (presumably z statistic and p-value — confirm in statsmodels docs)
display(confused_results[0],confused_results[1])


In [None]:
# One-sided two-proportion z-test on dropped (error) path visits
# H0: share of dropped visits in TEST >= share of dropped visits in CONTROL
# H1: Pt < Pc
alpha = 0.05

# Counts are ordered (Test, Control), so alternative="smaller" tests Pt < Pc
successes = [number_of_visits_error_path[group] for group in ('Test', 'Control')]
just_visits = [number_of_visits[group] for group in ('Test', 'Control')]

error_results = proportions_ztest(successes, just_visits, alternative="smaller")
# z statistic first, then the p-value
display(*error_results[:2])


### T-Test for total time

In [None]:
# TWO SAMPLE T TEST (Welch) ---- Average total time of happy-path visits is smaller in Test than in Control
import scipy.stats as st
# H0: average time it took to complete the steps in the test group is greater than or equal to the one in the control group
#       (Mean_time_test>=Mean_time_control)
# H1: average time it took to complete the steps is less in the test group than in the control group
#       (Mean_time_test<Mean_time_control)
alpha = 0.05

# total_time_taken is repeated on every step row of a visit. Keep one row per visit so
# each visit is a single observation; otherwise the sample size is inflated by
# duplicated values and the p-value is overstated.
happy_visit_times = happy_paths.drop_duplicates(subset='visit_id')
df_test = happy_visit_times.loc[happy_visit_times['variation'] == 'Test', 'total_time_taken']
df_control = happy_visit_times.loc[happy_visit_times['variation'] == 'Control', 'total_time_taken']
st.ttest_ind(df_test, df_control, equal_var=False, alternative='less')

In [None]:
# Mean total time of happy-path visits per variation, one observation per visit
# (total_time_taken is repeated on every step row of a visit).
average_time_per_variation = (
    happy_paths
    .drop_duplicates(subset='visit_id')
    .groupby('variation', observed=False)
    .agg({'total_time_taken': 'mean'})
)
average_time_per_variation

In [None]:
# Average total time of happy-path visits (outliers were removed earlier), one observation per visit
avg_total_time = happy_paths.drop_duplicates(subset='visit_id')['total_time_taken'].mean()
display(avg_total_time)

In [None]:
# TWO SAMPLE T TEST (Welch) ---- Average total time of confused-path visits is smaller in Test than in Control
import scipy.stats as st
# H0: average time of confused-path visits in the test group is greater than or equal to the one in the control group
#       (Mean_time_test>=Mean_time_control)
# H1: average time of confused-path visits is less in the test group than in the control group
#       (Mean_time_test<Mean_time_control)
alpha = 0.05

# total_time_taken is repeated on every step row of a visit, and confused visits have
# varying numbers of rows. Keep one row per visit so each visit is a single observation;
# otherwise long visits are over-weighted and the sample size is inflated.
confused_visit_times = confused_paths.drop_duplicates(subset='visit_id')
df_test = confused_visit_times.loc[confused_visit_times['variation'] == 'Test', 'total_time_taken']
df_control = confused_visit_times.loc[confused_visit_times['variation'] == 'Control', 'total_time_taken']
st.ttest_ind(df_test, df_control, equal_var=False, alternative='less')

In [None]:
# Mean total time of confused-path visits per variation, one observation per visit.
# Without collapsing to one row per visit, visits with more recorded steps would
# weigh more in the mean because total_time_taken is repeated on every step row.
average_time_per_variation = (
    confused_paths
    .drop_duplicates(subset='visit_id')
    .groupby('variation', observed=False)
    .agg({'total_time_taken': 'mean'})
)
average_time_per_variation

In [None]:
# Average total time of confused-path visits (outliers were removed earlier), one observation per visit
avg_total_time = confused_paths.drop_duplicates(subset='visit_id')['total_time_taken'].mean()
display(avg_total_time)

In [1075]:
#TODO: funnel analysis
#TODO: stats on last step
#TODO: analyze repeat visits?

In [1076]:
# STORY:

# 1. The data had the journeys of 108216 clients in 130607 visits.
    # We selected 

## Visualizations

## Conclusions

## Q

In [1077]:
# hypothesis testing interpretation
# python debugging
# config file
# SQL to tableau
# boxplot not all outliers