# VANGUARD AB TEST


## METADATA HELP

This comprehensive set of fields will guide your analysis, helping you unravel the intricacies of client behavior and preferences.

- **client_id**: Every client’s unique ID.
- **variation**: Indicates if a client was part of the experiment.
- **visitor_id**: A unique ID for each client-device combination.
- **visit_id**: A unique ID for each web visit/session.
- **process_step**: Marks each step in the digital process.
- **date_time**: Timestamp of each web activity.
- **clnt_tenure_yr**: Represents how long the client has been with Vanguard, measured in years.
- **clnt_tenure_mnth**: Further breaks down the client’s tenure with Vanguard in months.
- **clnt_age**: Indicates the age of the client.
- **gendr**: Specifies the client’s gender.
- **num_accts**: Denotes the number of accounts the client holds with Vanguard.
- **bal**: Gives the total balance spread across all accounts for a particular client.
- **calls_6_mnth**: Records the number of times the client reached out over a call in the past six months.
- **logons_6_mnth**: Reflects the frequency with which the client logged onto Vanguard’s platform over the last six months.


In [123]:
%load_ext autoreload
%autoreload 2 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [124]:
import pandas as pd
from cleaning import *
from mining import *
from dotenv import load_dotenv
import os
import yaml

In [125]:
# Load environment variables
load_dotenv()

True

In [126]:
# Load config
config = parse_config()

{'tables': {'clients': {'paths': ['data/df_final_demo.txt'], 'separator': ',', 'columns': {'client_id': {'original_name': 'client_id', 'data_type': 'INTEGER', 'primary_key': True, 'pandas_dtype': 'int64'}, 'client_since_year': {'original_name': 'clnt_tenure_yr', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'client_since_month': {'original_name': 'clnt_tenure_mnth', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'client_age': {'original_name': 'clnt_age', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'gender': {'original_name': 'gendr', 'data_type': 'CHAR(1)', 'pandas_dtype': 'category', 'valid_categories': ['U', 'M', 'F'], 'fallback_category': 'U'}, 'number_of_accounts': {'original_name': 'num_accts', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'balance': {'original_name': 'bal', 'data_type': 'FLOAT', 'pandas_dtype': 'float64'}, 'calls_6_months': {'original_name': 'calls_6_mnth', 'data_type': 'FLOAT', 'pandas_dtype': 'int64'}, 'logons_6_month': {'original_name': 'logons_6_mnt

## Data Mining

In [127]:
#TODO: adapt function to remote url + save to sql to prevent large repo

In [128]:
dataFrames = { name:import_data_from_config(config, name) for name in config['tables']}

In [129]:
display_dataFrames(dataFrames)

'clients'

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,12.0,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,16.0,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,21.0,255.0,68.0,M,3.0,475114.69,4.0,4.0


'experiment'

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


'visits'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


## Data Cleaning

In [130]:
#TODO: don't impose categories?
#TODO: consider binning / pd.cut / qcut for numerical data

In [131]:
# Rename columns
dataFrames = rename_columns(dataFrames, config)

In [132]:
# select columns
dataFrames = select_columns(dataFrames, config)
display_dataFrames(dataFrames)

'clients'

Unnamed: 0,client_id,client_since_year,client_since_month,client_age,gender,number_of_accounts,balance,calls_6_months,logons_6_month
0,836976,6.0,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,12.0,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,16.0,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,21.0,255.0,68.0,M,3.0,475114.69,4.0,4.0


'experiment'

Unnamed: 0,client_id,variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


'visits'

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
755400,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
755401,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
755402,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
755403,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [133]:
# Data Categorizing
def clean_categorical_data(dataFrames, config):
    #TODO : default values for categories if no valid_categories
    for table in config['tables']:
        for column in config['tables'][table]['columns']:

            column_config = config['tables'][table]['columns'][column]
            valid_categories = column_config.get('valid_categories')

            if valid_categories:
                dataFrames[table][column] = dataFrames[table][column].astype('category')
                dataFrames[table][column] = dataFrames[table][column].cat.set_categories(valid_categories)

                fallback = column_config.get('fallback_category')
                if fallback:
                    dataFrames[table][column] = dataFrames[table][column].fillna(fallback)
                else:
                    dataFrames[table][column] = dataFrames[table][column].fillna('unknown')
    return dataFrames


dataFrames = clean_categorical_data(dataFrames, config)

display_dataFrames(dataFrames)
display(dataFrames['experiment'].isna().sum())

TypeError: Cannot setitem on a Categorical with a new category (unknown), set the categories first

In [85]:
# Data formatting

In [86]:
# Data Typing

In [87]:
# Handle duplicates

In [88]:
# Handle missing values

## Data Exploration

In [89]:
# Handle outliers

In [90]:
#frequency tables

## Analysis

## Visualizations

## Conclusions