In [None]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

import pandas as pd
import numpy as np
import math
import itertools
import re
import json

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from src import analysis
from src import framework
from src import ab_tests
from src.framework import print_title

In [None]:
# Hot-reload the local project modules so edits under src/ are picked up
# without restarting the kernel.
import importlib

# Re-assign the returned module objects; this also keeps the last reload's
# module repr from being echoed as the cell output.
analysis = importlib.reload(analysis)
framework = importlib.reload(framework)
ab_tests = importlib.reload(ab_tests)

# `from src.framework import print_title` bound the pre-reload function object.
# importlib.reload does not rebind names imported that way, so refresh it here;
# otherwise the notebook would keep calling the stale implementation.
print_title = framework.print_title

# EXPERIMENT DESIGN

#### SOURCES INGESTION

In [None]:
# Read the raw CSV from the data folder (set `file_name` before running this cell)
file_name = ''
df_raw = pd.read_csv(f'../data/{file_name}')
display(df_raw.head())  # head() defaults to the first 5 rows

# Work on a separate copy so df_raw stays exactly as loaded
df = df_raw.copy()

# Show the dtype of every column before any conversion
print_title('INITIAL DATA TYPES')
print(df.dtypes)

#### METRIC SELECTION

In [None]:
# Build the experiment metrics. Naming convention: prefix continuous/count
# metrics with `metric_cnt_` and conversion (0/1) metrics with `metric_cvr_`.
# A later cell in this notebook derives the metric type from exactly these
# prefixes via str.startswith, so the trailing underscore matters.
df['metric_cnt_...'] = df['...']

# Conversion flag: 1 when the source value is > 0, otherwise 0 (missing values
# compare as False and therefore become 0, as before).
# Vectorized comparison replaces the row-wise apply; the previous extra
# positional `1` was not an axis (Series.apply has none) and has been dropped.
df['metric_cvr_...'] = df['...'].gt(0).astype('int64')

#### EXPERIMENT DESIGN

In [None]:
# Fix NumPy's legacy global seed, then snapshot the data used for sizing
np.random.seed(42)
sample_data_df = df.copy()

# Experiment parameters
effect_size = 0.1          # MDE relative percentage (0.02 = 2%)
daily_traffic = 10000      # Daily users
traffic_proportion = 0.5   # Proportion of daily users used for the experiment
metric_column = '...'
is_sample_data_available = True

# Required sample size: use the sample data when it is available, otherwise
# fall back to the Cohen-based helper. Both share the same alpha and power.
test_settings = dict(alpha=0.05, power=0.8)
if is_sample_data_available:
    sample_size = analysis.calculate_sample_size_var(sample_data_df, metric_column, effect_size, **test_settings)
else:
    sample_size = analysis.calculate_sample_size_cohen(effect_size, **test_settings)

# Experiment duration derived from the sample size and the available traffic
experiment_days_duration = analysis.calculate_experiment_duration(
    sample_size,
    daily_traffic,
    traffic_proportion,
)

print(
    f"Sample size: {sample_size} users per variant",
    f"Minimum days required: {experiment_days_duration}",
    sep="\n",
)

-----------------------------------------------------------------------------------------

# EXPERIMENT RESULTS

## SOURCES INGESTION

### LOAD

In [None]:
# Read the raw CSV from the data folder (replace the placeholder file name)
file_name = '....csv'
df_raw = pd.read_csv(f'../data/{file_name}')
display(df_raw.head())  # head() defaults to the first 5 rows

# Work on a separate copy so df_raw stays exactly as loaded
df = df_raw.copy()

# Show the dtype of every column before any conversion
print_title('INITIAL DATA TYPES')
print(df.dtypes)

### DTYPES

In [None]:
# Columns to cast, grouped by target dtype (replace the '...' placeholders)
datetime_columns = [
    '...',
    '...',
    '...',
]
int64_columns = [
    '...',
    '...',
    '...',
]
float64_columns = [
    '...',
    '...',
    '...',
]
str_columns = [
    '...',
    '...',
    '...',
]

# Hand the four column groups to the project's formatting helper
df = framework.format_columns(
    df,
    datetime_columns,
    int64_columns,
    float64_columns,
    str_columns,
)

# Show the resulting dtypes and a short preview of the frame
print_title('CONVERTED DATA TYPES')
print(df.dtypes)
display(df.head())  # head() defaults to the first 5 rows

## DATA CLEANING

### CHECKS

In [None]:
# Name of the column used to group rows by experiment variant (placeholder to fill in);
# the checks cell below counts rows per value of this column.
variant_column = '...'

In [None]:
# Quick structural and statistical overview of the data
print_title('DF INFO')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() rendered a stray "None" after the report.
df.info()

# Summary statistics (pandas' describe defaults)
print_title('DF DESCRIBE')
display(df.describe())

# Number of rows per variant
print_title('VARIANT DISTRIBUTION')
display(df[variant_column].value_counts())

### DUPLICATES

In [None]:
# Count rows that repeat an earlier row across all columns, then drop them
duplicated_rows = df.duplicated().sum()
print('# of duplicated rows: ', duplicated_rows)

if duplicated_rows == 0:
    print('No duplicates in the DataFrame found.')
else:
    # Keeps the first occurrence of each repeated row (drop_duplicates default)
    df = df.drop_duplicates()
    print('Duplicates in the DataFrame removed.')

In [None]:
primary_key_column = '...'
timestamp_column = ''  # leave empty when there is no timestamp to order duplicates by

# Every row whose primary key appears more than once (keep=False flags all occurrences)
duplicated_rows = df[df[primary_key_column].duplicated(keep=False)]
print(f'# of duplicated on {primary_key_column} column: {duplicated_rows[primary_key_column].nunique()}')

if not duplicated_rows.empty:
    print(f'Duplicated {primary_key_column} and their rows:')
    display(duplicated_rows.sort_values(by=primary_key_column))

    if timestamp_column == '':
        # No timestamp to order by: keep the last occurrence of each key in the
        # current row order. The message says "last" rather than "most recent"
        # because recency cannot be established here.
        df = df.drop_duplicates(subset=primary_key_column, keep='last')
        print('Kept the last row for each duplicated ' + primary_key_column)
    else:
        # Sort ascending by timestamp so the last occurrence of each key is the
        # most recent one. The stable sort keeps tied timestamps in their
        # original order, and na_position='first' stops rows with a missing
        # timestamp from being selected as the "most recent" row.
        df = (
            df.sort_values(timestamp_column, kind='mergesort', na_position='first')
              .drop_duplicates(subset=primary_key_column, keep='last')
        )
        print('Kept the most recent row for each duplicated ' + primary_key_column)

### NULLS

In [None]:
# Per-column count of missing entries
print_title('NUMBER OF NULL VALUES')
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Fill null values in the chosen column with 0 (replace the '...' placeholder with the column name)
df['...'] = df['...'].fillna(0)

## METRICS DEFINITION

In [None]:
# Build the experiment metrics. Naming convention: prefix continuous/count
# metrics with `metric_cnt_` and conversion (0/1) metrics with `metric_cvr_`.
# The data-analysis cell derives the metric type from exactly these prefixes
# via str.startswith, so the trailing underscore matters.
df['metric_cnt_...'] = df['...']

# Conversion flag: 1 when the source value is > 0, otherwise 0 (missing values
# compare as False and therefore become 0, as before).
# Vectorized comparison replaces the row-wise apply; the previous extra
# positional `1` was not an axis (Series.apply has none) and has been dropped.
df['metric_cvr_...'] = df['...'].gt(0).astype('int64')

## DATA ANALYSIS

In [None]:
# Select the key, variant and metric columns involved in the test
primary_key_column = '...'
metric_column = '...'
variant_column = '...'

columns_selection_df = df[[primary_key_column, variant_column, metric_column]]

# Metric type follows the column-name prefix; it stays None when neither
# prefix matches, and that None is passed on to the test configuration.
metric_type = (
    'continuous' if metric_column.startswith('metric_cnt_') else
    'proportion' if metric_column.startswith('metric_cvr_') else
    None
)

# NOTE(review): the meaning of the third argument (1) is defined inside
# analysis.remove_outliers and is not visible from this notebook - confirm.
outliers_filtered_df, is_strong_outlier_effect = analysis.remove_outliers(columns_selection_df, metric_column, 1)

# Toggle: analyse the outlier-filtered data (True) or the unfiltered selection (False)
filter_outliers = False
selected_df = (outliers_filtered_df if filter_outliers else columns_selection_df).copy()

# Preview up to 5 random rows. Capping n at the frame length avoids the
# ValueError that sample(5) raises on frames with fewer than 5 rows, and the
# fixed random_state keeps the preview identical across re-runs.
display(selected_df.sample(n=min(5, len(selected_df)), random_state=42))

### NUMBER OF VARIANTS

In [None]:
# Distinct variant labels present in the selected data
variant_labels = selected_df[variant_column].unique()
num_variants = selected_df[variant_column].nunique()

print(f"Number of Variants: {num_variants}")
print(f"Variants: {variant_labels}")

### SAMPLE SIZES

In [None]:
# Two sanity checks before testing: is every variant large enough, and how
# evenly are the rows split between the variants?

# Rows per variant
sample_sizes = selected_df[variant_column].value_counts()
print("Sample sizes per variant:", sample_sizes, sep="\n")

# Share of the total held by each variant
variant_proportion = sample_sizes.div(sample_sizes.sum())
print("\nProportion per variant:", variant_proportion, sep="\n")

# Imbalance ratio: largest variant size divided by the smallest
largest_variant, smallest_variant = sample_sizes.max(), sample_sizes.min()
variant_ratio = largest_variant / smallest_variant
print(f"\nVariant Ratio (N = max/min): {variant_ratio:.2f}")

### SAMPLE DISTRIBUTION

In [None]:
sns.set_style("white")

# Visual inspection: run each project plotting helper, in this order, on the same inputs
print_title('NORMAL DISTRIBUTION VISUAL ANALYSIS', 120)
distribution_plotters = (
    analysis.plot_qq,
    analysis.plot_histogram_kde,
    analysis.plot_violin,
    analysis.plot_combined_kde,
)
for draw_plot in distribution_plotters:
    draw_plot(selected_df, variant_column, metric_column)

# Numeric results, reduced to the flag that is later passed to the test configuration
print_title('NORMAL DISTRIBUTION TEST RESULTS', 120)
distribution_results = analysis.calculate_distribution(
    selected_df,
    variant_column,
    metric_column,
)
is_normal_distribution = analysis.set_normal_distribution_flag(
    distribution_results,
    alpha=0.05,
)
print(f'\nUSE NORMAL DISTRIBUTION TESTS: {is_normal_distribution}')

### SAMPLE VARIANCES

In [None]:
# Run the project's variance analysis on the selected data for the chosen
# variant and metric columns.
print_title('VARIANCE TEST RESULTS', 180)
variance_results = analysis.calculate_variance_analysis(selected_df, variant_column, metric_column)

# Reduce the variance results to a single flag (alpha=0.05); the flag is later
# passed to ab_tests.configure_ab_test.
# NOTE(review): presumably True means "variances can be treated as equal" - confirm in src/analysis.
is_equal_variance = analysis.set_equal_variance_flag(variance_results, alpha=0.05)
print(f'\nUSE EQUAL VARIANCE TESTS: {is_equal_variance}')

## STATISTICAL TESTING

### TEST AND TECHNIQUES SELECTION

In [None]:
# Build the test configuration from the flags and sizes computed in the cells above
print_title('TEST VARIABLES', 60)
ab_test_config = ab_tests.configure_ab_test(
    metric_type,
    is_equal_variance,
    is_normal_distribution,
    num_variants,
    variant_ratio,
    sample_sizes,
    is_strong_outlier_effect,
)
print('\n')

# Show only the configuration entries whose value is truthy
print_title('TEST SELECTION', 60)
active_settings = {name: setting for name, setting in ab_test_config.items() if setting}
print(active_settings)

### UNBALANCED DATA

In [None]:
# Pass the selected data through the project's resampling helper, driven by the
# test configuration and the variant column.
# NOTE(review): this overwrites selected_df with the helper's output, so re-running
# the cell feeds already-resampled data back in - re-run the data-analysis
# cell first if this cell has to be executed again.
selected_df = ab_tests.resample_data(selected_df, ab_test_config, variant_column)

### TESTS

In [None]:
# Run the configured test(s) and show the results returned by the helper
print_title('TEST RESULTS', 80)
standardized_results = ab_tests.run_complete_ab_test(
    ab_test_config,
    selected_df,
    variant_column,
    metric_column,
    num_variants,
    alpha=0.05,
)
display(standardized_results)

# Repeat the truthy configuration entries next to the results for reference
print_title('TEST SELECTION', 80)
enabled_settings = dict(filter(lambda entry: entry[1], ab_test_config.items()))
print(enabled_settings)

# NOTE(review): the last positional argument (0.05) is presumably the significance level - confirm in src/framework
framework.plot_distributions(selected_df, variant_column, metric_column, 0.05)

### ADDITIONAL TECHNIQUES

In [None]:
# Run the project's additional-tests helper with the same configuration, data,
# variant column and metric column, then render whatever it returns.
# NOTE(review): which techniques are applied is decided inside ab_tests.apply_additional_tests - see src/ab_tests.
additional_tests_results = ab_tests.apply_additional_tests(ab_test_config, selected_df, variant_column, metric_column)
display(additional_tests_results)