In [None]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('..')))

import pandas as pd
import numpy as np
import math
import itertools
import re
import json

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from src import analysis
from src import framework
from src import ab_tests
from src.framework import print_title

In [None]:
import importlib

# Reload the project modules so edits made on disk are picked up without
# restarting the kernel. Order matches the original import order.
for _module in (analysis, framework, ab_tests):
    importlib.reload(_module)

# `from src.framework import print_title` bound the name to the function object
# that existed before the reload; reload() does not refresh names imported with
# `from ... import ...`, so rebind it to the freshly reloaded implementation.
print_title = framework.print_title

# EXPERIMENT RESULTS

## SOURCES INGESTION

### LOAD

In [None]:
# Read the raw export from the data folder (fill in the file name before running).
file_name = '.csv'
raw_path = f'../data/{file_name}'
df_raw = pd.read_csv(raw_path)
display(df_raw.head(5))

# Keep df_raw untouched for reference; all later cells work on this copy.
df = df_raw.copy()

# Show the dtypes pandas inferred while reading the CSV.
print_title('INITIAL DATA TYPES')
print(df.dtypes)

### DTYPES

In [None]:
# Column names grouped by the dtype they are meant to have (placeholders to fill in).
datetime_columns = ['...', '...', '...']
int64_columns = ['...', '...', '...']
float64_columns = ['...', '...', '...']
str_columns = ['...', '...', '...']

# Hand the four groups to the project helper.
# NOTE(review): presumably it casts each group to the matching dtype — confirm in src/framework.
df = framework.format_columns(
    df,
    datetime_columns,
    int64_columns,
    float64_columns,
    str_columns,
)

# Show the dtypes after conversion together with a small preview.
print_title('CONVERTED DATA TYPES')
print(df.dtypes)
display(df.head(5))

## DATA CLEANING

### CHECKS

In [None]:
# Name of the column whose value counts are shown as the variant distribution
# in the checks below; replace the placeholder with the real column name.
variant_column = '...'

In [None]:
# Structural overview of the frame.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only appended a stray "None" to the output.
print_title('DF INFO')
df.info()

# Summary statistics of the numeric columns.
print_title('DF DESCRIBE')
display(df.describe())

# Number of rows per variant.
print_title('VARIANT DISTRIBUTION')
display(df[variant_column].value_counts())

### DUPLICATES

In [None]:
# Count rows that are exact copies of an earlier row (all columns equal).
duplicated_rows = df.duplicated().sum()
print('# of duplicated rows: ', duplicated_rows)

# Only rewrite the frame when there is something to remove.
if duplicated_rows == 0:
    print('No duplicates in the DataFrame found.')
else:
    df = df.drop_duplicates()
    print('Duplicates in the DataFrame removed.')

In [None]:
primary_key_column = '...'
# Optional: column used to decide which duplicate is the most recent.
# Leave empty to fall back to the current row order.
timestamp_column = ''

# All rows whose primary key appears more than once (every occurrence is kept
# here so the conflicting rows can be inspected side by side).
duplicated_rows = df[df[primary_key_column].duplicated(keep=False)]
print(f'# of duplicated on {primary_key_column} column: {duplicated_rows[primary_key_column].nunique()}')

if not duplicated_rows.empty:
    print(f'Duplicated {primary_key_column} and their rows:')
    display(duplicated_rows.sort_values(by=primary_key_column))

    if timestamp_column:
        # Keep the row with the latest timestamp per key.
        # - na_position='first': rows with a missing timestamp must not be
        #   treated as the most recent one (the default puts them last).
        # - kind='mergesort': stable sort, so ties keep their original order.
        # Note: this leaves df ordered by timestamp_column.
        df = (
            df.sort_values(timestamp_column, kind='mergesort', na_position='first')
            .drop_duplicates(subset=primary_key_column, keep='last')
        )
        print('Kept the most recent row for each duplicated ' + primary_key_column)
    else:
        # Without a timestamp there is no notion of "most recent": the last
        # occurrence in the current row order is kept.
        df = df.drop_duplicates(subset=primary_key_column, keep='last')
        print('Kept the last occurrence of each duplicated ' + primary_key_column)

### NULLS

In [None]:
# Report how many null entries each column contains.
print_title('NUMBER OF NULL VALUES')
missing_values = df.isna().sum()
print(missing_values)

In [None]:
# Replace nulls in the chosen column with 0 (substitute the placeholder with a
# real column name; repeat the line for every column that needs filling).
df['...'] = df['...'].fillna(0)

## METRICS DEFINITION

In [None]:
# Define the metric columns. The name prefix is significant: later cells derive
# the metric type from it ('metric_cnt_' -> continuous, 'metric_cvr_' -> proportion).
df['metric_cnt_...'] = df['...']

# Conversion flag: 1 when the source value is strictly positive, else 0
# (nulls compare as False and therefore become 0).
# Vectorised comparison replaces `.apply(lambda ..., 1)`: the stray positional
# `1` was not an axis (Series.apply has none) but landed on the deprecated
# `convert_dtype` argument.
df['metric_cvr_...'] = df['...'].gt(0).astype('int64')

## SEGMENT ANALYSIS

### METRICS AND CONFIG SELECTION

In [None]:
# Select metrics and columns involved in the test (replace the placeholders).
primary_key_column = '...'
metric_column = '...'
variant_column = '...'
segment_column = '...'

# Work on an independent copy so later helpers that add or modify columns do
# not operate on a view of df (avoids chained-assignment warnings).
columns_selection_df = df[[primary_key_column, variant_column, segment_column, metric_column]].copy()

# The metric type is derived from the naming convention of the metric column.
if metric_column.startswith('metric_cnt_'):
    metric_type = 'continuous'
elif metric_column.startswith('metric_cvr_'):
    metric_type = 'proportion'
else:
    # Kept as None for backward compatibility, but made visible: downstream
    # test configuration receives this value.
    metric_type = None
    print(f"WARNING: metric column '{metric_column}' has no 'metric_cnt_' / 'metric_cvr_' prefix; metric_type is None")

# Always computed, even when the filtered frame is not used, because
# is_strong_outlier_effect feeds the test configuration later on.
# NOTE(review): meaning of the third argument (1) is defined in src/analysis — confirm.
outliers_filtered_df, is_strong_outlier_effect = analysis.remove_outliers(columns_selection_df, metric_column, 1)

# Filter outliers: set to True to run the analysis on the outlier-filtered data.
filter_outliers = False

if filter_outliers:
    selected_df = outliers_filtered_df.copy()
else:
    selected_df = columns_selection_df.copy()

# Preview a few random rows; capped so small frames do not raise in sample().
display(selected_df.sample(min(5, len(selected_df))))

### DATA ANALYSIS AND STATISTICAL TESTING BY SEGMENTS

In [None]:
# Toggle for the project helper that adds a segment column.
# (Replaces the anonymous `if True:`; default keeps the step enabled.)
# NOTE(review): the name of the column the helper creates is defined in
# src/framework — confirm it matches segment_column.
add_segments = True

if add_segments:
    selected_df = framework.add_segment_column(selected_df, num_segments=4)

# Show a preview and the size rather than rendering the whole frame.
display(selected_df.head(10))
print(f'Rows: {len(selected_df)}')

In [None]:
# Distinct segment labels to analyse. Missing labels are dropped: `== NaN`
# never matches, so a NaN label would select an empty frame.
segments = selected_df[segment_column].dropna().unique()

# Run the full analysis + testing pipeline once per segment.
for segment in segments:
    print_title('SEGMENT: ' + str(segment), 160, '=')
    print_title('DATA ANALYSIS', 130, ':')
    segment_df = selected_df[selected_df[segment_column] == segment]

    ### NUMBER OF VARIANTS
    num_variants = segment_df[variant_column].nunique()

    print(f"Number of Variants: {num_variants}")
    print(f"Variants: {segment_df[variant_column].unique()}")

    ### SAMPLE SIZES
    sample_sizes = segment_df[variant_column].value_counts()
    print("Sample sizes per variant:")
    print(sample_sizes)

    variant_proportion = sample_sizes / sample_sizes.sum()
    print("\nProportion per variant:")
    print(variant_proportion)

    # Imbalance between the largest and the smallest variant in this segment.
    variant_ratio = sample_sizes.max() / sample_sizes.min()
    print(f"\nVariant Ratio (N = max/min): {variant_ratio:.2f}")

    ### SAMPLE DISTRIBUTION
    sns.set_style("white")
    print_title('NORMAL DISTRIBUTION VISUAL ANALYSIS', 100)
    analysis.plot_qq(segment_df, variant_column, metric_column)
    analysis.plot_histogram_kde(segment_df, variant_column, metric_column)
    analysis.plot_violin(segment_df, variant_column, metric_column)
    analysis.plot_combined_kde(segment_df, variant_column, metric_column)

    print_title('NORMAL DISTRIBUTION TEST RESULTS', 100)
    distribution_results = analysis.calculate_distribution(segment_df, variant_column, metric_column)

    is_normal_distribution = analysis.set_normal_distribution_flag(distribution_results, alpha=0.05)
    print(f'\nUSE NORMAL DISTRIBUTION TESTS: {is_normal_distribution}')

    ### SAMPLE VARIANCES
    print_title('VARIANCE TEST RESULTS', 100)
    variance_results = analysis.calculate_variance_analysis(segment_df, variant_column, metric_column)

    is_equal_variance = analysis.set_equal_variance_flag(variance_results, alpha=0.05)
    print(f'\nUSE EQUAL VARIANCE TESTS: {is_equal_variance}')

    ## STATISTICAL TESTING
    print_title('STATISICAL TESTING', 130, ':')
    ### TESTS AND TECHNIQUES SELECTION
    print_title('TEST VARIABLES', 100)
    ab_test_config = ab_tests.configure_ab_test(metric_type, is_equal_variance, is_normal_distribution, num_variants, variant_ratio, sample_sizes, is_strong_outlier_effect)
    print('\n')
    print_title('TEST SELECTION', 100)
    print({key: value for key, value in ab_test_config.items() if value})

    ### UNBALANCED DATA
    segment_df = ab_tests.resample_data(segment_df, ab_test_config, variant_column)

    ### TESTS
    # Everything below must use segment_df (this segment, after resampling).
    # Passing selected_df here made every segment report the results of the
    # whole population and silently discarded the resampling step.
    print_title('TEST RESULTS', 100)
    standardized_results = ab_tests.run_complete_ab_test(ab_test_config, segment_df, variant_column, metric_column, num_variants, alpha = 0.05)
    display(standardized_results)

    # Selected tests are repeated next to the results for easier reading.
    print_title('TEST SELECTION', 100)
    print({key: value for key, value in ab_test_config.items() if value})

    framework.plot_distributions(segment_df, variant_column, metric_column, 0.05)

    ### ADDITIONAL TECHNIQUES
    print_title('ADDITIONAL TECHNIQUES', 100)
    additional_tests_results = ab_tests.apply_additional_tests(ab_test_config, segment_df, variant_column, metric_column)
    display(additional_tests_results)

### INTERACTION TESTS

In [None]:
## Interaction tests: use when results differ between segments.
## Each flag names a test for the interaction effect between variant and segment.
## NOTE(review): these flags are not referenced by the cells visible here (the
## tests are picked via ab_tests.select_interaction_test) — confirm they are still needed.
use_anova_interaction_test_segmentation = False # continuous metric, normally distributed
use_welch_anova_interaction_test_segmentation = False # continuous metric, normally distributed, tolerates unbalanced groups
use_kruskal_wallis_interaction_test_segmentation = False # continuous metric, not normally distributed
use_logistic_regression_interaction_test_segmentation = False # proportion metric; counterpart of ANOVA for categorical data
# If a test is enabled, follow up with a post hoc test: Tukey, Games-Howell or Dunn.

In [None]:
# Ask the project helper which interaction tests to run for this data and
# metric type, then show its answer.
interaction_tests = ab_tests.select_interaction_test(
    selected_df,
    variant_column,
    metric_column,
    segment_column,
    metric_type,
)

print(interaction_tests)

In [None]:
# Execute the selected interaction tests on the full (all-segment) frame.
interaction_results = ab_tests.run_interaction_tests(
    selected_df,
    variant_column,
    metric_column,
    segment_column,
    interaction_tests,
)

# Print each test's name followed by its result on the next line.
for test_name, test_result in interaction_results.items():
    print(f"\n{test_name}:\n", test_result)