In [18]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import chi2_contingency
from statsmodels.stats.multitest import multipletests
from typing import Dict, List, Tuple, Optional
import warnings

warnings.filterwarnings("ignore",category=UserWarning)

# Try to import validation module
try:
    from validation import ExperimentValidator
    VALIDATION_AVAILABLE = True
except ImportError:
    VALIDATION_AVAILABLE = False
    warnings.warn("Validation module not available. Skipping validation checks.")


class ABTestAnalyzer:
    
    def __init__(self, alpha: float = 0.05):
        self.alpha = alpha
        if VALIDATION_AVAILABLE:
            self.validator = ExperimentValidator(srm_threshold=0.001)  # Stricter for SRM
        else:
            self.validator = None
    
    def calculate_sample_size(self,
                            baseline_rate: float,
                            mde: float,
                            alpha: float = 0.05,
                            power: float = 0.80,
                            two_tailed: bool = True) -> int:
        
        if two_tailed:
            z_alpha = stats.norm.ppf(1 - alpha/2)
        else:
            z_alpha = stats.norm.ppf(1 - alpha)
        
        z_beta = stats.norm.ppf(power)
  
        p1 = baseline_rate
        p2 = baseline_rate * (1 + mde)
        
        
        p2 = min(p2, 0.999)
        
        numerator = (z_alpha + z_beta) ** 2 * (p1 * (1 - p1) + p2 * (1 - p2))
        denominator = (p2 - p1) ** 2
        
        n = numerator / denominator
        
        return int(np.ceil(n))
    
    def two_sample_ttest(self,
                        control: np.ndarray,
                        treatment: np.ndarray,
                        metric_name: str,
                        equal_var: bool = False) -> Dict:
        
        control = control[~np.isnan(control)]
        treatment = treatment[~np.isnan(treatment)]
        
        control_mean = control.mean()
        treatment_mean = treatment.mean()
        control_std = control.std(ddof=1)
        treatment_std = treatment.std(ddof=1)
        n_control = len(control)
        n_treatment = len(treatment)
        
        statistic, pvalue = stats.ttest_ind(treatment, control, equal_var=equal_var)
        
        pooled_std = np.sqrt((control_std**2 + treatment_std**2) / 2)
        cohens_d = (treatment_mean - control_mean) / pooled_std if pooled_std > 0 else 0
        
        se_diff = np.sqrt(control_std**2/n_control + treatment_std**2/n_treatment)
        
        if not equal_var:
            num = (control_std**2/n_control + treatment_std**2/n_treatment)**2
            denom = ((control_std**2/n_control)**2/(n_control-1) + 
                    (treatment_std**2/n_treatment)**2/(n_treatment-1))
            df = num / denom if denom > 0 else n_control + n_treatment - 2
        else:
            df = n_control + n_treatment - 2
        
        t_crit = stats.t.ppf(1 - self.alpha/2, df)
        diff = treatment_mean - control_mean
        ci_lower = diff - t_crit * se_diff
        ci_upper = diff + t_crit * se_diff
        
        relative_lift_pct = (diff / control_mean * 100) if control_mean != 0 else 0
        
        return {
            'metric': metric_name,
            'test_type': 't-test',
            'statistic': statistic,
            'pvalue': pvalue,
            'significant': pvalue < self.alpha,
            'control_mean': control_mean,
            'treatment_mean': treatment_mean,
            'control_std': control_std,
            'treatment_std': treatment_std,
            'absolute_diff': diff,
            'relative_lift_pct': relative_lift_pct,
            'cohens_d': cohens_d,
            'effect_interpretation': self._interpret_cohens_d(cohens_d),
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'n_control': n_control,
            'n_treatment': n_treatment,
            'degrees_of_freedom': df
        }
    
    def proportion_test(self,
                       control_successes: int,
                       control_total: int,
                       treatment_successes: int,
                       treatment_total: int,
                       metric_name: str) -> Dict:
        
        p_control = control_successes / control_total
        p_treatment = treatment_successes / treatment_total
        
        p_pooled = (control_successes + treatment_successes) / (control_total + treatment_total)
        
        se = np.sqrt(p_pooled * (1 - p_pooled) * (1/control_total + 1/treatment_total))
        
        z_stat = (p_treatment - p_control) / se if se > 0 else 0
        
        pvalue = 2 * (1 - stats.norm.cdf(abs(z_stat)))
        
        se_diff = np.sqrt(p_control*(1-p_control)/control_total + 
                         p_treatment*(1-p_treatment)/treatment_total)
        z_crit = stats.norm.ppf(1 - self.alpha/2)
        diff = p_treatment - p_control
        ci_lower = diff - z_crit * se_diff
        ci_upper = diff + z_crit * se_diff
        
        relative_lift_pct = (diff / p_control * 100) if p_control > 0 else 0
        
        return {
            'metric': metric_name,
            'test_type': 'proportion_test',
            'statistic': z_stat,
            'pvalue': pvalue,
            'significant': pvalue < self.alpha,
            'control_rate': p_control,
            'treatment_rate': p_treatment,
            'absolute_diff': diff,
            'relative_lift_pct': relative_lift_pct,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'n_control': control_total,
            'n_treatment': treatment_total
        }
    ## step 2

    def chi_square_test(self,
                       control: np.ndarray,
                       treatment: np.ndarray,
                       metric_name: str) -> Dict:
        
        combined = np.concatenate([control, treatment])
        labels = np.concatenate([np.zeros(len(control)), np.ones(len(treatment))])
        
        contingency_table = pd.crosstab(combined, labels)
        
        chi2, pvalue, dof, expected = chi2_contingency(contingency_table)

        n = len(combined)
        min_dim = min(contingency_table.shape[0], contingency_table.shape[1]) - 1
        cramers_v = np.sqrt(chi2 / (n * min_dim)) if min_dim > 0 else 0
        
        return {
            'metric': metric_name,
            'test_type': 'chi_square',
            'statistic': chi2,
            'pvalue': pvalue,
            'significant': pvalue < self.alpha,
            'degrees_of_freedom': dof,
            'cramers_v': cramers_v,
            'effect_interpretation': self._interpret_cramers_v(cramers_v),
            'n_control': len(control),
            'n_treatment': len(treatment)
        }
    
    def mann_whitney_u_test(self,
                           control: np.ndarray,
                           treatment: np.ndarray,
                           metric_name: str) -> Dict:

        

        control = control[~np.isnan(control)]
        treatment = treatment[~np.isnan(treatment)]
        

        statistic, pvalue = stats.mannwhitneyu(treatment, control, alternative='two-sided')
        

        n1 = len(control)
        n2 = len(treatment)
        rank_biserial = 1 - (2*statistic) / (n1 * n2)
        

        control_median = np.median(control)
        treatment_median = np.median(treatment)
        
        return {
            'metric': metric_name,
            'test_type': 'mann_whitney',
            'statistic': statistic,
            'pvalue': pvalue,
            'significant': pvalue < self.alpha,
            'control_median': control_median,
            'treatment_median': treatment_median,
            'rank_biserial': rank_biserial,
            'n_control': n1,
            'n_treatment': n2
        }
    
    def bootstrap_confidence_interval(self,
                                     control: np.ndarray,
                                     treatment: np.ndarray,
                                     metric_name: str,
                                     n_bootstrap: int = 10000,
                                     confidence_level: float = 0.95) -> Dict:
        
        np.random.seed(42)
        

        control = control[~np.isnan(control)]
        treatment = treatment[~np.isnan(treatment)]
        
 
        boot_diffs = []
        for _ in range(n_bootstrap):
            control_boot = np.random.choice(control, size=len(control), replace=True)
            treatment_boot = np.random.choice(treatment, size=len(treatment), replace=True)
            boot_diffs.append(treatment_boot.mean() - control_boot.mean())
        
        boot_diffs = np.array(boot_diffs)
        

        alpha_bootstrap = 1 - confidence_level
        ci_lower = np.percentile(boot_diffs, alpha_bootstrap/2 * 100)
        ci_upper = np.percentile(boot_diffs, (1 - alpha_bootstrap/2) * 100)
        
 
        observed_diff = treatment.mean() - control.mean()

        significant = not (ci_lower <= 0 <= ci_upper)
        
        return {
            'metric': metric_name,
            'test_type': 'bootstrap',
            'observed_diff': observed_diff,
            'ci_lower': ci_lower,
            'ci_upper': ci_upper,
            'significant': significant,
            'confidence_level': confidence_level,
            'n_bootstrap': n_bootstrap
        }
    
    def multiple_testing_correction(self,
                                   p_values: List[float],
                                   method: str = 'holm') -> Dict:


        reject, pvals_corrected, alphacSidak, alphacBonf = multipletests(
            p_values, 
            alpha=self.alpha, 
            method=method
        )
        
        fwer_uncorrected = 1 - (1 - self.alpha) ** len(p_values)
        
        return {
            'method': method,
            'original_pvalues': p_values,
            'corrected_pvalues': pvals_corrected.tolist(),
            'reject': reject.tolist(),
            'fwer_uncorrected': fwer_uncorrected,
            'num_tests': len(p_values),
            'num_significant_uncorrected': sum(p < self.alpha for p in p_values),
            'num_significant_corrected': sum(reject)
        }
        # --- Logic step 3
    def _check_normality(self, data):
        # Sesuai instruksi: Shapiro-Wilk
        _, p_val = stats.shapiro(data)
        return p_val > self.alpha

    def analyze_test(self, df, metrics_dict):
        # Logic: Pilih test otomatis
        results = []
        for metric, m_type in metrics_dict.items():
            control = df[df['variant'] == df['variant'].unique()[0]][metric].values
            treatment = df[df['variant'] == df['variant'].unique()[1]][metric].values
            
            if m_type == 'binary':
                # Panggil fungsi mentor: proportion_test
                res = self.proportion_test(sum(control), len(control), sum(treatment), len(treatment), metric)
            else:
                # Cek normalitas dulu
                if self._check_normality(df[metric].values):
                    # Panggil fungsi mentor: two_sample_ttest
                    res = self.two_sample_ttest(control, treatment, metric)
                else:
                    # Panggil fungsi mentor: mann_whitney_u_test
                    res = self.mann_whitney_u_test(control, treatment, metric)
            results.append(res)
        return pd.DataFrame(results)

AB Test Analyzer test1_menu.csv

In [19]:
# 1. Load Data
df1 = pd.read_csv(r"D:\@02 Personal\@03 Mini Data Project\@07 DEC Portfolio\data\raw\test1_menu.csv", sep='^')
df1.columns = df1.columns.str.strip()

# 2. Inisialisasi
analyzer = ABTestAnalyzer()

# 3. Definisikan Metrik (Cukup list metrik dan jenisnya saja)
file1_metrics = {
    'added_to_cart': 'binary',
    'revenue': 'continuous',
    'pages_viewed': 'continuous'
}

# 4. JALANKAN ANALISIS (Satu perintah untuk semua)
report_file1 = analyzer.analyze_test(df1, file1_metrics)

# 5. Tampilkan Hasil Akhir
display(report_file1)


Unnamed: 0,metric,test_type,statistic,pvalue,significant,control_rate,treatment_rate,absolute_diff,relative_lift_pct,ci_lower,ci_upper,n_control,n_treatment,control_median,treatment_median,rank_biserial
0,added_to_cart,proportion_test,-14.68221,0.0,True,0.961714,0.862286,-0.099429,-10.338681,-0.112496,-0.086362,3500,3500,,,
1,revenue,mann_whitney,5653101.0,2.377455e-08,True,,,,,,,3500,3500,2.862354,2.602078,0.077045
2,pages_viewed,mann_whitney,5970453.0,0.06748254,False,,,,,,,3500,3500,2.171813,2.129282,0.025232


AB Test Analyzer test2_novelty_slider.csv

In [20]:
# PROSES FILE 2
df2 = pd.read_csv(r"D:\@02 Personal\@03 Mini Data Project\@07 DEC Portfolio\data\raw\test2_novelty_slider.csv", sep=',') # Sesuaikan path & sep
df2.columns = df2.columns.str.strip()

# Panggil class ABTestAnalyzer()
analyzer = ABTestAnalyzer()
metrics_file2 = {
    'is_registered': 'binary',
    'novelty_revenue': 'continuous',
    'products_added_from_novelties': 'continuous'
}
report_file2 = analyzer.analyze_test(df2, metrics_file2)
display(report_file2)

Unnamed: 0,metric,test_type,statistic,pvalue,significant,control_rate,treatment_rate,absolute_diff,relative_lift_pct,ci_lower,ci_upper,n_control,n_treatment,control_median,treatment_median,rank_biserial
0,is_registered,proportion_test,-0.1271251,0.8988414,False,0.450625,0.449625,-0.001,-0.221914,-0.016418,0.014418,8000,8000,,,
1,novelty_revenue,mann_whitney,33612110.0,3.418849e-08,True,,,,,,,8000,8000,3.773003,3.981853,-0.050378
2,products_added_from_novelties,mann_whitney,32136000.0,7.735514e-06,True,,,,,,,8000,8000,0.0,0.0,-0.00425


AB Test Analyzer File 3 test3_product_sliders.csv

In [21]:
#Proses File 3
df3 = pd.read_csv(r"D:\@02 Personal\@03 Mini Data Project\@07 DEC Portfolio\data\raw\test3_product_sliders.csv", sep=',') # Sesuaikan path & sep
df3.columns = df3.columns.str.strip()

# Panggil class ABTestAnalyzer()
analyzer = ABTestAnalyzer()
metrics_file3 = {
    'add_to_cart_rate': 'continuous',
    'slider_interactions': 'continuous',
    'revenue_from_recommendations': 'continuous',
    'products_per_order': 'continuous',
    'avg_product_price': 'continuous'
}
report_file3 = analyzer.analyze_test(df3, metrics_file3)
display(report_file3)

Unnamed: 0,metric,test_type,statistic,pvalue,significant,control_median,treatment_median,rank_biserial,n_control,n_treatment
0,add_to_cart_rate,mann_whitney,18000000.0,1.0,False,0.0,0.0,0.0,6000,6000
1,slider_interactions,mann_whitney,18324101.5,0.08053754,False,2.0,2.0,-0.018006,6000,6000
2,revenue_from_recommendations,mann_whitney,20833813.0,1.9542269999999997e-50,True,3.716052,4.504417,-0.157434,6000,6000
3,products_per_order,mann_whitney,16760421.5,6.451671e-11,True,3.156163,3.053662,0.068865,6000,6000
4,avg_product_price,mann_whitney,21220380.0,1.3183549999999998e-64,True,3.010336,3.419365,-0.17891,6000,6000


AB Test Analyzer File 4 test4_reviews.csv

In [15]:
#Proses File 4
df4 = pd.read_csv(r"D:\@02 Personal\@03 Mini Data Project\@07 DEC Portfolio\data\raw\test4_reviews.csv", sep=',') # Sesuaikan path & sep
df4.columns = df4.columns.str.strip()

# Panggil class ABTestAnalyzer()
analyzer = ABTestAnalyzer()
metrics_file4 = {
   'converted': 'binary',
    'added_to_cart': 'binary'
}
report_file4 = analyzer.analyze_test(df4, metrics_file4)
display(report_file4)

Unnamed: 0,metric,test_type,statistic,pvalue,significant,control_rate,treatment_rate,absolute_diff,relative_lift_pct,ci_lower,ci_upper,n_control,n_treatment
0,converted,proportion_test,0.284027,0.776389,False,0.106667,0.107524,0.000857,0.803571,-0.005058,0.006772,21000,21000
1,added_to_cart,proportion_test,1.192175,0.233193,False,0.826762,0.831143,0.004381,0.529893,-0.002821,0.011583,21000,21000


AB Test Analyzer File 5 test5_search_engine

In [22]:
#Proses File 5
df5 = pd.read_csv(r"D:\@02 Personal\@03 Mini Data Project\@07 DEC Portfolio\data\raw\test5_search_engine.csv", sep=',') # Sesuaikan path & sep
df5.columns = df5.columns.str.strip()

# Panggil class ABTestAnalyzer()
analyzer = ABTestAnalyzer()
metrics_file5 = {
   'interacted_with_search': 'binary',
    'added_to_cart': 'binary',
    'converted': 'binary',
    'avg_revenue_per_visitor': 'continuous'
}
report_file5 = analyzer.analyze_test(df5, metrics_file5)
display(report_file5)

Unnamed: 0,metric,test_type,statistic,pvalue,significant,control_rate,treatment_rate,absolute_diff,relative_lift_pct,ci_lower,ci_upper,n_control,n_treatment,control_median,treatment_median,rank_biserial
0,interacted_with_search,proportion_test,-0.7468953,0.455127,False,0.349368,0.344211,-0.005158,-1.476348,-0.018693,0.008377,9500,9500,,,
1,added_to_cart,proportion_test,3.199683,0.001376,True,0.898737,0.912316,0.013579,1.510892,0.005263,0.021894,9500,9500,,,
2,converted,proportion_test,0.894315,0.371153,False,0.066211,0.069474,0.003263,4.928458,-0.003888,0.010414,9500,9500,,,
3,avg_revenue_per_visitor,mann_whitney,45481190.0,0.346066,False,,,,,,,9500,9500,0.692879,0.69305,-0.007893
