<a href="https://colab.research.google.com/github/DiogoMondin/ab-test-analysis/blob/main/T%26D_AB_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
import kagglehub
import os

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import scipy.stats as stats
import statsmodels.stats.proportion as proportion
import statsmodels.stats.power as power
from statsmodels.stats.proportion import proportion_effectsize

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import warnings
# Ignore every Python warning from here on (presumably to keep cell output uncluttered).
# NOTE(review): this also hides deprecation/convergence warnings from the stats libraries.
warnings.filterwarnings('ignore')

 # Data Input

In [2]:
# Download the dataset through kagglehub; `path` is the local directory it returns.
path = kagglehub.dataset_download("faviovaz/marketing-ab-testing")

# Walk the download directory and print the full path of every file found,
# so the name and location of the CSV are visible in the cell output.
for folder, _subfolders, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(folder, filename))

/kaggle/input/marketing-ab-testing/marketing_AB.csv


# Data Exploration

In [3]:
# Build the CSV location from the directory returned by kagglehub instead of a
# hardcoded "/kaggle/input/..." path, so the notebook also runs where the
# dataset is cached somewhere else (e.g. a local machine).
ab_test_pd = pd.read_csv(os.path.join(path, "marketing_AB.csv"))

# Summary statistics of the numeric columns (last expression -> rich display).
ab_test_pd.describe()

Unnamed: 0.1,Unnamed: 0,user id,total ads,most ads hour
count,588101.0,588101.0,588101.0,588101.0
mean,294050.0,1310692.0,24.820876,14.469061
std,169770.279667,202226.0,43.715181,4.834634
min,0.0,900000.0,1.0,0.0
25%,147025.0,1143190.0,4.0,11.0
50%,294050.0,1313725.0,13.0,14.0
75%,441075.0,1484088.0,27.0,18.0
max,588100.0,1654483.0,2065.0,23.0


In [4]:
# Create (or reuse) the Spark session with adaptive query execution and
# adaptive partition coalescing switched on.
spark = (
    SparkSession.builder
    .appName("Marketing_AB_Testing_Analysis")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# Hand the pandas frame over to Spark for the group-level aggregation.
ab_test_df = spark.createDataFrame(ab_test_pd)

# 1. Group Conversion Analysis

In [5]:
# Per-row 0/1 conversion indicator: its sum is the number of conversions and
# its mean is the conversion rate.
converted_flag = F.when(F.col("converted") == True, 1).otherwise(0)

# Aggregate once per test group and collect the result rows to the driver.
conversion_results = (
    ab_test_df
    .groupBy("test group")
    .agg(
        F.count("*").alias("total_users"),
        F.sum(converted_flag).alias("conversions"),
        F.avg(converted_flag).alias("conversion_rate"),
    )
    .collect()
)

# Select the first collected row belonging to each experiment arm.
ad_data = list(filter(lambda row: row["test group"] == "ad", conversion_results))[0]
psa_data = list(filter(lambda row: row["test group"] == "psa", conversion_results))[0]

# Counts and rates reused by the statistical cells below.
ad_total, ad_successes, ad_rate = (
    ad_data["total_users"], ad_data["conversions"], ad_data["conversion_rate"]
)
psa_total, psa_successes, psa_rate = (
    psa_data["total_users"], psa_data["conversions"], psa_data["conversion_rate"]
)

# Relative lift of AD over PSA, expressed in percent.
relative_gain = (ad_rate - psa_rate) / psa_rate
lift = relative_gain * 100

print("\n Group analysis:")
print(f"   AD: {ad_rate:.1%} conversion ({ad_successes:,}/{ad_total:,})")
print(f"   PSA: {psa_rate:.1%} conversion ({psa_successes:,}/{psa_total:,})")
print(f"   Lift: {lift:+.1f}%")


 Group analysis:
   AD: 2.6% conversion (14,423/564,577)
   PSA: 1.8% conversion (420/23,524)
   Lift: +43.1%


# 2. Advanced Metrics

#### 2.1 Z-test

In [27]:
# Two-sample z-test for the difference between the AD and PSA conversion
# proportions (successes and totals are passed in matching order).
z_stat, p_value = proportion.proportions_ztest(
    [ad_successes, psa_successes],
    [ad_total, psa_total]
)
print(f"   Z: {z_stat:.4f}")
# Significance is judged at 0.05, the same level used by the confidence
# interval, the power calculation and the final summary (was 0.07 here only).
print(f"   P-value: {p_value:.16f} ({'Significant' if p_value < 0.05 else 'Non significant'})")

   Z: 7.3701
   P-value: 0.0000000000001705 (Significant)


#### 2.2 Confidence Interval (CI)

In [39]:
try:
    # 95% confidence interval for the difference in conversion proportions
    # (AD minus PSA) between the two independent groups.
    ci_bounds = proportion.confint_proportions_2indep(
        ad_successes, ad_total, psa_successes, psa_total, alpha=0.05
    )
    ci_lower, ci_upper = ci_bounds

    difference = ad_rate - psa_rate
    half_width = (ci_upper - ci_lower)/2
    print(f"   Difference: {difference:.4f} ({difference*100:+.2f}%)")
    print(f"   IC 95%: [{ci_lower:.4f}, {ci_upper:.4f}]")
    print(f"   Margin of error: ±{half_width:.4f}")

    # Interpret the interval: entirely above zero favours AD, entirely below
    # zero favours PSA, anything straddling zero is inconclusive.
    ci_interpretation = (
        "AD group is significantly better" if ci_lower > 0
        else "PSA group is significantly better" if ci_upper < 0
        else "No significant difference"
    )
    print(f"   Conclusion: {ci_interpretation}")

except Exception as err:
    # Report the failure and still show the raw difference between the rates.
    print(f"   IC error: {err}")
    difference = ad_rate - psa_rate
    print(f"   Difference: {difference:.4f} ({difference*100:+.2f}%)")

   Difference: 0.0077 (+0.77%)
   IC 95%: [0.0059, 0.0094]
   Margin of error: ±0.0017
   Conclusion: AD group is significantly better


#### 2.3 Effect Size - Cohen's h

In [29]:
# Cohen's h for the two observed conversion rates (AD vs PSA).
effect_size = proportion.proportion_effectsize(ad_rate, psa_rate)

# Label the magnitude using the cut-offs 0.8 / 0.5 / 0.2, checked from the
# largest downwards; anything below 0.2 keeps the default label.
magnitude = abs(effect_size)
effect_interpretation = "Very Small"
for cutoff, label in ((0.8, "Big"), (0.5, "Average"), (0.2, "Small")):
    if magnitude >= cutoff:
        effect_interpretation = label
        break

print(f"   Cohen's h: {effect_size:.4f} ({effect_interpretation})")

   Cohen's h: 0.0530 (Very Small)


#### 2.4 Statistical Power (Smaller-Group Sample Size)

In [30]:
# Conservative sample size: treat both groups as if they only had as many
# users as the smaller one.
min_sample_size = min(ad_total, psa_total)

# `effect_size` is Cohen's h for two independent proportions, so the matching
# power calculation is the two-sample normal (z) test. The previous
# `power.ttest_power` call computes power for a one-sample t-test, which
# overstates the power of a two-group comparison.
observed_power = power.NormalIndPower().power(
    effect_size=effect_size,
    nobs1=min_sample_size,
    alpha=0.05,
    ratio=1.0,  # second group assumed to be the same (reduced) size
    alternative='two-sided'
)

print(f"   Observed Statistic Power: {observed_power:.3f} ({observed_power*100:.1f}%)")
print(f"   Status: {'Adequate' if observed_power >= 0.8 else 'Low'} (ideal ≥80%)")

   Observed Statistic Power: 1.000 (100.0%)
   Status: Adequate (ideal ≥80%)


#### 2.5 Alternative tests

In [31]:
# 2x2 contingency table: rows are groups (AD, PSA), columns are
# (converted, not converted).
ad_failures = ad_total - ad_successes
psa_failures = psa_total - psa_successes
contingency = np.array([
    [ad_successes, ad_failures],
    [psa_successes, psa_failures],
])

# Chi-square test of independence on the table.
chi2, p_chi2, dof, expected = stats.chi2_contingency(contingency)
print(f"   Chi²: {chi2:.4f} (p = {p_chi2:.16f})")

# Fisher's exact test on the same table.
odds_ratio, p_fisher = stats.fisher_exact(contingency)
print(f"   Fisher: OR = {odds_ratio:.4f} (p = {p_fisher:.16f})")

   Chi²: 54.0058 (p = 0.0000000000001999)
   Fisher: OR = 1.4421 (p = 0.0000000000000105)


#### 2.6 Standard Error and Assumption Check

In [38]:
# Standard error of each group's conversion proportion: sqrt(p * (1 - p) / n).
ad_se, psa_se = (
    np.sqrt(rate * (1 - rate) / total)
    for rate, total in ((ad_rate, ad_total), (psa_rate, psa_total))
)
print(f"   Standard Error AD group: {ad_se:.6f}")
print(f"   Standard Error PSA group: {psa_se:.6f}")

# Smallest of the four success/failure counts; the premise is considered
# fulfilled when that count is at least 5.
cell_counts = (
    ad_successes, ad_total - ad_successes,
    psa_successes, psa_total - psa_successes,
)
min_expected = min(cell_counts)
premises_ok = min_expected >= 5
premise_label = 'Yes' if premises_ok else 'No'
print(f"   Premise fulfilled: {premise_label} (minimum expected: {min_expected:.1f})")

   Standard Error AD group: 0.000210
   Standard Error PSA group: 0.000863
   Premise fulfilled: Yes (minimum expected: 420.0)


# 3. Required Sample Size Calculation

In [33]:
# Smallest absolute change in conversion rate we want to be able to detect
# (0.01 = one percentage point on top of the PSA baseline).
min_detectable_effect = 0.01

# Planning effect size (Cohen's h) between the baseline rate and baseline + MDE.
# Stored under its own name so it does not overwrite `effect_size`, the
# *observed* Cohen's h that the power cell reads; reusing the name made the
# result of re-running that cell depend on cell execution order.
mde_effect_size = proportion_effectsize(psa_rate, psa_rate + min_detectable_effect)

analysis = power.NormalIndPower()

# Solve for the per-group sample size giving 80% power at alpha = 0.05
# (two-sided); nobs1 is left unspecified so it is the quantity solved for.
required_n = analysis.solve_power(
    effect_size=mde_effect_size,
    power=0.8,
    alpha=0.05,
    alternative='two-sided'
)

# Compare against the smaller of the two groups, the binding constraint.
smaller_group_size = min(ad_total, psa_total)

print(f"   To detect 1% differece:")
print(f"   • Necessary size: {required_n:.0f} per group")
print(f"   • Current Size: {smaller_group_size:,} per group")
print(f"   • Status: {'Adequate' if smaller_group_size >= required_n else 'Insufficient'}")

   To detect 1% differece:
   • Necessary size: 3464 per group
   • Current Size: 23,524 per group
   • Status: Adequate


# 4. Summary

In [37]:
# --- Performance: observed rates and relative lift ---
print("Performance:")
print(f"   • AD: {ad_rate:.1%} ({ad_successes:,}/{ad_total:,})")
print(f"   • PSA: {psa_rate:.1%} ({psa_successes:,}/{psa_total:,})")
print(f"   • Lift: {lift:+.1f}%")

# --- Significance: z-test p-value judged at the 0.05 level ---
is_significant = p_value < 0.05
significance_label = 'Significant' if is_significant else 'Non significant'
print("\n Statistic Significance")
print(f"   • P-value: {p_value:.16f}")
print(f"   • Result: {significance_label}")
print(f"   • Effect: {effect_interpretation}")

# --- Reliability: power and assumption check ---
premise_label = 'Fulfilled' if premises_ok else 'Not fulfilled'
print("\n Reliability:")
print(f"   • Statistic Power: {observed_power:.1%}")
print(f"   • Premise: {premise_label}")

# --- Recommendation ---
# Significant and well-powered -> pick the better arm; not significant ->
# keep testing; significant but under-powered -> collect more data.
print("\n Recommendation:")
has_enough_power = observed_power >= 0.8
if is_significant and has_enough_power:
    recommendation = (
        "IMPROVE AD group - Reliable and Significant Result"
        if ad_rate > psa_rate
        else "KEEP PSA group - PSA is significantly better"
    )
elif p_value >= 0.05:
    recommendation = "PROCEED TEST - No significant difference"
else:
    recommendation = "INCREASE SAMPLE - Low statistic power"

print(f"   {recommendation}")

Performance:
   • AD: 2.6% (14,423/564,577)
   • PSA: 1.8% (420/23,524)
   • Lift: +43.1%

 Statistic Significance
   • P-value: 0.0000000000001705
   • Result: Significant
   • Effect: Very Small

 Reliability:
   • Statistic Power: 100.0%
   • Premise: Fulfilled

 Recommendation:
   IMPROVE AD group - Reliable and Significant Result
