# Experiment Analysis

## Loading the CSV file

In [None]:
import pandas as pd

# Read the experiment dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/final_experiment_dataset.csv")
# Work on a copy so `df` keeps the data exactly as it was read from disk.
final_experiment_df = df.copy()
# Preview the first five rows.
final_experiment_df.head(5)

In [None]:
# (number of rows, number of columns) of the working copy.
final_experiment_df.shape

## Controlling for bias between groups

In [None]:
# Number of rows per value of "Variation", i.e. the size of each experiment arm.
final_experiment_df["Variation"].value_counts()

In [None]:
# Per-arm means of the baseline demographic and engagement columns,
# used to eyeball whether the two arms are comparable.
baseline_cols = ["clnt_age", "clnt_tenure_yr", "num_accts", "calls_6_mnth", "logons_6_mnth"]
final_experiment_df.groupby("Variation")[baseline_cols].mean()


In [None]:
# Baseline demographics and engagement metrics are well balanced between the control and test groups, indicating successful random assignment and low risk of selection bias.

In [None]:
from scipy.stats import ttest_ind

# Two-sample t-test on client age between the arms.
# nan_policy="omit" makes the test ignore missing ages instead of returning NaN.
is_control_arm = final_experiment_df["Variation"] == "Control"
is_test_arm = final_experiment_df["Variation"] == "Test"
ctrl = final_experiment_df.loc[is_control_arm, "clnt_age"]
test = final_experiment_df.loc[is_test_arm, "clnt_age"]

ttest_ind(ctrl, test, nan_policy="omit")


p-value ≈ 0.205
This is well above common thresholds (0.05 or 0.01).

Interpretation
We fail to reject the null hypothesis that the mean age is the same in Control and Test.

What that means here:
There is no statistically significant age difference between the two groups.
That supports good randomization and low selection bias.

## KPIs

### Client level diagnostics

- KPI #1: Conversion Rate: We define "Conversion Rate" as the ratio of successfully completed sessions (from start to confirm) to the total number of sessions initiated.

In [None]:
# Conversion rate per arm: group by "Variation", take the mean of "converted",
# and show it as a percentage rounded to two decimals.
conversion_by_group = final_experiment_df.groupby("Variation")["converted"].mean()

conversion_by_group.mul(100).round(2)

In [None]:
# Two-proportion z-test: is the difference in conversion rate between Test and Control significant?
import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# Per arm: "sum" = total of "converted" (number of converters), "count" = non-null rows.
conv_table = final_experiment_df.groupby("Variation")["converted"].agg(["sum", "count"])

x_test, n_test = conv_table.loc["Test", "sum"], conv_table.loc["Test", "count"]
x_ctrl, n_ctrl = conv_table.loc["Control", "sum"], conv_table.loc["Control", "count"]

# Test arm first, Control second, in both arrays.
successes = np.array([x_test, x_ctrl])
trials = np.array([n_test, n_ctrl])
stat, pval = proportions_ztest(count=successes, nobs=trials, alternative="two-sided")

stat, pval


In [None]:
# Our null hypothesis is that the conversion rate is the same in the Test and Control groups.
# After running the statistical test above, we get an extremely small p-value, far below 0.05.
# Hence we reject the null hypothesis: the difference in conversion rate is statistically significant.

- KPI #2: Average Total Time to Completion: the average time it takes a converting customer to complete a session.

In [None]:
# Keep only the rows where "converted" is True, as an independent copy.
# NOTE(review): using the column directly as a mask assumes "converted" has boolean dtype — confirm.
converters_df = final_experiment_df[final_experiment_df["converted"]].copy()
converters_df

In [None]:
# Largest total_duration_sec among converters — a quick check for extreme values.
converters_df["total_duration_sec"].max()

In [None]:
# The max value of "total_duration_sec" shows that some outliers are inflating the averages,
# so we compute Tukey fences (1.5 * IQR beyond the quartiles) to find and exclude them.
q1, q3 = converters_df["total_duration_sec"].quantile([0.25, 0.75])
iqr = q3 - q1

fence_width = 1.5 * iqr
lower = q1 - fence_width
upper = q3 + fence_width

q1, q3, iqr, lower, upper


In [None]:
# We therefore take 1492 seconds as the absolute upper limit of session duration.
# Collect the converter rows whose duration falls outside the [lower, upper] fences.
duration = converters_df["total_duration_sec"]
outside_duration_fences = (duration < lower) | (duration > upper)
outliers = converters_df[outside_duration_fences]

outliers.shape


In [None]:
# Drop the outlier rows: keep converters whose duration lies within [lower, upper], bounds included.
within_duration_fences = converters_df["total_duration_sec"].between(lower, upper)
conv_no_outliers_df = converters_df[within_duration_fences].copy()

conv_no_outliers_df.shape

In [None]:
# Mean total_duration_sec per arm, computed on converters with outliers removed.
avg_time_by_group = conv_no_outliers_df.groupby("Variation")["total_duration_sec"].mean()
avg_time_by_group

In [None]:
# Convert the per-arm averages from seconds to minutes, rounded to two decimals.
(avg_time_by_group / 60).round(2)

In [None]:
# Now let's test the statistical significance of the difference in mean completion time
# with a percentile bootstrap of (mean Test - mean Control).
import numpy as np

# Use the outlier-free converters frame built above (`conv_no_outliers_df`);
# the name `conv_no_outliers` is never defined and raised a NameError on a fresh kernel.
test_time = conv_no_outliers_df.loc[
    conv_no_outliers_df["Variation"] == "Test", "total_duration_sec"
].to_numpy()
ctrl_time = conv_no_outliers_df.loc[
    conv_no_outliers_df["Variation"] == "Control", "total_duration_sec"
].to_numpy()

n_boot = 10000
# Seeded generator so the interval is reproducible across Restart & Run All.
rng = np.random.default_rng(42)
boot_diffs = np.empty(n_boot)

for i in range(n_boot):
    # Resample each arm independently, with replacement, at its original size.
    boot_test = rng.choice(test_time, size=len(test_time), replace=True)
    boot_ctrl = rng.choice(ctrl_time, size=len(ctrl_time), replace=True)
    boot_diffs[i] = boot_test.mean() - boot_ctrl.mean()

# 95% percentile confidence interval for the difference in means (in seconds).
ci_low, ci_high = np.percentile(boot_diffs, [2.5, 97.5])
ci_low, ci_high


In [None]:
# the value of "0" is not between ci_low and ci_high, which makes our result statistically significant.

In [None]:
# Checking our interpretation with a two-sided bootstrap p-value.
# The previous expression, np.mean(np.abs(boot_diffs) <= 0), only counted resamples whose
# difference was exactly 0, so it was (almost) always 0 regardless of the data.
# Instead, take twice the smaller share of resampled differences lying on either side of 0;
# this agrees with the percentile CI (p < 0.05 exactly when 0 is outside the 95% interval).
# The smallest non-zero value it can report is 2 / len(boot_diffs).
share_at_or_below_zero = np.mean(boot_diffs <= 0)
share_at_or_above_zero = np.mean(boot_diffs >= 0)
p_value = min(1.0, 2 * min(share_at_or_below_zero, share_at_or_above_zero))
p_value


- KPI #3: STD of Total Time to Completion: a smooth UI reduces variability.

In [None]:
# Standard deviation of completion time per arm (outlier-free converters),
# converted from seconds to minutes and rounded to two decimals.
std_per_group = conv_no_outliers_df.groupby("Variation")["total_duration_sec"].std()

std_per_group.div(60).round(2)

In [None]:
# our null hypothesis is that there is NO significant difference in standard deviation between the two groups

In [None]:
from scipy.stats import levene

# Levene's test for equal variances of completion time between the two arms.
# Use the outlier-free converters frame (`conv_no_outliers_df`); the name
# `conv_no_outliers` is never defined and raised a NameError on a fresh kernel.
test_time = conv_no_outliers_df.loc[
    conv_no_outliers_df["Variation"] == "Test", "total_duration_sec"
]
ctrl_time = conv_no_outliers_df.loc[
    conv_no_outliers_df["Variation"] == "Control", "total_duration_sec"
]

stat, pval = levene(test_time, ctrl_time)
stat, pval


In [None]:
# With a p-value > 0.05, we fail to reject the null hypothesis. Therefore, we cannot say with statistical significance that the standard deviation in the test group is bigger. We cannot rule out random variation.

- KPI #4: Average number of events (steps) per client: more steps indicate confusion with the UI.

In [None]:
# Number of distinct values taken by "n_events".
final_experiment_df["n_events"].nunique()

In [None]:
# Let's look for outliers in "n_events" using Tukey fences (1.5 * IQR beyond the quartiles).
q1_a, q3_a = final_experiment_df["n_events"].quantile([0.25, 0.75])
iqr_a = q3_a - q1_a

fence_width_a = 1.5 * iqr_a
lower_a = q1_a - fence_width_a
upper_a = q3_a + fence_width_a

lower_a, upper_a

In [None]:
# We therefore take 11.5 as the absolute upper limit for the number of events per client.
# Collect the rows whose n_events falls outside the [lower_a, upper_a] fences.
event_counts = final_experiment_df["n_events"]
outside_event_fences = (event_counts < lower_a) | (event_counts > upper_a)
outliers_events = final_experiment_df[outside_event_fences]

outliers_events.shape

In [None]:
# Remove the n_events outliers: keep rows whose event count lies within the fences
# computed for this KPI (lower_a / upper_a), bounds included.
# The filter previously used "total_duration_sec" with the duration fences (lower / upper)
# from KPI #2, which left the n_events outliers in place and dropped unrelated rows.
attempts_per_client_df = final_experiment_df[
    (final_experiment_df["n_events"] >= lower_a) &
    (final_experiment_df["n_events"] <= upper_a)
].copy()

attempts_per_client_df.shape

In [None]:
# Mean number of events per arm, computed on the filtered frame attempts_per_client_df.
attempts_per_client = attempts_per_client_df.groupby("Variation")["n_events"].mean()
attempts_per_client

In [None]:
# Statistical test to understand significance: percentile bootstrap of
# (mean Test - mean Control) for the number of events per client.
import numpy as np

# Resample from the same filtered frame the reported means come from
# (attempts_per_client_df), so the test and the KPI describe the same data.
test_attempts = attempts_per_client_df.loc[
    attempts_per_client_df["Variation"] == "Test", "n_events"
].to_numpy()

ctrl_attempts = attempts_per_client_df.loc[
    attempts_per_client_df["Variation"] == "Control", "n_events"
].to_numpy()

n_boot = 10000
# Seeded generator so the interval is reproducible across Restart & Run All.
rng = np.random.default_rng(42)
boot_diffs_2 = np.empty(n_boot)

for i in range(n_boot):
    # Resample each arm independently, with replacement, at its original size.
    boot_test = rng.choice(test_attempts, size=len(test_attempts), replace=True)
    boot_ctrl = rng.choice(ctrl_attempts, size=len(ctrl_attempts), replace=True)
    boot_diffs_2[i] = boot_test.mean() - boot_ctrl.mean()

# 95% percentile confidence interval for the difference in mean n_events.
ci_low, ci_high = np.percentile(boot_diffs_2, [2.5, 97.5])
ci_low, ci_high


In [None]:
# Two-sided bootstrap p-value for the difference in mean n_events.
# The previous expression, np.mean(np.abs(boot_diffs_2) <= 0), only counted resamples whose
# difference was exactly 0, so it did not measure significance.
# Take twice the smaller share of resampled differences lying on either side of 0;
# the smallest non-zero value it can report is 2 / len(boot_diffs_2).
share_at_or_below_zero = np.mean(boot_diffs_2 <= 0)
share_at_or_above_zero = np.mean(boot_diffs_2 >= 0)
p_value = min(1.0, 2 * min(share_at_or_below_zero, share_at_or_above_zero))
print(f"p-value = {p_value:.2e}")

In [None]:
# since "0" is not part of our ci_low and ci_high, then the difference in n_events is statistically significant.

### Step-level diagnostics

- KPI #5: Time per step: useful to identify bottlenecks and moments where the user struggles with the UI.

- KPI #6: Drop-off rate per step: Where do users mostly abandon?

- KPI #7: Error rate per step: how often users move backward or hit errors.

- KPI #8: Anomaly rate: irregular step jumps or session fragmentation, indicate improper use of the UI.

In [None]:
import pandas as pd

# Read the per-step dataset; the path is relative to the notebook's working directory.
# Note: this rebinds `df`, which previously held the experiment dataset read at the top.
df = pd.read_csv("../data/final_per_step_dataset.csv")
# Work on a copy so `df` keeps the data exactly as it was read from disk.
final_per_step_df = df.copy()
# Preview the first five rows.
final_per_step_df.head(5)

In [None]:
# Mean of "step_anomaly" for every (Variation, process_step) pair.
# NOTE(review): reads as an anomaly rate if step_anomaly is a 0/1 or boolean flag — confirm.
error_per_step = final_per_step_df.groupby(["Variation", "process_step"])["step_anomaly"].mean()
error_per_step

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Named brand colours used across the notebook's figures.
VANGUARD_PALETTE = {
    "red":      "#96151D",  # anchor
    "charcoal": "#222222",
    "slate":    "#4B5563",
    "steel":    "#64748B",
    "blue":     "#1F4E79",
    "teal":     "#0F766E",
    "gold":     "#B08900",
    "light":    "#E5E7EB",
}

# handy ordered list for seaborn/matplotlib
VANGUARD_COLORS = [
    VANGUARD_PALETTE["red"],
    VANGUARD_PALETTE["blue"],
    VANGUARD_PALETTE["teal"],
    VANGUARD_PALETTE["gold"],
    VANGUARD_PALETTE["slate"],
    VANGUARD_PALETTE["steel"],
    VANGUARD_PALETTE["charcoal"],
    VANGUARD_PALETTE["light"],
]

# Bar height = mean of step_anomaly per (process_step, Variation); seaborn aggregates with the mean.
plt.figure(figsize=(10, 5))
sns.barplot(
    data=final_per_step_df,
    x="process_step",
    # Plot the anomaly values as-is. The earlier "/ 60" was a seconds-to-minutes conversion
    # that does not apply to anomalies and shrank every bar 60-fold.
    y="step_anomaly",
    hue="Variation",
    hue_order=["Control", "Test"],   # fixes which arm gets which colour below
    palette=[VANGUARD_PALETTE["red"], VANGUARD_PALETTE["teal"]]
)

plt.title("Average anomalies per Step (Test vs Control)")
plt.xlabel("Process Step")
plt.ylabel("Average Anomaly")
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()

In [None]:
# Anomalies at step_2, per arm: total of step_anomaly, number of non-null rows, and mean.
at_step_2 = final_per_step_df["process_step"] == "step_2"
step_2_by_arm = final_per_step_df[at_step_2].groupby("Variation")["step_anomaly"]
step_2_anomaly = step_2_by_arm.agg(
    anomaly_count="sum",
    step_2_rows="count",
    anomaly_rate="mean",
)

step_2_anomaly


In [None]:
# Anomalies at step_3, per arm: total of step_anomaly, number of non-null rows, and mean.
at_step_3 = final_per_step_df["process_step"] == "step_3"
step_3_by_arm = final_per_step_df[at_step_3].groupby("Variation")["step_anomaly"]
step_3_anomaly = step_3_by_arm.agg(
    anomaly_count="sum",
    step_3_rows="count",
    anomaly_rate="mean",
)

step_3_anomaly

In [None]:
# Anomalies at the "confirm" step, per arm: total of step_anomaly, number of non-null rows, and mean.
at_confirm = final_per_step_df["process_step"] == "confirm"
confirm_by_arm = final_per_step_df[at_confirm].groupby("Variation")["step_anomaly"]
step_4_anomaly = confirm_by_arm.agg(
    anomaly_count="sum",
    confirm_rows="count",
    anomaly_rate="mean",
)

step_4_anomaly