In [12]:
import os
import sys

# Remember the directory the kernel started in, exactly once per kernel session.
if "ORIGINAL_WORKING_DIRECTORY" not in globals():
    ORIGINAL_WORKING_DIRECTORY = os.getcwd()

# Derive the target from the saved starting directory instead of the live cwd,
# so re-running this cell does not climb one directory higher on every run.
current_directory = ORIGINAL_WORKING_DIRECTORY
parent_directory = os.path.dirname(ORIGINAL_WORKING_DIRECTORY)
os.chdir(parent_directory)

# Put the parent directory first on the import path (the `modules` package is
# imported right after this), without stacking duplicates on repeated runs.
if not sys.path or sys.path[0] != parent_directory:
    sys.path.insert(0, parent_directory)

import pandas as pd
from modules.data_processor import ExperimentProcessor
from modules.hypothesis_tester import determine_winner

In [13]:
# Read the experiments CSV (path is relative to the current working directory)
# and wrap the resulting frame in the project's ExperimentProcessor.
data = pd.read_csv(filepath_or_buffer='./data/experiments_dataset.csv')
processor = ExperimentProcessor(data)

### Exclude events (BUY, CHECKOUT_1, CHECKOUT_2, CHECKOUT_3)

The following code shows that these events do not contain variants in their labels, so they are not considered experiments.

In [14]:
# Rows the processor reports as non-experiments, counted per
# (event, experiment, variant) combination.
non_experiments = processor.get_non_experiments_data()
grouping_keys = ["event_name", "experiment_name", "variant_id"]
non_experiments.groupby(grouping_keys).size()

event_name  experiment_name         variant_id
BUY         buyingflow/address_hub  3574           922
            buyingflow/secure_card  4612            31
            buyingflow/user-track   6796          1088
CHECKOUT_1  buyingflow/escWebMLA    2874          1607
            buyingflow/user-track   6796          2115
CHECKOUT_2  buyingflow/address_hub  3574          1196
            buyingflow/escWebMLA    2874             2
            buyingflow/user-track   6796          1786
CHECKOUT_3  buyingflow/address_hub  3574          1270
            buyingflow/user-track   6796          1270
dtype: int64

### Experiments in different events

There are three experiments that appear in different events.

In [15]:
from datetime import datetime

# Fetch the experiment rows, parse the timestamp column once, and derive a
# calendar-date column from it for day-level filtering.
experiments_df = processor.get_experimets_data()
parsed_timestamps = pd.to_datetime(experiments_df["timestamp"])
experiments_df["timestamp"] = parsed_timestamps
experiments_df["date"] = parsed_timestamps.dt.date

# Day under analysis, kept as a datetime; callers use `.date()` to compare.
filter_date = datetime.strptime("2021-08-02", "%Y-%m-%d")

In [16]:
# Keep only the rows recorded on the day under analysis.
same_day_mask = experiments_df["date"] == filter_date.date()
filtered_data = experiments_df[same_day_mask]

# Count the distinct event names per experiment and show the three largest.
(
    filtered_data
    .groupby(["experiment_name"])["event_name"]
    .nunique()
    .reset_index(name="num_events")
    .sort_values(by="num_events", ascending=False)
    .head(3)
)

Unnamed: 0,experiment_name,num_events
0,cookiesConsentBanner,2
2,frontend/assetsCdnDomainMLA,2
25,search/remove-ecn-tag,2


In [17]:
# Peek at the rows belonging to the three experiments listed above.
multi_event_experiments = [
    "cookiesConsentBanner",
    "frontend/assetsCdnDomainMLA",
    "search/remove-ecn-tag",
]
in_selected_experiments = filtered_data["experiment_name"].isin(multi_event_experiments)
filtered_data[in_selected_experiments].head()

Unnamed: 0,event_name,item_id,timestamp,experiment_name,variant_id,user_id,date
9,SEARCH,,2021-08-02 23:55:38.966000-04:00,search/remove-ecn-tag,4954,3204901,2021-08-02
13,SEARCH,,2021-08-02 23:55:38.966000-04:00,cookiesConsentBanner,DEFAULT,3204901,2021-08-02
14,SEARCH,,2021-08-02 23:55:38.966000-04:00,frontend/assetsCdnDomainMLA,DEFAULT,3204901,2021-08-02
19,PRODUCT,882352139.0,2021-08-02 23:55:51.673000-04:00,cookiesConsentBanner,DEFAULT,3204901,2021-08-02
24,PRODUCT,655266729.0,2021-08-02 23:56:16.083000-04:00,cookiesConsentBanner,DEFAULT,3204901,2021-08-02


### Duplicate users (independence)

Some experiments lack independence: there are clients who appear in more than one variant of the same experiment, which breaks the independence assumption of the test.

In [18]:
# Number of distinct variants each user was exposed to, per experiment,
# ordered from most to fewest.
independence_df = (
    filtered_data
    .groupby(["user_id", "experiment_name"])["variant_id"]
    .nunique()
    .reset_index(name="variants")
    .sort_values(by="variants", ascending=False)
)

# Users that show up in more than one variant of the same experiment.
independence_df[independence_df["variants"].gt(1)]

Unnamed: 0,user_id,experiment_name,variants
50888,854874,filters/sort-by-ranking,3
49907,837567,filters/sort-by-ranking,3
67002,2211591,vip/carousel-v2p-above-the-fold,3
81536,4150112,vip/carousel-v2p-above-the-fold,3
25877,384897,vip/carousel-v2p-above-the-fold,3
...,...,...,...
67004,2211591,vip/shippingCalculatorMigrationModalExperiment,2
65,208,vip/carousel-v2p-above-the-fold,2
34976,556518,searchbackend/seller-reputation-change,2
103031,7161251,filters/sort-by-ranking,2


In [21]:
def check_user_independence(data):
    """Check that every user is assigned to a single variant per experiment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``user_id``, ``experiment_name`` and
        ``variant_id`` columns.

    Returns
    -------
    tuple
        ``(independent, experiment_variants)``: ``experiment_variants`` is a
        Series indexed by ``(user_id, experiment_name)`` holding the number of
        distinct variants seen, and ``independent`` is ``True`` only when
        every one of those counts equals 1.
    """
    variants_per_user = (
        data
        .groupby(["user_id", "experiment_name"])["variant_id"]
        .nunique()
    )
    # bool(...) keeps the flag a plain Python bool, as the built-in all() gave.
    is_independent = bool(variants_per_user.eq(1).all())
    return is_independent, variants_per_user

In [22]:
# Run the independence check on a single experiment.
sort_by_ranking_rows = filtered_data[
    filtered_data["experiment_name"] == "filters/sort-by-ranking"
]
check_user_independence(sort_by_ranking_rows)

(False,
 user_id  experiment_name        
 59       filters/sort-by-ranking    1
 159      filters/sort-by-ranking    1
 208      filters/sort-by-ranking    1
 281      filters/sort-by-ranking    1
 311      filters/sort-by-ranking    1
                                    ..
 9977516  filters/sort-by-ranking    1
 9982841  filters/sort-by-ranking    1
 9990293  filters/sort-by-ranking    1
 9990751  filters/sort-by-ranking    1
 9992552  filters/sort-by-ranking    1
 Name: variant_id, Length: 5629, dtype: int64)

### Duplicate users in same variant

If a client appears several times in the same variant of an experiment, they should be counted only once to avoid inflating the data.

In [47]:
# Rows per (user, experiment, variant). Note: the "variants" column here holds
# the count of non-null event_name values, not a number of variants.
duplicates = (
    filtered_data
    .groupby(["user_id", "experiment_name", "variant_id"])["event_name"]
    .count()
    .reset_index(name="variants")
    .sort_values(by="variants", ascending=False)
)

# Combinations that occur more than once.
duplicates[duplicates["variants"].gt(1)]

Unnamed: 0,user_id,experiment_name,variant_id,variants
79829,3825011,cookiesConsentBanner,DEFAULT,541
79844,3825011,search/remove-ecn-tag,4954,487
39229,629742,cookiesConsentBanner,DEFAULT,429
49593,828076,cookiesConsentBanner,DEFAULT,371
3040,23982,cookiesConsentBanner,DEFAULT,344
...,...,...,...,...
79779,3800731,search/remove-ecn-tag,4954,2
10570,98037,vip/carousel-v2p-above-the-fold,6787,2
30621,473751,pdp/viewItemPageMigrationReturns,5208,2
77922,3572681,search/remove-ecn-tag,4954,2


In [24]:
import requests
from urllib.parse import quote

# Experiment to query and the value sent as the `day` query parameter.
experiment_id = "filters/sort-by-ranking"
day = "2021-08-02 00"

# Upper bound (seconds) on how long to wait for the API, so an unreachable
# server fails fast instead of blocking the notebook indefinitely.
REQUEST_TIMEOUT_SECONDS = 10

# Percent-encode the experiment id, including "/", so it stays a single
# path segment in the URL.
encoded_experiment_id = quote(experiment_id, safe='')

# Endpoint of the API.
url = f"http://127.0.0.1:5000/experiment/{encoded_experiment_id}/result"
print(f"Request URL: {url}")

# Query-string parameters of the request.
params = {
    "day": day
}

# The GET call itself lives inside the try block: connection failures and
# timeouts are raised by requests.get(), not by raise_for_status().
try:
    response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_SECONDS)

    # Show the raw response before attempting to decode it as JSON.
    print(f"Response Status Code: {response.status_code}")
    print(f"Response Content: {response.text}")

    response.raise_for_status()  # raises HTTPError for 4xx/5xx status codes
except requests.exceptions.HTTPError as errh:
    print("Http Error:", errh)
except requests.exceptions.ConnectionError as errc:
    print("Error Connecting:", errc)
except requests.exceptions.Timeout as errt:
    print("Timeout Error:", errt)
except requests.exceptions.RequestException as err:
    print("OOps: Something Else", err)
else:
    # Decode separately so a malformed body is reported as a JSON error rather
    # than being swallowed by the generic RequestException handler above.
    try:
        # NOTE(review): this rebinds `data`, which the loading cell used for
        # the raw CSV DataFrame; kept as-is in case later cells rely on it.
        data = response.json()
    except ValueError as err:
        print("Error decoding JSON:", err)
    else:
        print("API Response:")
        print(data)

Request URL: http://127.0.0.1:5000/experiment/filters%2Fsort-by-ranking/result
Response Status Code: 200
Response Content: {
  "results": {
    "filters/sort-by-ranking": {
      "checks": {
        "experiment_independence": true,
        "p-val": 0.2292586630667692,
        "sample_size": {
          "6971": true,
          "6972": true,
          "7057": true
        },
        "significant_pval": false,
        "user_independence": false,
        "variation": {
          "6971": 0.08242846256770704,
          "6972": 0.09327716262975778,
          "7057": 0.08034954594907258
        }
      },
      "number_of_participants": 4972,
      "variants": [
        {
          "id": "7057",
          "number_of_purchases": 149
        },
        {
          "id": "6971",
          "number_of_purchases": 156
        },
        {
          "id": "6972",
          "number_of_purchases": 177
        }
      ],
      "winner": "6972"
    }
  }
}

API Response:
{'results': {'filters/sort-by-ran