# Importing the databases

In [1]:
import pandas as pd

# Read the client demographics file, then take an independent copy so that
# later edits to `final_demo_df` do not touch the frame held in `df`.
df = pd.read_csv(filepath_or_buffer="../data/df_final_demo.txt")
final_demo_df = df.copy(deep=True)

# Last expression of the cell: rendered as the cell output.
final_demo_df

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,12.0,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,16.0,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,21.0,255.0,68.0,M,3.0,475114.69,4.0,4.0


In [None]:
import pandas as pd

# Read part 1 of the web data and keep an independent working copy of it.
df = pd.read_csv(filepath_or_buffer="../data/df_final_web_data_pt_1.txt")
final_web_data_1_df = df.copy(deep=True)

# Last expression of the cell: rendered as the cell output.
final_web_data_1_df

In [None]:
import pandas as pd

# Read part 2 of the web data and keep an independent working copy of it.
df = pd.read_csv(filepath_or_buffer="../data/df_final_web_data_pt_2.txt")
final_web_data_2_df = df.copy(deep=True)

# Last expression of the cell: rendered as the cell output.
final_web_data_2_df

In [None]:
import pandas as pd

# Read the experiment roster and keep an independent working copy of it.
df = pd.read_csv(filepath_or_buffer="../data/df_final_experiment_clients.txt")
experiment_clients_df = df.copy(deep=True)

# Last expression of the cell: rendered as the cell output.
experiment_clients_df

# Sanity check of the databases

### Important notes

Things to pay attention to while merging:
- Make sure that each client_id is either in the control group or in the test group, but not both.
- One client_id can have multiple visitor_ids, but not the other way around: each visitor_id should belong to exactly one client_id.
- We merge dataframes ON client_id, which is the common denominator column of the three tables.

How to define the time spent per step:
- Each row provides the timestamp at which the client initiated a step.
- Therefore, we need to group by client_id and sort the timestamps in chronological order.
- Then we need to create two more columns, one for "duration", one for "success" - a boolean column that specifies whether the client proceeded or did a step back (error) at each step.
- Lastly, we need to create a column for our most important KPI, which is "conversion". That means, that a customer has proceeded in all steps and finalized the confirmation.

However, here are some important biases we need to account for:
- Session fragmentation -> Create a column for "SESSIONS" per client

A single client producing several visitor_id values within the same experiment window may represent broken sessions rather than distinct attempts. The simple check is: count visitor_id per client_id. If most clients have one and a few have many, inspect their time ordering. If multiple visitor_id values overlap in time, they likely represent a single attempt. A simple rule:
If two visitor_ids within the same client_id occur less than ~5 minutes apart and both start at step 1, you can treat them as the same attempt. If not, leave them separate.

- Inconsistent step ordering.

For each client_id, sort by date_time and verify that process_step never jumps backward by more than one. Small backward moves usually indicate page refreshes. Large jumps indicate noise.
We should flag sequences where process_step is not monotonically increasing and either exclude them or report them as noisy.

- Temporal truncation.

Our experiment has a fixed end date. Any visit_id whose final event occurs near that boundary might not have had time to finish. Compute the time difference between the last observed step and the experiment end. If the gap is very small, treat the session as incomplete by truncation rather than failure. You can either exclude them or keep them but acknowledge the ambiguity.

- Arm misclassification.

Each client_id should appear exactly once, and in exactly one group, in the experiment file. If duplicates appear, or if a client_id in the web logs is missing from the experiment file, flag and exclude it.




### Step 1: Experiment data - Load and sanity-check each table.

Verify row counts, missing client_id, duplicate client_id in the experiment roster, and duplicate visitor_id in the web logs. This establishes whether the dataset is even suitable for merging.

### Step 2: Experiment data - Validate absence of arm misclassification.

Each client_id should appear exactly once, and in exactly one group, in the experiment file. If duplicates appear, or if a client_id in the web logs is missing from the experiment file, flag and exclude it.


In [None]:
# Arm-misclassification check: list every roster row whose client_id occurs
# more than once. Comparing on client_id alone (instead of on whole rows) also
# catches a client listed under two different groups, which a full-row
# duplicate test would miss. keep=False shows all conflicting rows, not just
# the repeats. An empty result means each client_id appears exactly once.
experiment_clients_df[
    experiment_clients_df.duplicated(subset="client_id", keep=False)
]

In [None]:
# Count the distinct client_id values in the experiment roster (missing values are not counted).
experiment_clients_df["client_id"].nunique(dropna=True)

In [None]:
# Count the distinct client_id values in web data part 1 (missing values are not counted).
final_web_data_1_df["client_id"].nunique(dropna=True)

In [None]:
# Count the distinct client_id values in web data part 2 (missing values are not counted).
final_web_data_2_df["client_id"].nunique(dropna=True)

In [None]:
# Stack both web-data parts and keep only the first row seen for each client_id,
# renumbering the index from 0. The result holds one row per client, so it is
# suited to client_id-level checks rather than event-level analysis.
combined_web_data = pd.concat(
    [final_web_data_1_df, final_web_data_2_df]
).drop_duplicates(subset="client_id", keep="first", ignore_index=True)
combined_web_data

In [None]:
# client_id values found in the combined web data but absent from the
# experiment roster.
ids_only_in_df1 = (
    combined_web_data["client_id"]
    .loc[lambda ids: ~ids.isin(experiment_clients_df["client_id"])]
    .unique()
)
ids_only_in_df1

### Step 3: Web logs data - Inspect session multiplicity

Group web logs by client_id and count distinct visitor_id. If most clients have one and a minority have many, keep visitor_id as the unit of analysis. Only collapse visitor_id when two IDs begin a step-1 sequence within minutes of each other.

### Step 4: Web logs data - Account for Session Fragmentation - Build "session" timelines

For each (client_id, visitor_id), sort by date_time. Check monotonicity of process_step. Minor regressions can be tolerated; major reversals get flagged and excluded

### Step 5: Web logs data - Compute step durations and success-level outcomes.

Within each sorted sequence, compute time between steps, derive a success flag per step, and classify the visitor_id as converted or not.

### Step 6: Temporal truncation

Any attempt whose last timestamp sits very close to the experiment’s end can be marked ambiguous. We may exclude them or keep them with a clear note that their status is censored.

# Merging the databases

Merge on client_id, using the experiment client_id as the base, since these are the only customers that matter.

We prefer "left" merge instead of "inner" because:
how="left" keeps all experiment clients, including:
- those who never visited (web columns NaN)
- those missing demographics (demo columns NaN)

An inner merge would silently drop:
- assigned clients with no web activity
- assigned clients missing demographics

For an A/B test, dropping assigned-but-inactive clients biases completion rates, so left is preferable. We can always later filter to “clients with web activity and complete demographics” explicitly, instead of letting the join hide them.

In [None]:
# Planned merge order (pseudo-code; the names stand in for the loaded frames):

# 1. exp_demo = experiment.merge(demo, on="client_id", how="left")

# 2. web = pd.concat([web_pt1, web_pt2], ignore_index=True)

# 3. full = exp_demo.merge(web, on="client_id", how="left")


- KPI #8: Anomaly rate — the share of sessions with irregular step jumps or session fragmentation, which can indicate improper use of the UI.

In [1]:
import pandas as pd

# Read the prepared per-step dataset and keep an independent working copy of it.
df = pd.read_csv(filepath_or_buffer="../data/final_per_step_dataset.csv")
final_per_step_df = df.copy(deep=True)

# Preview the first five rows.
final_per_step_df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds,...,merged_session_id,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,0,,False,2017-04-15 12:58:03,7.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
1,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,1,1.0,False,2017-04-15 12:58:35,32.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
2,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,2,1.0,False,2017-04-15 13:00:14,99.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
3,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,3,1.0,False,2017-04-15 13:00:34,20.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
4,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,4,1.0,False,,,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0


In [4]:
# Show the per-step dataset as the cell output.
final_per_step_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds,...,merged_session_id,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,0,,False,2017-04-15 12:58:03,7.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
1,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,1,1.0,False,2017-04-15 12:58:35,32.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
2,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,2,1.0,False,2017-04-15 13:00:14,99.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
3,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,3,1.0,False,2017-04-15 13:00:34,20.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
4,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,4,1.0,False,,,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317118,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,2,1.0,False,2017-05-08 16:09:19,39.0,...,3,Test,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0
317119,9999729,834634258_21862004160,870243567_56915814033_814203,step_3,2017-05-08 16:09:19,3,1.0,False,2017-05-08 16:09:40,21.0,...,3,Test,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0
317120,9999729,834634258_21862004160,870243567_56915814033_814203,confirm,2017-05-08 16:09:40,4,1.0,False,,,...,3,Test,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0
317121,9999832,145538019_54444341400,472154369_16714624241_585315,start,2017-05-16 16:46:03,0,,False,2017-05-16 16:46:11,8.0,...,1,Test,23.0,281.0,49.0,F,2.0,431887.61,1.0,4.0


In [None]:
# Order the events by client, then by session, then by timestamp.
# NOTE(review): assumes a `session_id` column exists among the columns elided
# from the truncated preview — confirm.
final_per_step_df = final_per_step_df.sort_values(
    ["client_id", "session_id", "date_time"], ascending=True
)

In [None]:
# Numeric step reached by the previous event of the same (client_id, session_id).
# The shift runs on `final_per_step_df` itself so it follows that frame's row
# order, and it uses `process_step_num` because `process_step` holds text labels
# ("start", "step_1", ..., "confirm") that cannot be subtracted.
# The first event of each session has no predecessor, so its value is NaN.
final_per_step_df["prev_step"] = (
    final_per_step_df.groupby(["client_id", "session_id"])["process_step_num"]
    .shift(1)
)

# Forward jump: the step number increased by more than one, i.e. at least one
# step was skipped. Comparisons against NaN evaluate to False, so the first
# event of a session is never flagged.
final_per_step_df["step_jump_anomaly"] = (
    (final_per_step_df["process_step_num"] - final_per_step_df["prev_step"]) > 1
)

In [None]:
# Backward move: the numeric step is lower than that of the previous event in
# the same (client_id, session_id). The difference is taken on
# `process_step_num` because `process_step` holds text labels that cannot be
# subtracted. The first event of each session yields NaN, which compares as
# False and is therefore never flagged.
final_per_step_df["backward_step_anomaly"] = (
    final_per_step_df.groupby(["client_id", "session_id"])["process_step_num"]
    .diff()
    .lt(0)
)

In [None]:
# One boolean per (client_id, session_id): True when any event in the session
# carries either anomaly flag.
session_anomalies = (
    final_per_step_df
    .groupby(["client_id", "session_id"])[["step_jump_anomaly", "backward_step_anomaly"]]
    .agg("any")
    .any(axis="columns")
)

In [None]:
# Percentage of sessions flagged as anomalous: the mean of a boolean Series is
# the fraction of True values.
anomaly_rate = session_anomalies.mean() * 100

anomaly_rate