# Importing the databases

In [1]:
import pandas as pd

# Read the client demographics export. `df` holds the raw read and
# `final_demo_df` is an independent copy of it.
demo_path = "../data/df_final_demo.txt"
df = pd.read_csv(demo_path)
final_demo_df = df.copy(deep=True)

final_demo_df

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,12.0,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,16.0,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,21.0,255.0,68.0,M,3.0,475114.69,4.0,4.0


In [2]:
import pandas as pd

# Read part 1 of the web event log. `df` holds the raw read and
# `final_web_data_1_df` is an independent copy of it.
web_part_1_path = "../data/df_final_web_data_pt_1.txt"
df = pd.read_csv(web_part_1_path)
final_web_data_1_df = df.copy(deep=True)

final_web_data_1_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04
...,...,...,...,...,...
343136,2443347,465784886_73090545671,136329900_10529659391_316129,confirm,2017-03-31 15:15:46
343137,2443347,465784886_73090545671,136329900_10529659391_316129,step_3,2017-03-31 15:14:53
343138,2443347,465784886_73090545671,136329900_10529659391_316129,step_2,2017-03-31 15:12:08
343139,2443347,465784886_73090545671,136329900_10529659391_316129,step_1,2017-03-31 15:11:37


In [3]:
import pandas as pd

# Read part 2 of the web event log. `df` holds the raw read and
# `final_web_data_2_df` is an independent copy of it.
web_part_2_path = "../data/df_final_web_data_pt_2.txt"
df = pd.read_csv(web_part_2_path)
final_web_data_2_df = df.copy(deep=True)

final_web_data_2_df

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,763412,601952081_10457207388,397475557_40440946728_419634,confirm,2017-06-06 08:56:00
1,6019349,442094451_91531546617,154620534_35331068705_522317,confirm,2017-06-01 11:59:27
2,6019349,442094451_91531546617,154620534_35331068705_522317,step_3,2017-06-01 11:58:48
3,6019349,442094451_91531546617,154620534_35331068705_522317,step_2,2017-06-01 11:58:08
4,6019349,442094451_91531546617,154620534_35331068705_522317,step_1,2017-06-01 11:57:58
...,...,...,...,...,...
412259,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:46:10
412260,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:45:29
412261,9668240,388766751_9038881013,922267647_3096648104_968866,step_1,2017-05-24 18:44:51
412262,9668240,388766751_9038881013,922267647_3096648104_968866,start,2017-05-24 18:44:34


In [4]:
import pandas as pd

# Read the experiment roster (client_id -> Variation). `df` holds the raw read
# and `experiment_clients_df` is an independent copy of it.
roster_path = "../data/df_final_experiment_clients.txt"
df = pd.read_csv(roster_path)
experiment_clients_df = df.copy(deep=True)

experiment_clients_df

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control
...,...,...
70604,2443347,
70605,8788427,
70606,266828,
70607,1266421,


# Sanity check of the databases

### Important notes

Things to pay attention to while merging:
- Make sure that each client_id is either in the control group or in the test group, but not both.
- One client_id can have multiple visitor_ids, but not the other way around: each visitor_id should belong to exactly one client_id.
- We merge dataframes ON client_id, which is the common denominator column of the three tables.

How to define the time spent per step:
- Each row provides the timestamp of the client initiating a step.
- Therefore, we will need to group per client_id and sort the timestamps in chronological order.
- Then we need to create two more columns, one for "duration", one for "success" - a boolean column that specifies whether the client proceeded or did a step back (error) at each step.
- Lastly, we need to create a column for our most important KPI, which is "conversion". That means, that a customer has proceeded in all steps and finalized the confirmation.

However, here are some important biases we need to account for:
- Session fragmentation -> Create a column for "SESSIONS" per client

A single client producing several visitor_id values within the same experiment window may represent broken sessions rather than distinct attempts. The simple check is: count visitor_id per client_id. If most clients have one and a few have many, inspect their time ordering. If multiple visitor_id values overlap in time, they likely represent a single attempt. A simple rule:
If two visitor_ids within the same client_id occur less than ~5 minutes apart and both start at step 1, you can treat them as the same attempt. If not, leave them separate.

- Inconsistent step ordering.

For each client_id, sort by date_time and verify that process_step never jumps backward by more than one. Small backward moves usually indicate page refreshes. Large jumps indicate noise.
We should flag sequences where process_step is not monotonically increasing and either exclude them or report them as noisy.

- Temporal truncation.

Our experiment has a fixed end date. Any visit_id whose final event occurs near that boundary might not have had time to finish. Compute the time difference between the last observed step and the experiment end. If the gap is very small, treat the session as incomplete by truncation rather than failure. You can either exclude them or keep them but acknowledge the ambiguity.

- Arm misclassification.

Each client_id should appear exactly once, and in exactly one group, in the experiment file. If duplicates appear, or if a client_id in the web logs is missing from the experiment file, flag and exclude it.




### Step 1: Experiment data - Load and sanity-check each table - Cleaning the demographics dataframe

Verify row counts, missing client_id, duplicate client_id in the experiment roster. This establishes whether the dataset is even suitable for merging.

In [66]:
# final_demo table: dimensions first, then the column labels.
print(final_demo_df.shape, final_demo_df.columns, sep="\n")

(70609, 9)
Index(['client_id', 'clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age', 'gendr',
       'num_accts', 'bal', 'calls_6_mnth', 'logons_6_mnth'],
      dtype='object')


In [67]:
# Missing-value count for every demographics column.
final_demo_df.isnull().sum()

client_id            0
clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64

In [68]:
# Number of rows whose client_id repeats an earlier row.
final_demo_df.duplicated(subset="client_id").sum()

np.int64(0)

In [69]:
# Frequency of each gender code; missing values are counted as their own bucket.
gender_codes = final_demo_df["gendr"]
gender_codes.value_counts(dropna=False)

gendr
U      24122
M      23724
F      22746
NaN       14
X          3
Name: count, dtype: int64

In [71]:
# Print dtypes and non-null counts for the demographics table.
final_demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   client_id         70609 non-null  int64  
 1   clnt_tenure_yr    70595 non-null  float64
 2   clnt_tenure_mnth  70595 non-null  float64
 3   clnt_age          70594 non-null  float64
 4   gendr             70595 non-null  object 
 5   num_accts         70595 non-null  float64
 6   bal               70595 non-null  float64
 7   calls_6_mnth      70595 non-null  float64
 8   logons_6_mnth     70595 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.8+ MB


In [74]:
# Row/column counts of the four source tables, one line per table.
for shape_label, table in (
    ("final_demo_df:", final_demo_df),
    ("final_web_data_1_df:", final_web_data_1_df),
    ("final_web_data_2_df:", final_web_data_2_df),
    ("experiment_clients_df :", experiment_clients_df),
):
    print(shape_label, table.shape)

final_demo_df: (70609, 9)
final_web_data_1_df: (343141, 5)
final_web_data_2_df: (412264, 5)
experiment_clients_df : (70609, 2)


In [75]:
# Every table carries a client_id column; count the missing ids in each one.
for table_label, table in (
    ("demo", final_demo_df),
    ("web1", final_web_data_1_df),
    ("web2", final_web_data_2_df),
    ("experiment", experiment_clients_df),
):
    print(f"{table_label} missing client_id:", table["client_id"].isna().sum())

demo missing client_id: 0
web1 missing client_id: 0
web2 missing client_id: 0
experiment missing client_id: 0


In [76]:
# Size of each experiment arm (important preview). dropna=False keeps roster
# rows with a missing Variation visible as their own NaN bucket.
variation_col = experiment_clients_df["Variation"]
variation_col.value_counts(dropna=False)

Variation
Test       26968
Control    23532
NaN        20109
Name: count, dtype: int64

### Step 2: Experiment data - Validate absence of arm misclassification.

Each client_id should appear exactly once, and in exactly one group, in the experiment file. If duplicates appear, or if a client_id in the web logs is missing from the experiment file, flag and exclude it.


In [35]:
# Arm-misclassification check, in two parts:
#   1) no client_id may be listed under more than one Variation, and
#   2) no client_id may appear on more than one roster row.
# nunique() alone cannot see (2): a client repeated with the same Variation
# (or with a missing one, since nunique ignores NaN) still counts as <= 1 group.
arm_counts = (
    experiment_clients_df
    .groupby("client_id")["Variation"]
    .nunique()
    .reset_index(name="n_groups")
)

# Clients assigned to more than one arm.
bad_clients = arm_counts[arm_counts["n_groups"] > 1]

# Roster rows whose client_id repeats an earlier row.
duplicate_roster_rows = experiment_clients_df["client_id"].duplicated().sum()
print("duplicated client_id rows in experiment roster:", duplicate_roster_rows)

bad_clients.shape

(0, 2)

Group web logs by client_id and count distinct visitor_id. If most clients have one and a minority have many, keep visitor_id as the unit of analysis. Only collapse visitor_id when two IDs begin a step-1 sequence within minutes of each other.

### Step 4: Web logs data - Account for Session Fragmentation - Build "session" timelines - Compute step durations and success-level outcomes.

For each (client_id, visitor_id), sort by date_time. Check monotonicity of process_step. Minor regressions can be tolerated; major reversals get flagged and excluded. Within each sorted sequence, compute time between steps, derive a success flag per step, and classify the visitor_id as converted or not.

In [5]:
# Stack the two web-log parts into one table with a fresh 0..n-1 index, then
# discard rows that are exact copies of an earlier row across all columns.
import pandas as pd

web_parts = [final_web_data_1_df, final_web_data_2_df]
final_web_data_df = pd.concat(web_parts, ignore_index=True).drop_duplicates()

final_web_data_df.shape

(744641, 5)

In [6]:
# drop_duplicates removed roughly 1.4% of the rows (10,764 of 755,405), which is expected when logs are split across files or exported twice. It indicates repeated rows, not behavioral data loss.

In [7]:
# Preview the first rows of the combined, de-duplicated web log.
final_web_data_df.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


In [8]:
# Number of logged events per funnel step.
final_web_data_df["process_step"].value_counts()

process_step
start      234999
step_1     162797
step_2     132750
step_3     111589
confirm    102506
Name: count, dtype: int64

In [9]:
# Parse the date_time strings into real timestamps so they can be sorted and
# subtracted later on; the dtypes listing confirms the conversion.
parsed_timestamps = pd.to_datetime(final_web_data_df["date_time"])
final_web_data_df["date_time"] = parsed_timestamps

final_web_data_df.dtypes


client_id                int64
visitor_id              object
visit_id                object
process_step            object
date_time       datetime64[ns]
dtype: object

In [29]:
# Put the events in timeline order: by client, then visitor, then timestamp.
# NOTE(review): timestamps have one-second resolution, so events of the same
# visitor that share a second are not ordered by this key alone — TODO confirm
# whether such ties need a tie-breaker.
timeline_keys = ["client_id", "visitor_id", "date_time"]
final_web_data_df = final_web_data_df.sort_values(by=timeline_keys)

final_web_data_df.head(10)


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly
285515,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,0,,False
285514,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,1,1.0,False
285513,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,2,1.0,False
285512,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,3,1.0,False
285511,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,4,1.0,False
628456,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:26:55,0,,False
628331,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:42:43,0,0.0,False
442857,546,475037402_89828530214,731811517_9330176838_94847,start,2017-06-17 10:03:29,0,,False
442856,546,475037402_89828530214,731811517_9330176838_94847,step_1,2017-06-17 10:03:39,1,1.0,False
442855,546,475037402_89828530214,731811517_9330176838_94847,step_2,2017-06-17 10:03:52,2,1.0,False


In [11]:
# Convert the process_step labels into a numerical series: each funnel stage is
# numbered by its position in the funnel (start = 0 ... confirm = 4).
funnel_order = ["start", "step_1", "step_2", "step_3", "confirm"]
step_map = {stage: position for position, stage in enumerate(funnel_order)}

final_web_data_df["process_step_num"] = final_web_data_df["process_step"].map(step_map)

# A label missing from step_map would show up here as NaN.
final_web_data_df["process_step_num"].isna().sum()


np.int64(0)

In [13]:
# step_diff = change in step number relative to the previous row of the same
# (client_id, visitor_id) group; the first row of each group gets NaN.
# Changes between -1 and +1 are acceptable, bigger sudden changes are not.
attempt_keys = ["client_id", "visitor_id"]
steps_by_attempt = final_web_data_df.groupby(attempt_keys)["process_step_num"]
final_web_data_df["step_diff"] = steps_by_attempt.diff()

preview_cols = ["client_id", "visitor_id", "process_step_num", "step_diff"]
final_web_data_df[preview_cols].head(10)


Unnamed: 0,client_id,visitor_id,process_step_num,step_diff
285515,169,201385055_71273495308,0,
285514,169,201385055_71273495308,1,1.0
285513,169,201385055_71273495308,2,1.0
285512,169,201385055_71273495308,3,1.0
285511,169,201385055_71273495308,4,1.0
628456,336,64757908_3400128256,0,
628331,336,64757908_3400128256,0,0.0
442857,546,475037402_89828530214,0,
442856,546,475037402_89828530214,1,1.0
442855,546,475037402_89828530214,2,1.0


In [17]:
# Row count should be unchanged; only the derived columns were added.
final_web_data_df.shape

(744641, 7)

In [18]:
# Identify suspicious timelines where the steps jump backwards too far: flag
# rows whose step number drops by more than one relative to the previous row.
# step_diff is NaN on the first row of each group, and NaN < -1 evaluates to
# False, so those rows are never flagged.
final_web_data_df["step_anomaly"] = final_web_data_df["step_diff"] < -1

# Only the last expression of a cell is rendered, so the counts are printed explicitly.
print(final_web_data_df["step_anomaly"].value_counts())
final_web_data_df.head(5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly
285515,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,0,,False
285514,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,1,1.0,False
285513,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,2,1.0,False
285512,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,3,1.0,False
285511,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,4,1.0,False


In [19]:
# Summary of the anomaly flag: total rows, most frequent value and its frequency.
final_web_data_df["step_anomaly"].describe()

count     744641
unique         2
top        False
freq      722287
Name: step_anomaly, dtype: object

In [20]:
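# The describe() output above shows 722,287 of 744,641 rows are False, i.e. 22,354 rows (about 3.0%) are flagged as anomalies (an earlier note here said 19,455, which does not match the displayed output). In typical clickstream data, one to three percent of attempts show structural anomalies; this sits at the top of that range. Note that the flag is per event row, not per attempt.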

In [21]:
# Build the time spent on each step: the gap between an event and the next event
# of the same (client_id, visitor_id) group. The last event of a group has no
# successor, so its next_time is NaT and its duration is NaN.
clean_web_data_df = final_web_data_df.copy()

timestamps_by_attempt = clean_web_data_df.groupby(["client_id", "visitor_id"])["date_time"]
clean_web_data_df["next_time"] = timestamps_by_attempt.shift(-1)

elapsed = clean_web_data_df["next_time"] - clean_web_data_df["date_time"]
clean_web_data_df["duration_seconds"] = elapsed.dt.total_seconds()

clean_web_data_df.head(5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds
285515,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,0,,False,2017-04-12 20:19:45,9.0
285514,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,1,1.0,False,2017-04-12 20:20:31,46.0
285513,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,2,1.0,False,2017-04-12 20:22:05,94.0
285512,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,3,1.0,False,2017-04-12 20:23:09,64.0
285511,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,4,1.0,False,NaT,


In [27]:
# Per-step outcome: compare each event's step number with the step number of the
# next event in the same (client_id, visitor_id) group.
#   next step >= current step -> "success"
#   next step <  current step -> "error"
# The last event of a group has no next step (NaN); the comparison is then
# False, so that row is labelled "error" by this cell.
steps_by_attempt = clean_web_data_df.groupby(["client_id", "visitor_id"])["process_step_num"]
clean_web_data_df["next_step"] = steps_by_attempt.shift(-1)

clean_web_data_df["step_success"] = clean_web_data_df["next_step"].ge(
    clean_web_data_df["process_step_num"]
)

# Human-readable label for the boolean flag.
outcome_labels = {True: "success", False: "error"}
clean_web_data_df["step_outcome"] = clean_web_data_df["step_success"].map(outcome_labels)

clean_web_data_df.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds,next_step,step_success,step_outcome
285515,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,0,,False,2017-04-12 20:19:45,9.0,1.0,True,success
285514,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,1,1.0,False,2017-04-12 20:20:31,46.0,2.0,True,success
285513,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,2,1.0,False,2017-04-12 20:22:05,94.0,3.0,True,success
285512,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,3,1.0,False,2017-04-12 20:23:09,64.0,4.0,True,success
285511,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,4,1.0,False,NaT,,,False,error
628456,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:26:55,0,,False,2017-06-01 07:42:43,948.0,0.0,True,success
628331,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:42:43,0,0.0,False,NaT,,,False,error
442857,546,475037402_89828530214,731811517_9330176838_94847,start,2017-06-17 10:03:29,0,,False,2017-06-17 10:03:39,10.0,1.0,True,success
442856,546,475037402_89828530214,731811517_9330176838_94847,step_1,2017-06-17 10:03:39,1,1.0,False,2017-06-17 10:03:52,13.0,2.0,True,success
442855,546,475037402_89828530214,731811517_9330176838_94847,step_2,2017-06-17 10:03:52,2,1.0,False,2017-06-17 10:05:19,87.0,3.0,True,success


In [25]:
# The last event of an attempt has no following step, so the previous cell
# labels it "error", which is misleading. Relabel those rows:
#   - last event is "confirm"    -> "completed"
#   - last event is anything else -> "dropoff"
attempt_ended = clean_web_data_df["next_step"].isna()
on_confirm = clean_web_data_df["process_step"].eq("confirm")

# 1. Confirmation that ends the attempt
mask_last_confirm = on_confirm & attempt_ended
# 2. Non-confirmation that ends the attempt
mask_last_non_confirm = ~on_confirm & attempt_ended

clean_web_data_df.loc[mask_last_confirm, "step_outcome"] = "completed"
clean_web_data_df.loc[mask_last_non_confirm, "step_outcome"] = "dropoff"

clean_web_data_df.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds,next_step,step_success,step_outcome
285515,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,0,,False,2017-04-12 20:19:45,9.0,1.0,True,success
285514,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,1,1.0,False,2017-04-12 20:20:31,46.0,2.0,True,success
285513,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,2,1.0,False,2017-04-12 20:22:05,94.0,3.0,True,success
285512,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,3,1.0,False,2017-04-12 20:23:09,64.0,4.0,True,success
285511,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,4,1.0,False,NaT,,,False,completed
628456,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:26:55,0,,False,2017-06-01 07:42:43,948.0,0.0,True,success
628331,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:42:43,0,0.0,False,NaT,,,False,dropoff
442857,546,475037402_89828530214,731811517_9330176838_94847,start,2017-06-17 10:03:29,0,,False,2017-06-17 10:03:39,10.0,1.0,True,success
442856,546,475037402_89828530214,731811517_9330176838_94847,step_1,2017-06-17 10:03:39,1,1.0,False,2017-06-17 10:03:52,13.0,2.0,True,success
442855,546,475037402_89828530214,731811517_9330176838_94847,step_2,2017-06-17 10:03:52,2,1.0,False,2017-06-17 10:05:19,87.0,3.0,True,success


In [26]:
# One row per (client_id, visitor_id) attempt, summarising its events:
#   converted          - True if any row of the attempt is labelled "completed"
#   first_time         - earliest event timestamp
#   last_time          - latest event timestamp
#   total_duration_sec - sum of the per-step durations (NaN durations are skipped)
#   n_events           - number of events with a step number
#   n_errors           - number of rows labelled "error"
# The two label tests are computed once as boolean columns so the aggregation
# can use the built-in "any"/"sum" reducers instead of a Python lambda per group.
# NOTE(review): "completed" is only assigned to a confirm that is the LAST event
# of the attempt, so a confirm followed by any later event does not count as
# converted here — confirm this matches the intended conversion KPI.
# NOTE(review): total_duration_sec includes idle time between events, however
# long the gap is.
attempts_df = (
    clean_web_data_df
    .assign(
        _is_completed=clean_web_data_df["step_outcome"].eq("completed"),
        _is_error=clean_web_data_df["step_outcome"].eq("error"),
    )
    .groupby(["client_id", "visitor_id"])
    .agg(
        converted=("_is_completed", "any"),
        first_time=("date_time", "min"),
        last_time=("date_time", "max"),
        total_duration_sec=("duration_seconds", "sum"),
        n_events=("process_step_num", "count"),
        n_errors=("_is_error", "sum"),
    )
    .reset_index()
)
attempts_df.head(10)

Unnamed: 0,client_id,visitor_id,converted,first_time,last_time,total_duration_sec,n_events,n_errors
0,169,201385055_71273495308,True,2017-04-12 20:19:36,2017-04-12 20:23:09,213.0,5,0
1,336,64757908_3400128256,False,2017-06-01 07:26:55,2017-06-01 07:42:43,948.0,2,0
2,546,475037402_89828530214,True,2017-06-17 10:03:29,2017-06-17 10:05:42,133.0,5,0
3,555,402506806_56087378777,True,2017-04-15 12:57:56,2017-04-15 13:00:34,158.0,5,0
4,647,66758770_53988066587,True,2017-04-12 15:41:28,2017-04-12 15:47:45,377.0,5,0
5,722,919259913_64837298108,True,2017-04-19 14:50:10,2017-04-19 15:00:09,599.0,9,2
6,786,439747392_30293505155,True,2017-06-01 11:00:22,2017-06-01 11:05:34,312.0,6,0
7,805,831412807_82548325803,False,2017-06-08 01:10:29,2017-06-17 19:23:20,843171.0,9,2
8,832,645971208_10952400740,True,2017-06-16 10:38:38,2017-06-16 10:40:52,134.0,7,1
9,934,810392784_45004760546,False,2017-04-18 02:36:30,2017-04-18 02:38:52,142.0,4,0


### Step 3: Web logs data - Inspect session multiplicity

In [47]:
# Count the distinct visitor_id values logged for each client.
sessions_per_client = (
    clean_web_data_df
    .groupby("client_id")["visitor_id"]
    .nunique()
    .reset_index(name="n_sessions")
)

# quick overview — only a cell's last expression is rendered, so the summary
# statistics are printed explicitly before the frequency table is displayed
print(sessions_per_client["n_sessions"].describe())
sessions_per_client["n_sessions"].value_counts().head(10)


n_sessions
1     110384
2       8407
3       1019
4        226
5         56
6         35
7         19
11         4
10         3
8          2
Name: count, dtype: int64

In [49]:
# First timestamp of every (client_id, visitor_id) session, one row per session.
session_keys = ["client_id", "visitor_id"]
session_starts = (
    clean_web_data_df
    .groupby(session_keys, as_index=False)["date_time"]
    .min()
    .rename(columns={"date_time": "session_start"})
)
session_starts.head(5)

Unnamed: 0,client_id,visitor_id,session_start
0,169,201385055_71273495308,2017-04-12 20:19:36
1,336,64757908_3400128256,2017-06-01 07:26:55
2,546,475037402_89828530214,2017-06-17 10:03:29
3,555,402506806_56087378777,2017-04-15 12:57:56
4,647,66758770_53988066587,2017-04-12 15:41:28


In [50]:
# Order each client's sessions by their start time.
session_order = ["client_id", "session_start"]
session_starts = session_starts.sort_values(by=session_order)
session_starts.head(5)

Unnamed: 0,client_id,visitor_id,session_start
0,169,201385055_71273495308,2017-04-12 20:19:36
1,336,64757908_3400128256,2017-06-01 07:26:55
2,546,475037402_89828530214,2017-06-17 10:03:29
3,555,402506806_56087378777,2017-04-15 12:57:56
4,647,66758770_53988066587,2017-04-12 15:41:28


In [52]:
# Minutes between a session's start and the start of the same client's next
# session. A client's last (or only) session has no successor, so it gets NaT / NaN.
starts_by_client = session_starts.groupby("client_id")["session_start"]
session_starts["next_start"] = starts_by_client.shift(-1)

gap_to_next = session_starts["next_start"] - session_starts["session_start"]
session_starts["gap_minutes"] = gap_to_next.dt.total_seconds() / 60

session_starts.head(5)

Unnamed: 0,client_id,visitor_id,session_start,next_start,gap_minutes
0,169,201385055_71273495308,2017-04-12 20:19:36,NaT,
1,336,64757908_3400128256,2017-06-01 07:26:55,NaT,
2,546,475037402_89828530214,2017-06-17 10:03:29,NaT,
3,555,402506806_56087378777,2017-04-15 12:57:56,NaT,
4,647,66758770_53988066587,2017-04-12 15:41:28,NaT,


In [53]:
# One row per (client_id, visitor_id) session.
session_starts.shape

(131899, 5)

In [62]:
# Keep only rows that have a gap to a following session, i.e. sessions of
# clients with at least two sessions, and summarise those gaps.
multi_gap = session_starts.dropna(subset=["gap_minutes"])

multi_gap["gap_minutes"].describe()


count     11742.000000
mean      20284.527201
std       26451.806934
min           0.033333
25%        1811.250000
50%        8540.225000
75%       25982.187500
max      136843.450000
Name: gap_minutes, dtype: float64

In [55]:
# Show the ten shortest gaps; very small gaps signal session fragmentation.
gaps_ascending = multi_gap.sort_values(by="gap_minutes")
gaps_ascending.head(10)


Unnamed: 0,client_id,visitor_id,session_start,next_start,gap_minutes
24877,1907196,208475750_15923508166,2017-04-05 17:04:44,2017-04-05 17:04:46,0.033333
121109,9188874,105001234_961608697,2017-04-11 19:17:07,2017-04-11 19:17:10,0.05
11657,891998,102933384_25931315710,2017-06-02 08:51:39,2017-06-02 08:51:50,0.183333
54610,4164130,796551985_44812305047,2017-04-24 14:44:01,2017-04-24 14:44:16,0.25
8374,638962,411980426_9322034038,2017-06-04 15:15:18,2017-06-04 15:15:54,0.6
124513,9441768,192834352_97978828430,2017-04-04 11:26:34,2017-04-04 11:27:13,0.65
75567,5754354,885779170_16001783484,2017-04-14 20:05:00,2017-04-14 20:05:43,0.716667
41459,3174146,988434922_65305956189,2017-04-07 09:36:59,2017-04-07 09:37:45,0.766667
12901,987938,567594156_20645500114,2017-04-05 16:36:35,2017-04-05 16:37:26,0.85
44609,3416779,653893638_83904245427,2017-04-08 18:22:05,2017-04-08 18:22:59,0.9


In [56]:
# Stage 1 – group sessions within 5 minutes.
# gap_prev_min = minutes since the start of the same client's previous session
# (NaN for a client's first session).
start_by_client = session_starts.groupby("client_id")["session_start"]
session_starts["gap_prev_min"] = start_by_client.diff().dt.total_seconds() / 60

session_starts.head(5)

Unnamed: 0,client_id,visitor_id,session_start,next_start,gap_minutes,gap_prev_min
0,169,201385055_71273495308,2017-04-12 20:19:36,NaT,,
1,336,64757908_3400128256,2017-06-01 07:26:55,NaT,,
2,546,475037402_89828530214,2017-06-17 10:03:29,NaT,,
3,555,402506806_56087378777,2017-04-15 12:57:56,NaT,,
4,647,66758770_53988066587,2017-04-12 15:41:28,NaT,,


In [57]:
# A row opens a new merged session when it is the client's first session
# (no previous gap) or when it starts more than `merge_gap_minutes` after the
# previous session's start. Otherwise it is folded into the previous one.
merge_gap_minutes = 5
is_first_for_client = session_starts["gap_prev_min"].isna()
beyond_merge_gap = session_starts["gap_prev_min"].gt(merge_gap_minutes)
session_starts["new_session"] = is_first_for_client | beyond_merge_gap

# Running count of "new session" flags per client = merged session id (1, 2, ...).
session_starts["merged_session_id"] = (
    session_starts.groupby("client_id")["new_session"].cumsum()
)

session_starts.head(5)

Unnamed: 0,client_id,visitor_id,session_start,next_start,gap_minutes,gap_prev_min,new_session,merged_session_id
0,169,201385055_71273495308,2017-04-12 20:19:36,NaT,,,True,1
1,336,64757908_3400128256,2017-06-01 07:26:55,NaT,,,True,1
2,546,475037402_89828530214,2017-06-17 10:03:29,NaT,,,True,1
3,555,402506806_56087378777,2017-04-15 12:57:56,NaT,,,True,1
4,647,66758770_53988066587,2017-04-12 15:41:28,NaT,,,True,1


In [58]:
# Attach merged_session_id back onto the event-level web data.
# - Any merged_session_id left over from a previous run of this cell is dropped
#   first; otherwise a re-run would produce merged_session_id_x / _y columns and
#   the lookup on the last line would raise a KeyError.
# - validate="many_to_one" makes pandas raise if session_starts ever holds more
#   than one row per (client_id, visitor_id), instead of silently duplicating events.
session_id_lookup = session_starts[["client_id", "visitor_id", "merged_session_id"]]

clean_web_data_df = (
    clean_web_data_df
    .drop(columns="merged_session_id", errors="ignore")
    .merge(
        session_id_lookup,
        on=["client_id", "visitor_id"],
        how="left",
        validate="many_to_one",
    )
)

# Events that found no matching session (expected: 0).
clean_web_data_df["merged_session_id"].isna().sum()


np.int64(0)

In [59]:
# From now on an attempt is identified by (client_id, merged_session_id)
# instead of visitor_id, so the events are re-ordered on that key.
merged_timeline_keys = ["client_id", "merged_session_id", "date_time"]
clean_web_data_df = clean_web_data_df.sort_values(by=merged_timeline_keys)


In [60]:
# Rebuild every per-event column from Step 4, this time treating
# (client_id, merged_session_id) as one attempt instead of (client_id, visitor_id).

# Order events chronologically inside each merged attempt.
clean_web_data_df = clean_web_data_df.sort_values(
    by=["client_id", "merged_session_id", "date_time"]
)

g = clean_web_data_df.groupby(["client_id", "merged_session_id"])

# Step-to-step movement, and the "jumped back by more than one step" flag.
clean_web_data_df["step_diff"] = g["process_step_num"].diff()
clean_web_data_df["step_anomaly"] = clean_web_data_df["step_diff"].lt(-1)

# Seconds until the next event of the same attempt (NaN on the attempt's last event).
clean_web_data_df["next_time"] = g["date_time"].shift(-1)
clean_web_data_df["duration_seconds"] = (
    clean_web_data_df["next_time"].sub(clean_web_data_df["date_time"]).dt.total_seconds()
)

# Next step number, and whether the client stayed on or moved past the current step.
clean_web_data_df["next_step"] = g["process_step_num"].shift(-1)
clean_web_data_df["step_success"] = clean_web_data_df["next_step"].ge(
    clean_web_data_df["process_step_num"]
)

# Text label for the flag.
outcome_labels = {True: "success", False: "error"}
clean_web_data_df["step_outcome"] = clean_web_data_df["step_success"].map(outcome_labels)

# The last event of a merged attempt has no next step: relabel it as
# "completed" when it is a confirm and as "dropoff" otherwise.
mask_last = clean_web_data_df["next_step"].isna()
ends_on_confirm = clean_web_data_df["process_step"].eq("confirm")

mask_last_confirm = mask_last & ends_on_confirm
mask_last_non_confirm = mask_last & ~ends_on_confirm

clean_web_data_df.loc[mask_last_confirm, "step_outcome"] = "completed"
clean_web_data_df.loc[mask_last_non_confirm, "step_outcome"] = "dropoff"


clean_web_data_df.head(10)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds,next_step,step_success,step_outcome,merged_session_id
0,169,201385055_71273495308,749567106_99161211863_557568,start,2017-04-12 20:19:36,0,,False,2017-04-12 20:19:45,9.0,1.0,True,success,1
1,169,201385055_71273495308,749567106_99161211863_557568,step_1,2017-04-12 20:19:45,1,1.0,False,2017-04-12 20:20:31,46.0,2.0,True,success,1
2,169,201385055_71273495308,749567106_99161211863_557568,step_2,2017-04-12 20:20:31,2,1.0,False,2017-04-12 20:22:05,94.0,3.0,True,success,1
3,169,201385055_71273495308,749567106_99161211863_557568,step_3,2017-04-12 20:22:05,3,1.0,False,2017-04-12 20:23:09,64.0,4.0,True,success,1
4,169,201385055_71273495308,749567106_99161211863_557568,confirm,2017-04-12 20:23:09,4,1.0,False,NaT,,,False,completed,1
5,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:26:55,0,,False,2017-06-01 07:42:43,948.0,0.0,True,success,1
6,336,64757908_3400128256,649044751_80905125055_554468,start,2017-06-01 07:42:43,0,0.0,False,NaT,,,False,dropoff,1
7,546,475037402_89828530214,731811517_9330176838_94847,start,2017-06-17 10:03:29,0,,False,2017-06-17 10:03:39,10.0,1.0,True,success,1
8,546,475037402_89828530214,731811517_9330176838_94847,step_1,2017-06-17 10:03:39,1,1.0,False,2017-06-17 10:03:52,13.0,2.0,True,success,1
9,546,475037402_89828530214,731811517_9330176838_94847,step_2,2017-06-17 10:03:52,2,1.0,False,2017-06-17 10:05:19,87.0,3.0,True,success,1


In [61]:
# Row count after the merge and recomputation.
clean_web_data_df.shape

(744641, 14)

### Step 5: Temporal truncation

Any attempt whose last timestamp sits very close to the experiment’s end can be marked ambiguous. We may exclude them or keep them with a clear note that their status is censored.

In [63]:
import pandas as pd

# Timestamp used as the end of the experiment window.
experiment_end = pd.Timestamp("2017-06-20 23:59:59")

# Sanity check: number of logged events stamped later than the experiment end.
clean_web_data_df["date_time"].gt(experiment_end).sum()


np.int64(0)

In [64]:
# Flag attempts that begin too close to the experiment end to be judged fairly.
# NOTE(review): the section text talks about an attempt's *last* timestamp, but
# this check uses the attempt's *first* event (min of date_time) — confirm which
# one is intended.
experiment_end = pd.Timestamp("2017-06-20 23:59:59")
TRUNCATION_WINDOW_MIN = 5  # attempts starting less than this many minutes before the end are flagged

# First event time of every attempt, i.e. every (client_id, merged_session_id) pair.
first_event_per_attempt = clean_web_data_df.groupby(
    ["client_id", "merged_session_id"]
)["date_time"].min()

attempt_starts = first_event_per_attempt.reset_index(name="attempt_start")

# Distance, in minutes, between the attempt's first event and the experiment end.
seconds_to_end = (experiment_end - attempt_starts["attempt_start"]).dt.total_seconds()
attempt_starts["minutes_to_end"] = seconds_to_end / 60

attempt_starts["is_truncated_risk"] = (
    attempt_starts["minutes_to_end"] < TRUNCATION_WINDOW_MIN
)

# Displayed result: how many attempts are / are not at risk of truncation.
attempt_starts["is_truncated_risk"].value_counts()


is_truncated_risk
False    131670
True          1
Name: count, dtype: int64

In [70]:
# We leave the single attempt flagged as a truncation risk in the data: 1 attempt out of 131,671 is too small to affect the results in any way.

# Merging the databases

Merge on client_id, using the experiment client_id as the base, since these are the only customers that matter.

We prefer a left merge (`how="left"`) over an inner merge because it keeps all experiment clients, including:
- those who never visited (web columns NaN)
- those missing demographics (demo columns NaN)

An inner merge would silently drop:
- assigned clients with no web activity
- assigned clients missing demographics

For an A/B test, dropping assigned-but-inactive clients biases completion rates, so left is preferable. We can always later filter to “clients with web activity and complete demographics” explicitly, instead of letting the join hide them.

In [77]:
# Experiment roster is the left (base) table: every roster row is kept, and
# demographic columns are NaN for clients absent from final_demo_df.
exp_demo_df = pd.merge(
    experiment_clients_df,
    final_demo_df,
    how="left",
    on="client_id",
)
exp_demo_df.head(5)

Unnamed: 0,client_id,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,9988021,Test,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0
1,8320017,Test,22.0,274.0,34.5,M,2.0,36001.9,5.0,8.0
2,4033851,Control,12.0,149.0,63.5,M,2.0,142642.26,5.0,8.0
3,1982004,Test,6.0,80.0,44.5,U,2.0,30231.76,1.0,4.0
4,9294070,Control,5.0,70.0,29.0,U,2.0,34254.54,0.0,3.0


In [78]:
exp_demo_df.shape

(70609, 10)

In [99]:
# Collapse the event log to one row per attempt, where an attempt is a
# (client_id, merged_session_id) pair.
attempt_keys = ["client_id", "merged_session_id"]

attempt_metrics = {
    # True when at least one event of the attempt has outcome "completed".
    "converted": pd.NamedAgg("step_outcome", lambda outcomes: outcomes.eq("completed").any()),
    # Timestamps of the first and last event of the attempt.
    "first_time": pd.NamedAgg("date_time", "min"),
    "last_time": pd.NamedAgg("date_time", "max"),
    # Sum of the per-event durations recorded for the attempt.
    "total_duration_sec": pd.NamedAgg("duration_seconds", "sum"),
    # Number of events with a non-null step number.
    "n_events": pd.NamedAgg("process_step_num", "count"),
    # NOTE(review): merged_session_id is itself a grouping key, so this is 1 on
    # every row; kept only so the resulting columns stay the same.
    "n_attempts": pd.NamedAgg("merged_session_id", "nunique"),
    # Number of events whose outcome is "error".
    "n_errors": pd.NamedAgg("step_outcome", lambda outcomes: outcomes.eq("error").sum()),
    # True when any event of the attempt is flagged as a step anomaly.
    "any_anomaly": pd.NamedAgg("step_anomaly", "any"),
}

attempts_df = (
    clean_web_data_df
    .groupby(attempt_keys)
    .agg(**attempt_metrics)
    .reset_index()
)
attempts_df.head(5)

Unnamed: 0,client_id,merged_session_id,converted,first_time,last_time,total_duration_sec,n_events,n_attempts,n_errors,any_anomaly
0,169,1,True,2017-04-12 20:19:36,2017-04-12 20:23:09,213.0,5,1,0,False
1,336,1,False,2017-06-01 07:26:55,2017-06-01 07:42:43,948.0,2,1,0,False
2,546,1,True,2017-06-17 10:03:29,2017-06-17 10:05:42,133.0,5,1,0,False
3,555,1,True,2017-04-15 12:57:56,2017-04-15 13:00:34,158.0,5,1,0,False
4,647,1,True,2017-04-12 15:41:28,2017-04-12 15:47:45,377.0,5,1,0,False


In [100]:
# Attach the per-attempt metrics to the roster + demographics frame.
# attempts_df holds one row per (client_id, merged_session_id), so a client with
# several attempts appears on several rows of the result.
final_exp_df = pd.merge(exp_demo_df, attempts_df, how="left", on="client_id")

final_exp_df.head(5)

Unnamed: 0,client_id,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,merged_session_id,converted,first_time,last_time,total_duration_sec,n_events,n_attempts,n_errors,any_anomaly
0,9988021,Test,5.0,64.0,79.0,U,2.0,189023.86,1.0,4.0,1,False,2017-04-12 16:57:27,2017-04-17 15:27:07,426580.0,15,1,2,False
1,8320017,Test,22.0,274.0,34.5,M,2.0,36001.9,5.0,8.0,1,True,2017-04-05 13:08:06,2017-04-05 13:10:05,119.0,5,1,0,False
2,4033851,Control,12.0,149.0,63.5,M,2.0,142642.26,5.0,8.0,1,True,2017-04-05 12:04:00,2017-04-05 12:29:03,1503.0,15,1,3,True
3,1982004,Test,6.0,80.0,44.5,U,2.0,30231.76,1.0,4.0,1,True,2017-04-17 11:59:42,2017-04-17 12:02:42,180.0,5,1,0,False
4,9294070,Control,5.0,70.0,29.0,U,2.0,34254.54,0.0,3.0,1,False,2017-04-17 11:16:13,2017-04-17 11:16:28,15.0,2,1,0,False


In [84]:
final_exp_df.shape

(78723, 18)

In [85]:
final_exp_df.isna().sum()

client_id                 0
Variation             22454
clnt_tenure_yr           18
clnt_tenure_mnth         18
clnt_age                 19
gendr                    18
num_accts                18
bal                      18
calls_6_mnth             18
logons_6_mnth            18
merged_session_id         0
converted                 0
first_time                0
last_time                 0
total_duration_sec        0
n_events                  0
n_errors                  0
any_anomaly               0
dtype: int64

# Data Cleaning

In [86]:
# Work on an independent copy so final_exp_df itself stays untouched; the rows
# without a Variation are filtered out of this copy afterwards.
final_df = final_exp_df.copy(deep=True)

In [87]:
# Keep only the rows that were assigned to an experiment arm.
final_df = final_df.dropna(subset=["Variation"])
# Confirmation: number of rows still lacking a Variation (displayed value).
final_df["Variation"].isna().sum()

np.int64(0)

In [89]:
final_df.shape

(56269, 18)

In [90]:
final_df.isna().sum()

client_id              0
Variation              0
clnt_tenure_yr        16
clnt_tenure_mnth      16
clnt_age              17
gendr                 16
num_accts             16
bal                   16
calls_6_mnth          16
logons_6_mnth         16
merged_session_id      0
converted              0
first_time             0
last_time              0
total_duration_sec     0
n_events               0
n_errors               0
any_anomaly            0
dtype: int64

In [92]:
# Demographic columns checked for missing values.
demo_cols = [
    "clnt_tenure_yr", "clnt_tenure_mnth", "clnt_age", "gendr",
    "num_accts", "bal", "calls_6_mnth", "logons_6_mnth"
]

# Rows with at least one missing demographic value, kept for ad-hoc inspection.
# (The former bare `rows_with_nan.shape` line was removed: a notebook cell only
# renders its last expression, so that line never displayed anything.)
rows_with_nan = final_df[final_df[demo_cols].isna().any(axis=1)]

# Missing-value count per demographic column — the value this cell displays.
final_df[demo_cols].isna().sum()


clnt_tenure_yr      16
clnt_tenure_mnth    16
clnt_age            17
gendr               16
num_accts           16
bal                 16
calls_6_mnth        16
logons_6_mnth       16
dtype: int64

In [93]:
# Discard every row that lacks at least one demographic value, then take an
# independent copy of what remains.
final_df = final_df.dropna(subset=demo_cols, how="any").copy()
# Displayed result: remaining missing values per column.
final_df.isna().sum()

client_id             0
Variation             0
clnt_tenure_yr        0
clnt_tenure_mnth      0
clnt_age              0
gendr                 0
num_accts             0
bal                   0
calls_6_mnth          0
logons_6_mnth         0
merged_session_id     0
converted             0
first_time            0
last_time             0
total_duration_sec    0
n_events              0
n_errors              0
any_anomaly           0
dtype: int64

In [94]:
final_df.shape

(56252, 18)

In [95]:
# Export the cleaned per-attempt experiment dataset (final_df) in two formats.
# Both paths are relative, so the files land in the current working directory.
# CSV copy; index=False leaves the row index out of the file.
final_df.to_csv("final_experiment_dataset.csv", index=False)

# Parquet copy (for notebooks), likewise written without the row index.
final_df.to_parquet("final_experiment_dataset.parquet", index=False)


# Client Behavior Analysis

For the client behavior analysis we will NOT use the final_df dataframe that accounts only for the clients that participated in the experiment. That dataframe has a time window of 3 months and 5 days, whereas our demographics dataframe gives us values and behaviors over 6 months. Therefore, we will use a dataframe that is the merge between demographics and web_data dataframes.

That means:
- Without filtering to experiment roster.
- No Test/Control, no KPIs.

In [101]:
# Collapse the web event log to one row per client_id (behavior features).
client_metrics = {
    # Number of distinct merged sessions (attempts) the client has.
    "n_attempts": pd.NamedAgg("merged_session_id", "nunique"),
    # Number of logged events with a non-null process_step.
    "n_events": pd.NamedAgg("process_step", "count"),
    # crude; refine if needed
    # NOTE(review): despite the name, this is the client's TOTAL count of
    # "error" outcomes across all attempts — it is not divided by n_attempts.
    "n_errors_per_attempt": pd.NamedAgg("step_outcome", lambda outcomes: outcomes.eq("error").sum()),
    # True when any of the client's events has outcome "completed".
    "converted": pd.NamedAgg("step_outcome", lambda outcomes: outcomes.eq("completed").any()),
    # Sum of all per-event durations recorded for the client.
    "total_duration_sec": pd.NamedAgg("duration_seconds", "sum"),
}

client_web = (
    clean_web_data_df
    .groupby("client_id")
    .agg(**client_metrics)
    .reset_index()
)


In [102]:
# Left-join the per-client web features onto the demographics table: every
# demographics row is kept, with NaN web features where a client has no web rows.
demo_behavior_df = pd.merge(final_demo_df, client_web, how="left", on="client_id")
demo_behavior_df.head(10)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth,n_attempts,n_events,n_errors_per_attempt,converted,total_duration_sec
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0,1,11,0,True,348758.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0,1,6,0,True,295.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0,2,5,0,False,117.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0,1,1,0,False,0.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0,1,1,0,False,0.0
5,3727881,5.0,71.0,30.5,U,2.0,23915.6,0.0,3.0,1,7,1,True,390.0
6,272934,5.0,66.0,58.5,U,2.0,27021.42,2.0,5.0,1,5,0,True,236.0
7,388801,30.0,361.0,57.5,M,5.0,522498.72,1.0,4.0,1,5,0,True,589.0
8,285619,30.0,369.0,67.5,M,2.0,299388.72,3.0,6.0,1,9,2,True,280.0
9,8198645,15.0,189.0,54.5,F,2.0,382303.83,6.0,9.0,1,5,0,True,582.0


In [109]:
demo_behavior_df.shape

(70609, 14)

In [122]:
demo_behavior_df.isna().sum()

client_id                0
clnt_tenure_yr          14
clnt_tenure_mnth        14
clnt_age                15
gendr                   14
num_accts               14
bal                     14
calls_6_mnth            14
logons_6_mnth           14
n_attempts               0
n_events                 0
n_errors_per_attempt     0
converted                0
total_duration_sec       0
dtype: int64

In [123]:
# Demographic columns checked for missing values in demo_behavior_df.
demo_behavior_cols = [
    "clnt_tenure_yr", "clnt_tenure_mnth", "clnt_age", "gendr",
    "num_accts", "bal", "calls_6_mnth", "logons_6_mnth"
]

# Rows with at least one missing demographic value, kept for ad-hoc inspection.
# (The former bare `rows_with_nan.shape` line was removed: a notebook cell only
# renders its last expression, so that line never displayed anything.)
rows_with_nan = demo_behavior_df[demo_behavior_df[demo_behavior_cols].isna().any(axis=1)]

# Missing-value count per demographic column — the value this cell displays.
demo_behavior_df[demo_behavior_cols].isna().sum()

clnt_tenure_yr      14
clnt_tenure_mnth    14
clnt_age            15
gendr               14
num_accts           14
bal                 14
calls_6_mnth        14
logons_6_mnth       14
dtype: int64

In [124]:
# Discard every client row that lacks at least one demographic value, then take
# an independent copy of what remains.
demo_behavior_df = demo_behavior_df.dropna(subset=demo_behavior_cols, how="any").copy()
# Displayed result: remaining missing values per column.
demo_behavior_df.isna().sum()

client_id               0
clnt_tenure_yr          0
clnt_tenure_mnth        0
clnt_age                0
gendr                   0
num_accts               0
bal                     0
calls_6_mnth            0
logons_6_mnth           0
n_attempts              0
n_events                0
n_errors_per_attempt    0
converted               0
total_duration_sec      0
dtype: int64

In [126]:
demo_behavior_df.shape

(70594, 14)

In [127]:
# Export the per-client demographics + web-behavior dataset in two formats.
# Both paths are relative, so the files land in the current working directory.
# CSV copy; index=False leaves the row index out of the file.
demo_behavior_df.to_csv("demo_behavior_dataset.csv", index=False)

# Parquet copy (for notebooks), likewise written without the row index.
demo_behavior_df.to_parquet("demo_behavior_dataset.parquet", index=False)

# Success Indicators

For this chapter we have two dataframes we are going to use.
- The final_df, created before, which uses the experiment dataframe as the base of a left merge on client_id. Because the web data is aggregated per attempt (client_id, merged_session_id) before merging, it contains one row per attempt — a client with several attempts appears on several rows — showing aggregate values for the rest of our KPIs.
- The following final_per_step_df, which uses the web_data dataframe as the base: it is inner-merged with the experiment roster on client_id and then left-merged with the demographics. It contains all process_steps and attempts per client_id, which makes it useful for generating KPIs on the per-step performance of the two groups.

In [111]:
# Roster columns carried into the per-step table (rename if your arm column differs);
# Variation should be control/test.
exp_cols = ["client_id", "Variation"]

# Step 1: inner join — keep only web events whose client_id appears in the roster.
roster_arms = experiment_clients_df.loc[:, exp_cols]
web_with_arm = clean_web_data_df.merge(roster_arms, how="inner", on="client_id")

# Step 2: left join — add demographics without dropping any event row.
final_per_step_df = web_with_arm.merge(final_demo_df, how="left", on="client_id")


In [112]:
final_per_step_df.shape

(443897, 23)

In [113]:
final_per_step_df["Variation"].isna().sum()

np.int64(126662)

In [116]:
# Only clients that clearly took part in the experiment are of interest, so every
# event row without a Variation is discarded; the result is an independent copy.
final_per_step_df = final_per_step_df.dropna(subset=["Variation"]).copy()

In [117]:
final_per_step_df.head(5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,process_step_num,step_diff,step_anomaly,next_time,duration_seconds,...,merged_session_id,Variation,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
5,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,0,,False,2017-04-15 12:58:03,7.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
6,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,1,1.0,False,2017-04-15 12:58:35,32.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
7,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,2,1.0,False,2017-04-15 13:00:14,99.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
8,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,3,1.0,False,2017-04-15 13:00:34,20.0,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
9,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,4,1.0,False,NaT,,...,1,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0


In [118]:
final_per_step_df.columns

Index(['client_id', 'visitor_id', 'visit_id', 'process_step', 'date_time',
       'process_step_num', 'step_diff', 'step_anomaly', 'next_time',
       'duration_seconds', 'next_step', 'step_success', 'step_outcome',
       'merged_session_id', 'Variation', 'clnt_tenure_yr', 'clnt_tenure_mnth',
       'clnt_age', 'gendr', 'num_accts', 'bal', 'calls_6_mnth',
       'logons_6_mnth'],
      dtype='object')

In [120]:
final_per_step_df["Variation"].value_counts()


Variation
Test       176699
Control    140536
Name: count, dtype: int64

In [121]:
final_per_step_df["Variation"].isna().sum()

np.int64(0)

Before analyzing the KPIs per group, we need to make sure that the comparison is not biased, so we will run some basic demographic checks on the two groups.

In [128]:
final_per_step_df.isna().sum()

client_id                0
visitor_id               0
visit_id                 0
process_step             0
date_time                0
process_step_num         0
step_diff            56269
step_anomaly             0
next_time            56269
duration_seconds     56269
next_step            56269
step_success             0
step_outcome             0
merged_session_id        0
Variation                0
clnt_tenure_yr         100
clnt_tenure_mnth       100
clnt_age               112
gendr                  100
num_accts              100
bal                    100
calls_6_mnth           100
logons_6_mnth          100
dtype: int64

In [129]:
# Demographic columns checked for missing values in the per-step table.
demo_per_step_cols = [
    "clnt_tenure_yr",
    "clnt_tenure_mnth",
    "clnt_age",
    "gendr",
    "num_accts",
    "bal",
    "calls_6_mnth",
    "logons_6_mnth",
]

# True for each event row that lacks at least one demographic value.
missing_demo_mask = final_per_step_df[demo_per_step_cols].isna().any(axis="columns")

# Displayed result: number of such rows.
missing_demo_mask.sum()


np.int64(112)

In [130]:
# The 112 rows with incomplete demographics only add noise, so they are removed;
# the result is an independent copy.
final_per_step_df = final_per_step_df.dropna(subset=demo_per_step_cols, how="any").copy()


In [131]:
final_per_step_df.isna().sum()

client_id                0
visitor_id               0
visit_id                 0
process_step             0
date_time                0
process_step_num         0
step_diff            56252
step_anomaly             0
next_time            56252
duration_seconds     56252
next_step            56252
step_success             0
step_outcome             0
merged_session_id        0
Variation                0
clnt_tenure_yr           0
clnt_tenure_mnth         0
clnt_age                 0
gendr                    0
num_accts                0
bal                      0
calls_6_mnth             0
logons_6_mnth            0
dtype: int64

In [None]:
# Export the cleaned per-step experiment dataset in two formats.
# Both paths are relative, so the files land in the current working directory.
# CSV copy; index=False leaves the row index out of the file.
final_per_step_df.to_csv("final_per_step_dataset.csv", index=False)

# Parquet copy (for notebooks), likewise written without the row index.
final_per_step_df.to_parquet("final_per_step_dataset.parquet", index=False)