In [1]:
import pandas as pd

# Read both halves of the web-event log (comma-delimited text files)
df1, df2 = (
    pd.read_csv(part_path, sep=",")
    for part_path in ("df_final_web_data_pt_1.txt", "df_final_web_data_pt_2.txt")
)

In [2]:
# A cell only renders its last bare expression, so a lone `df1.head()` above
# `df2.head()` is computed and thrown away. Display both previews explicitly.
display(df1.head(), df2.head())

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,763412,601952081_10457207388,397475557_40440946728_419634,confirm,2017-06-06 08:56:00
1,6019349,442094451_91531546617,154620534_35331068705_522317,confirm,2017-06-01 11:59:27
2,6019349,442094451_91531546617,154620534_35331068705_522317,step_3,2017-06-01 11:58:48
3,6019349,442094451_91531546617,154620534_35331068705_522317,step_2,2017-06-01 11:58:08
4,6019349,442094451_91531546617,154620534_35331068705_522317,step_1,2017-06-01 11:57:58


In [3]:
# Stack the two parts into one frame, parse date_time into a datetime dtype,
# and order the events by visit_id and then timestamp so each visit's
# sequence of steps is contiguous and chronological.
df_web = (
    pd.concat([df1, df2], ignore_index=True)
    .assign(date_time=lambda events: pd.to_datetime(events["date_time"]))
    .sort_values(by=["visit_id", "date_time"])
)

In [4]:
# Discard rows that repeat an earlier row in every column
df_web.drop_duplicates(inplace=True)

# Elapsed time since the previous event of the same visit
# (NaT for the first event of each visit)
previous_time = df_web.groupby("visit_id")["date_time"].shift()
df_web["time_diff"] = df_web["date_time"] - previous_time

# Ordered integer code for each textual process_step label
step_order_map = {
    label: position
    for position, label in enumerate(["start", "step_1", "step_2", "step_3", "confirm"])
}
df_web["step_order"] = df_web["process_step"].map(step_order_map)

# First and last timestamp per visit, plus the span between them
visit_bounds = df_web.groupby("visit_id")["date_time"].agg(["min", "max"])
journey_stats = visit_bounds.assign(duration=visit_bounds["max"] - visit_bounds["min"])

In [5]:
# Quick sanity checks: null counts per column, then a peek at the event
# table and at the per-visit journey durations
null_counts = df_web.isnull().sum()
print("Missing values per column:", null_counts, sep="\n")

print("\nFirst five rows:")
display(df_web.head())

print("\nJourney duration sample:")
display(journey_stats.head())

Missing values per column:
client_id            0
visitor_id           0
visit_id             0
process_step         0
date_time            0
time_diff       158095
step_order           0
dtype: int64

First five rows:


Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,time_diff,step_order
108614,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,NaT,4
108613,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,0 days 00:00:52,4
447951,9056452,306992881_89423906595,1000165_4190026492_760066,start,2017-06-04 01:07:29,NaT,0
447950,9056452,306992881_89423906595,1000165_4190026492_760066,step_1,2017-06-04 01:07:32,0 days 00:00:03,1
447949,9056452,306992881_89423906595,1000165_4190026492_760066,step_2,2017-06-04 01:07:56,0 days 00:00:24,2



Journey duration sample:


Unnamed: 0_level_0,min,max,duration
visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100012776_37918976071_457913,2017-04-26 13:22:17,2017-04-26 13:23:09,0 days 00:00:52
1000165_4190026492_760066,2017-06-04 01:07:29,2017-06-04 01:09:50,0 days 00:02:21
100019538_17884295066_43909,2017-04-09 16:20:56,2017-04-09 16:24:58,0 days 00:04:02
100022086_87870757897_149620,2017-05-23 20:44:01,2017-05-23 20:47:01,0 days 00:03:00
100030127_47967100085_936361,2017-03-22 11:07:49,2017-03-22 11:07:49,0 days 00:00:00


In [6]:
# Change in step_order relative to the previous event of the same visit
# (NaN for the first event of each visit)
steps_by_visit = df_web.groupby('visit_id')['step_order']
df_web['step_diff'] = steps_by_visit.diff()

preview_columns = ['visit_id', 'process_step', 'step_order', 'step_diff']
df_web[preview_columns].head(10)

Unnamed: 0,visit_id,process_step,step_order,step_diff
108614,100012776_37918976071_457913,confirm,4,
108613,100012776_37918976071_457913,confirm,4,0.0
447951,1000165_4190026492_760066,start,0,
447950,1000165_4190026492_760066,step_1,1,1.0
447949,1000165_4190026492_760066,step_2,2,1.0
447948,1000165_4190026492_760066,step_3,3,1.0
447947,1000165_4190026492_760066,confirm,4,1.0
240562,100019538_17884295066_43909,start,0,
240561,100019538_17884295066_43909,step_1,1,1.0
240560,100019538_17884295066_43909,step_2,2,1.0


In [7]:
# Rows whose step_order dropped relative to the previous event in the visit
backward_rows = df_web[df_web['step_diff'] < 0]

# How many such rows exist
print("Number of rows with backward steps (errors):")
print(len(backward_rows))

# A few examples to eyeball
print("\nSample rows with backward steps:")
print(backward_rows[['visit_id', 'process_step', 'step_order', 'step_diff']].head())

Number of rows with backward steps (errors):
63798

Sample rows with backward steps:
                            visit_id process_step  step_order  step_diff
240559   100019538_17884295066_43909       step_1           1       -1.0
240557   100019538_17884295066_43909        start           0       -1.0
603155  100034012_10293842356_627828        start           0       -1.0
27510   100037962_47432393712_705583        start           0       -1.0
1609    100057941_88477660212_944512       step_1           1       -2.0


In [8]:
# A row is an error when the user moved backward, i.e. step_diff is negative.
# NaN step_diff (first event of a visit) compares as False.
df_web['is_error'] = df_web['step_diff'].lt(0)

# Show the new flag next to the columns it was derived from
error_preview_columns = ['visit_id', 'process_step', 'step_order', 'step_diff', 'is_error']
print("Sample rows with is_error column:")
print(df_web[error_preview_columns].head(10))

Sample rows with is_error column:
                            visit_id process_step  step_order  step_diff  \
108614  100012776_37918976071_457913      confirm           4        NaN   
108613  100012776_37918976071_457913      confirm           4        0.0   
447951     1000165_4190026492_760066        start           0        NaN   
447950     1000165_4190026492_760066       step_1           1        1.0   
447949     1000165_4190026492_760066       step_2           2        1.0   
447948     1000165_4190026492_760066       step_3           3        1.0   
447947     1000165_4190026492_760066      confirm           4        1.0   
240562   100019538_17884295066_43909        start           0        NaN   
240561   100019538_17884295066_43909       step_1           1        1.0   
240560   100019538_17884295066_43909       step_2           2        1.0   

        is_error  
108614     False  
108613     False  
447951     False  
447950     False  
447949     False  
447948     False  
447947     False  
240562     False  
240561     False  
240560     False  

In [9]:
# A visit counts as completed when at least one of its events is 'confirm';
# the flag is broadcast to every row of that visit.
confirmed_visit_ids = df_web.loc[df_web['process_step'] == 'confirm', 'visit_id']
df_web['completed'] = df_web['visit_id'].isin(confirmed_visit_ids)

df_web[['visit_id', 'process_step', 'completed']].head(10)

Unnamed: 0,visit_id,process_step,completed
108614,100012776_37918976071_457913,confirm,True
108613,100012776_37918976071_457913,confirm,True
447951,1000165_4190026492_760066,start,True
447950,1000165_4190026492_760066,step_1,True
447949,1000165_4190026492_760066,step_2,True
447948,1000165_4190026492_760066,step_3,True
447947,1000165_4190026492_760066,confirm,True
240562,100019538_17884295066_43909,start,True
240561,100019538_17884295066_43909,step_1,True
240560,100019538_17884295066_43909,step_2,True


In [10]:
# Completion rate = visits containing a 'confirm' event / all visits
reached_confirm = df_web['process_step'] == 'confirm'
completed_visits = df_web.loc[reached_confirm, 'visit_id'].nunique()
total_visits = df_web['visit_id'].nunique()

completion_rate = completed_visits / total_visits
completion_rate

0.5681773617128941

In [11]:
# Mean time_diff per process_step, ordered by step label.
# time_diff is the gap since the previous event of the same visit, so each
# value is the average time taken to ARRIVE at that step.
# NOTE(review): if "time spent on a step" should mean time until the NEXT
# event, this needs the following event's timestamp instead - confirm intent.
time_by_step = df_web.groupby('process_step')['time_diff']
avg_time_per_step = time_by_step.mean().sort_index()
avg_time_per_step

process_step
confirm   0 days 00:02:11.229566330
start     0 days 00:02:41.297419731
step_1    0 days 00:00:39.611460265
step_2    0 days 00:00:45.912095900
step_3    0 days 00:01:40.029564193
Name: time_diff, dtype: timedelta64[ns]

In [12]:
# Flag rows where the user moved backward (negative step_diff)
df_web['is_error'] = df_web['step_diff'].lt(0)

# Error rate = visits with at least one backward step / all visits
error_visits = df_web.loc[df_web['is_error'], 'visit_id'].nunique()
total_visits = df_web['visit_id'].nunique()

error_rate = error_visits / total_visits
error_rate

0.2559789999683734

In [13]:
# Read the table that assigns each client_id to an experiment Variation
experiment_path = "df_final_experiment_clients.txt"
df_experiment = pd.read_csv(experiment_path, sep=",")

df_experiment.head()

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


In [16]:
# Attach each client's experiment group (Variation) to every web event.
#
# The cell reassigns df_web, so it must be safe to run more than once: a
# Variation column left by an earlier run is dropped first. Without that, a
# second merge would produce Variation_x / Variation_y and every later
# groupby('Variation') would raise a KeyError.
#
# how='left' keeps events whose client_id is missing from df_experiment;
# those rows get NaN in Variation.
df_web = (
    df_web.drop(columns='Variation', errors='ignore')
    .merge(df_experiment, on='client_id', how='left')
)
df_web.head()

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,time_diff,step_order,step_diff,is_error,completed,Variation
0,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:22:17,NaT,4,,False,True,Test
1,3561384,451664975_1722933822,100012776_37918976071_457913,confirm,2017-04-26 13:23:09,0 days 00:00:52,4,0.0,False,True,Test
2,9056452,306992881_89423906595,1000165_4190026492_760066,start,2017-06-04 01:07:29,NaT,0,,False,True,
3,9056452,306992881_89423906595,1000165_4190026492_760066,step_1,2017-06-04 01:07:32,0 days 00:00:03,1,1.0,False,True,
4,9056452,306992881_89423906595,1000165_4190026492_760066,step_2,2017-06-04 01:07:56,0 days 00:00:24,2,1.0,False,True,


In [17]:
# Per Variation: distinct visits with a 'confirm' event / distinct visits overall
visits_per_variation = df_web.groupby('Variation')['visit_id'].nunique()
confirmed_per_variation = (
    df_web.loc[df_web['process_step'] == 'confirm']
    .groupby('Variation')['visit_id']
    .nunique()
)
completion_rate_by_variation = confirmed_per_variation / visits_per_variation

completion_rate_by_variation

Variation
Control    0.498493
Test       0.585173
Name: visit_id, dtype: float64

In [18]:
# Mean time_diff for every (Variation, process_step) pair, reshaped so that
# rows are Variations and columns are steps.
# time_diff is the gap since the previous event of the same visit, so each
# cell is the average time taken to arrive at that step.
# NOTE(review): confirm this matches the intended "time spent on each step".
mean_gap_by_group = df_web.groupby(['Variation', 'process_step'])['time_diff'].mean()
avg_time_by_variation = mean_gap_by_group.unstack()

avg_time_by_variation

process_step,confirm,start,step_1,step_2,step_3
Variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Control,0 days 00:02:09.770632239,0 days 00:02:58.362727087,0 days 00:00:43.180969957,0 days 00:00:38.838146459,0 days 00:01:34.159650694
Test,0 days 00:02:09.884894028,0 days 00:02:33.498254799,0 days 00:00:37.790430247,0 days 00:00:48.272585264,0 days 00:01:37.356254658


In [19]:
# Per Variation: distinct visits with at least one backward step / distinct visits overall
visits_by_variation = df_web.groupby('Variation')['visit_id'].nunique()
error_visits_by_variation = (
    df_web.loc[df_web['is_error']]
    .groupby('Variation')['visit_id']
    .nunique()
)
error_rate_by_variation = error_visits_by_variation / visits_by_variation

error_rate_by_variation

Variation
Control    0.207649
Test       0.272296
Name: visit_id, dtype: float64

In [21]:
!pip install statsmodels



In [38]:
import statsmodels.api as sm

ModuleNotFoundError: No module named 'statsmodels'