In [1]:
import pandas as pd
from sqlalchemy import create_engine,text
import pymysql.cursors
import os
# import getpass
import urllib.parse

In [None]:
# Read the MySQL password from the `mysql_xt` environment variable so it is never stored in the notebook.
pw_raw = os.getenv('mysql_xt')
if pw_raw is None:
    # Fail early with a clear message instead of the opaque TypeError quote_plus(None) would raise.
    raise RuntimeError("Environment variable 'mysql_xt' is not set; it must hold the MySQL password.")
# URL-encode the password so special characters survive inside the connection URL.
pw = urllib.parse.quote_plus(pw_raw)

# Python --> MySQL (load the data into the database)

In [None]:
# Server-level connection URL: no database is selected here (the path after the port is empty).
connection_string = f'mysql+pymysql://root:{pw}@127.0.0.1:3306/'
engine = create_engine(connection_string)

In [None]:
# Create the project database if it does not exist yet.
create_db_stmt = text("CREATE DATABASE IF NOT EXISTS project_ab_testing")
with engine.connect() as db_conn:
    db_conn.execute(create_db_stmt)

In [None]:
# Load df_final_demo.txt and store it as table `client_profile` (replacing any existing table).
client_prof = pd.read_csv('df_final_demo.txt')
client_prof.to_sql(
    'client_profile',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,  # do not write the DataFrame index as a column
)

In [None]:
# Load df_final_experiment_clients.txt and store it as table `experiment_roster`.
experiment_roster = pd.read_csv('df_final_experiment_clients.txt')
experiment_roster.to_sql(
    'experiment_roster',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,
)

In [None]:
# Keep only roster rows without any missing value and store them as a separate table.
experiment_roster_no_null = experiment_roster.dropna()
experiment_roster_no_null.to_sql(
    'experiment_roster_no_null',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,
)

In [None]:
# The web data comes in two files; stack them and drop rows that are exact duplicates.
pt1 = pd.read_csv('df_final_web_data_pt_1.txt')
pt2 = pd.read_csv('df_final_web_data_pt_2.txt')
digital_footprints = pd.concat([pt1, pt2])
# drop_duplicates() keeps the first occurrence, same as filtering with ~duplicated().
digital_footprints = digital_footprints.drop_duplicates()
digital_footprints.to_sql(
    'digital_footprints',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,
)

In [None]:
# Attach each client's experiment assignment to the web events (left join keeps every event),
# ordered by date_time.
df_with_variation = (
    digital_footprints
    .merge(experiment_roster, on='client_id', how='left')
    .sort_values(by="date_time")
)

# Split by the `Variation` column and store each group as its own table.
test_group = df_with_variation[df_with_variation['Variation'] == 'Test']
test_group.to_sql(
    'test_group',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,
)
control_group = df_with_variation[df_with_variation['Variation'] == 'Control']
control_group.to_sql(
    'control_group',
    engine,
    schema='project_ab_testing',
    if_exists='replace',
    index=False,
)

In [None]:
# Load summary_table.csv and store it as table `summary_table`.
summary_table = pd.read_csv('summary_table.csv')
summary_table.to_sql(
    'summary_table',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,
)

In [None]:
# Reload the test/control groups from CSV and write them to MySQL.
# NOTE: this rebinds `test_group` / `control_group` and replaces the tables of the same
# names written by the earlier df_with_variation cell (if_exists='replace').
test_group = pd.read_csv('test_group.csv')
test_group.to_sql(
    'test_group',
    engine,
    schema='project_ab_testing',  # keyword form: pandas 2.1+ deprecates positional args after `con`
    if_exists='replace',
    index=False,
)
control_group = pd.read_csv('control_group.csv')
control_group.to_sql(
    'control_group',
    engine,
    schema='project_ab_testing',
    if_exists='replace',
    index=False,
)

# MySQL --> Python (read tables back into pandas)

In [None]:
# Rebuild the engine so that it connects directly to the project database.
db = 'project_ab_testing'
connection_string = f'mysql+pymysql://root:{pw}@127.0.0.1:3306/{db}'
engine = create_engine(connection_string)

In [None]:
# Pull the whole `df_with_variation_clean` table into a DataFrame.
query = text('SELECT * FROM df_with_variation_clean')
with engine.connect() as connection:
    result = connection.execute(query)
    fetched_rows = result.all()  # materialise every row while the connection is open
df_with_variation_clean = pd.DataFrame(fetched_rows)

In [None]:
# Number of distinct values in each column of df_with_variation_clean.
df_with_variation_clean.nunique()

In [None]:
# Pull the whole `each_step_count` table into a DataFrame.
query = text('SELECT * FROM each_step_count')
with engine.connect() as connection:
    result = connection.execute(query)
    fetched_rows = result.all()  # materialise every row while the connection is open
each_step_count = pd.DataFrame(fetched_rows)

In [None]:
# Display each_step_count. Author's note: every visit_id is unique here (69205 rows).
each_step_count

In [None]:
# Pull the whole `number_visits_per_client` table into a DataFrame.
query = text('SELECT * FROM number_visits_per_client')
with engine.connect() as connection:
    result = connection.execute(query)
    fetched_rows = result.all()  # materialise every row while the connection is open
number_visits_per_client = pd.DataFrame(fetched_rows)

In [None]:
# Display the number_visits_per_client frame read from MySQL.
number_visits_per_client

In [None]:
# Pull the whole `last_process_step` table into a DataFrame.
query = text('SELECT * FROM last_process_step')
with engine.connect() as connection:
    result = connection.execute(query)
    fetched_rows = result.all()  # materialise every row while the connection is open
last_process_step = pd.DataFrame(fetched_rows)

In [None]:
# Display the last_process_step frame read from MySQL.
last_process_step

In [None]:
# Left-join the last step of each visit onto the per-visit step counts
# (every row of each_step_count is kept).
all_data_for_step = each_step_count.merge(
    last_process_step,
    how='left',
    on=['visit_id', 'visitor_id'],
)

In [None]:
# Display the merged per-visit table (the optional sort by client_id is left disabled).
all_data_for_step #.sort_values('client_id')

In [None]:
# Number of distinct values in each column of all_data_for_step.
all_data_for_step.nunique()

In [None]:
# Inner-join the per-client visit counts with the per-visit step data, then order by client_id.
all_visit_data = (
    number_visits_per_client
    .merge(all_data_for_step, how='inner', on=['client_id', 'variation'])
    .sort_values('client_id')
)

In [None]:
# Flag column: 1 when the visit's last_process_step is "confirm", otherwise 0.
all_visit_data['last_step_confirm'] = all_visit_data['last_process_step'].apply(
    lambda last_step: int(last_step == 'confirm')
)

In [None]:
# add column "step_hierarchy"
def highest_step(row):
    """Return the furthest funnel step that occurred at least once in a visit.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide `confirm_count`, `step_2_count`, `step_1_count` and
        `start_count`. A `step_3_count` entry is optional; when present it is
        ranked between `step_2` and `confirm`, matching the step list
        ('start', 'step_1', 'step_2', 'step_3') used for the stop-rate
        calculation later in this notebook.

    Returns
    -------
    str or None
        'confirm', 'step_3', 'step_2', 'step_1' or 'start'; None when every
        count is zero (or not greater than zero).
    """
    # Ordered from the end of the funnel to its beginning: the first hit wins.
    steps_from_last = ('confirm', 'step_3', 'step_2', 'step_1', 'start')
    for step in steps_from_last:
        column = f'{step}_count'
        # step_3_count may not exist in the source table; skip it instead of raising.
        if step == 'step_3' and column not in row:
            continue
        if row[column] > 0:
            return step
    return None

# Apply highest_step to every row (axis=1) and store the result as `step_hierarchy`.
all_visit_data['step_hierarchy'] = all_visit_data.apply(highest_step, axis=1)

In [None]:
# Flag column: 1 when step_hierarchy is "confirm" (task completed), otherwise 0.
all_visit_data['if_confirm'] = all_visit_data['step_hierarchy'].apply(
    lambda reached: int(reached == 'confirm')
)

In [None]:
# Display all_visit_data with the added flag columns (the .head(50) preview is left disabled).
all_visit_data #.head(50)

In [None]:
# Export the all_visit_data frame to CSV.

# Build an absolute path inside the current working directory
current_directory = os.getcwd()
file_name = "all_visit_steps.csv"
file_path = os.path.join(current_directory, file_name)

# index=False: the DataFrame index is not written to the file
all_visit_data.to_csv(file_path, index=False)

# The message now names the table that is actually exported (it previously said "Test group table").
print(f"All visit data has been exported to {file_path}")

In [None]:
# Report whether any fully identical rows exist in all_visit_data.
has_duplicates = all_visit_data.duplicated().any()

print("row duplicated exist." if has_duplicates else "row duplicated not exist.")

In [None]:
# Rows of all_visit_data whose variation is 'Test'.
test_group_step = all_visit_data.loc[all_visit_data['variation'].eq('Test')]
test_group_step

In [None]:
# Number of distinct values in each column of the test group.
test_group_step.nunique()

In [None]:
# Rows of all_visit_data whose variation is 'Control'.
control_group_step = all_visit_data.loc[all_visit_data['variation'].eq('Control')]
control_group_step

In [None]:
# Number of distinct values in each column of the control group.
control_group_step.nunique()

In [None]:
# Store the combined table and both per-group tables in MySQL, replacing existing tables.
# `schema` is passed by keyword: pandas 2.1+ deprecates positional arguments after `con`.
all_visit_data.to_sql(
    'all_visit_data', engine, schema='project_ab_testing', if_exists='replace', index=False
)
test_group_step.to_sql(
    'test_group_step', engine, schema='project_ab_testing', if_exists='replace', index=False
)
control_group_step.to_sql(
    'control_group_step', engine, schema='project_ab_testing', if_exists='replace', index=False
)

# Problems encountered:
- Two different `process_step` values can share the same timestamp.
- The same `visitor_id`/`visit_id` can appear under two different `client_id` values.


In [None]:
# Example visit: two different process_step values with the same timestamp.
all_visit_data.loc[all_visit_data['visit_id'].eq('377986493_6391607481_598681')]

In [None]:
# Example visit: same visitor_id/visit_id under two client_id values (same variation).
all_visit_data.loc[all_visit_data['visit_id'].eq('30714723_53564046699_567312')]

In [None]:
# Example visit: same visitor_id/visit_id under two client_id values (different variation).
all_visit_data.loc[all_visit_data['visit_id'].eq('92588242_2876965505_25554')]

In [None]:
# Count rows per visit_id in each_step_count and list every id that occurs more than once.
visit_id_counts1 = each_step_count['visit_id'].value_counts()
repeated_counts1 = visit_id_counts1[visit_id_counts1 > 1]
duplicate_visit_ids1 = repeated_counts1.index.tolist()

for visit_id, num_occurrences1 in repeated_counts1.items():
    print(f"Visit ID: {visit_id}, Num Occurrences: {num_occurrences1}")


In [None]:
# Count rows per visit_id in last_process_step and list every id that occurs more than once.
visit_id_counts2 = last_process_step['visit_id'].value_counts()
repeated_counts2 = visit_id_counts2[visit_id_counts2 > 1]
duplicate_visit_ids2 = repeated_counts2.index.tolist()

for visit_id, num_occurrences2 in repeated_counts2.items():
    print(f"Visit ID: {visit_id}, Num Occurrences: {num_occurrences2}")

- For the bounce rate: how should a "bounce" be defined?
   - by the highest step reached, or
   - by the last step before leaving (I used this one for the calculation, but I am not sure it is right)
- Total number of visits: should it count all visits or all clients? (Probably all visits.)

In [None]:
# Test group: how many rows end on each last_process_step value.
test_group_step['last_process_step'].value_counts()

In [None]:
# Test group: how many rows fall under each step_hierarchy value.
test_group_step['step_hierarchy'].value_counts()

In [None]:
# Control group: how many rows end on each last_process_step value.
control_group_step['last_process_step'].value_counts()

In [None]:
# Control group: how many rows fall under each step_hierarchy value.
control_group_step['step_hierarchy'].value_counts()

In [None]:
# Distinct client_id count per group.
num_clients_test_group = test_group_step['client_id'].nunique()
num_clients_control_group = control_group_step['client_id'].nunique()

print('Total number of clients for test group :', num_clients_test_group)
print('Total number of clients for control group :', num_clients_control_group)

# Calculation of completion rate

In [None]:
# Distinct visit_id count in the test group (missing ids are not counted).
total_num_visit_test_group = len(test_group_step['visit_id'].dropna().unique())
print('Total number of visits for test group :', total_num_visit_test_group)

In [None]:
# Distinct visit_id count in the control group.
# The original assigned `control_num_visit_test_group` but printed (and later cells use)
# `total_num_visit_control_group`, which raised NameError on a fresh kernel.
total_num_visit_control_group = control_group_step['visit_id'].nunique()
# Keep the old name as an alias in case anything still refers to it.
control_num_visit_test_group = total_num_visit_control_group
print('Total number of visits for control group :', total_num_visit_control_group)

In [None]:
# Test group: count the rows flagged if_confirm == 1 ...
test_confirm_mask = test_group_step['if_confirm'] == 1
confirm_count_test = test_group_step.loc[test_confirm_mask, 'if_confirm'].sum()
print("Confirm count for test group:", confirm_count_test)

# ... and express them as a percentage of the group's distinct visits.
confirm_rate_test = (confirm_count_test / total_num_visit_test_group) * 100
print("Completion rate of test group:", round(confirm_rate_test,2), "%")

In [None]:
# Control group: count the rows flagged if_confirm == 1 ...
control_confirm_mask = control_group_step['if_confirm'] == 1
confirm_count_control = control_group_step.loc[control_confirm_mask, 'if_confirm'].sum()
print("Confirm count for control group:", confirm_count_control)

# ... and express them as a percentage of the group's distinct visits.
confirm_rate_control = (confirm_count_control / total_num_visit_control_group) * 100
print("Completion rate of control group:", round(confirm_rate_control,2), "%")

# Calculation of the "stop step" rate
- The step at which customers stop and do not go any further (per visit)

In [None]:
# Test group: number of rows whose last_process_step equals each step.
steps = ['start', 'step_1', 'step_2', 'step_3']
counts = {}

for step in steps:
    ended_on_step = test_group_step['last_process_step'] == step
    counts[step] = len(test_group_step[ended_on_step])
    print(f"{step.upper()} count in test group:", counts[step])

# Individual names for the four counts, in the same order as `steps`.
start_count_test, step_1_count_test, step_2_count_test, step_3_count_test = (
    counts[name] for name in steps
)

In [None]:
# Test group: stop rate per step = rows ending on that step / distinct visits, as a percentage.
stop_rates = {
    step: (counts[step] / total_num_visit_test_group) * 100
    for step in steps
}

for step, stop_rate in stop_rates.items():
    print(f"Stop rate for the '{step}' step in test group:", round(stop_rate, 2), "%")

In [None]:
# Control group: number of rows whose last_process_step equals each step.
counts2={}
steps2 = ['start', 'step_1', 'step_2', 'step_3']

for step in steps2:
    counts2[step]  = control_group_step[control_group_step['last_process_step'] == step].shape[0]
    # Label fixed: these are control-group counts (the message previously said "test group").
    print(f"{step.upper()} count in control group:", counts2[step])

# Individual names for the four counts.
start_count_control = counts2['start']
step_1_count_control = counts2['step_1']
step_2_count_control = counts2['step_2']
step_3_count_control = counts2['step_3']

In [None]:
# Control group: stop rate per step = rows ending on that step / distinct visits, as a percentage.
stop_rates2 = {
    step: (counts2[step] / total_num_visit_control_group) * 100
    for step in steps2
}

for step, stop_rate in stop_rates2.items():
    print(f"Stop rate for the '{step}' step in control group:", round(stop_rate, 2), "%")

# Bounce rate
- The number of customers whose only step is "start" (per visit)

In [None]:
# Reload the per-visit table from all_visit_steps.csv in the working directory.
all_visit_steps=pd.read_csv('all_visit_steps.csv')

In [None]:
# Display the frame loaded from all_visit_steps.csv.
all_visit_steps

In [None]:
# Remove the helper columns; step_hierarchy is merged back in from summary_table further down.
columns_to_drop = [
    'visitor_id',
    'last_step_confirm',
    'step_hierarchy',
    'if_confirm',
]
all_visit_steps = all_visit_steps.drop(columns_to_drop, axis=1)

In [None]:
# Display all_visit_steps after the column drop.
all_visit_steps

In [None]:
# Load summary_table.csv and display it.
summary_table=pd.read_csv('summary_table.csv')
summary_table

In [None]:
# Bring summary_table's step_hierarchy onto every visit row via client_id (default inner join).
# NOTE(review): if client_id is not unique in summary_table, this multiplies rows; verify.
client_hierarchy = summary_table[['client_id', 'step_hierarchy']]
df_merged = all_visit_steps.merge(client_hierarchy, on='client_id')

In [None]:
# Rebind all_visit_steps to an independent copy of the merged frame.
all_visit_steps=df_merged.copy()

In [None]:
# Export the all_visit_steps frame to CSV.

# Build an absolute path inside the current working directory
current_directory = os.getcwd()
file_name = "all_visit_steps_final.csv"
file_path = os.path.join(current_directory, file_name)

# index=False: the DataFrame index is not written to the file
all_visit_steps.to_csv(file_path, index=False)

# The message now names the table that is actually exported (it previously said "Test group table").
print(f"All visit steps table has been exported to {file_path}")

In [2]:
# Load the exported all_visit_steps_final.csv; the cells below only need this file.
all_visit_steps=pd.read_csv('all_visit_steps_final.csv')

In [3]:
# Split the reloaded table into the two experiment groups by `variation`.
variation_col = all_visit_steps['variation']
test_group_step = all_visit_steps.loc[variation_col.eq('Test')]
control_group_step = all_visit_steps.loc[variation_col.eq('Control')]

In [4]:
# Distinct visit_id count per group; these are the denominators of the bounce rates below.
total_num_visit_test_group = test_group_step['visit_id'].nunique()
print('Total number of visits for test group :',total_num_visit_test_group)
total_num_visit_control_group = control_group_step['visit_id'].nunique()
# Label fixed: this line reports the control group (it previously said "test group").
print('Total number of visits for control group :',total_num_visit_control_group)

Total number of visits for test group : 37136
Total number of visits for test group : 32189


In [None]:
# Display the test-group rows.
test_group_step

In [5]:
# Test group: rows with step_hierarchy == 1 are counted as bounces.
# NOTE(review): presumably 1 encodes "only reached start" in summary_table; TODO confirm.
only_start_test = test_group_step.loc[test_group_step['step_hierarchy'] == 1]
num_bounce_test = only_start_test.shape[0]
# Bounce rate as a percentage of the group's distinct visits.
bounce_rate_test = (num_bounce_test / total_num_visit_test_group) * 100
print("Bounce rate of test group:", round(bounce_rate_test,2), "%")

Bounce rate of test group: 7.48 %


In [6]:
# Control group: rows with step_hierarchy == 1 are counted as bounces.
# NOTE(review): presumably 1 encodes "only reached start" in summary_table; TODO confirm.
only_start_control = control_group_step.loc[control_group_step['step_hierarchy'] == 1]
num_bounce_control = only_start_control.shape[0]
# Bounce rate as a percentage of the group's distinct visits.
bounce_rate_control = (num_bounce_control / total_num_visit_control_group) * 100
print("Bounce rate of control group:", round(bounce_rate_control,2), "%")

Bounce rate of control group: 11.26 %
