# Importing Libraries


In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt 
import seaborn as sns 

import statsmodels.api as sm

# Data Cleaning


In [2]:
# Read the experiment log and the user-to-country lookup from the working directory
countries = pd.read_csv("countries.csv")
test = pd.read_csv("test.csv")

In [3]:
# Preview the first five rows of the experiment log
test.head(n=5)

Unnamed: 0,id,time,con_treat,page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [4]:
# Preview the first five rows of the country lookup
countries.head(n=5)

Unnamed: 0,id,country
0,834778,UK
1,928468,US
2,822059,UK
3,711597,UK
4,710616,UK


In [5]:
# Rename columns by name rather than by position, so that a change in the
# CSV's column order cannot silently attach the wrong label to a column.
# `converted` already has its final name and is left untouched.
test = test.rename(
    columns={
        "id": "user_id",
        "time": "timestamp",
        "con_treat": "group",
        "page": "landing_page",
    }
)
test.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,11:48.6,control,old_page,0
1,804228,01:45.2,control,old_page,0
2,661590,55:06.2,treatment,new_page,0
3,853541,28:03.1,treatment,new_page,0
4,864975,52:26.2,control,old_page,1


In [6]:
# Row count of the raw experiment log
len(test)

294478

In [7]:
# Count of distinct user ids in the raw log
test["user_id"].nunique()

290584

In [8]:
# general info: prints the column names, non-null counts, dtypes and memory usage
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294478 entries, 0 to 294477
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294478 non-null  int64 
 1   timestamp     294478 non-null  object
 2   group         294478 non-null  object
 3   landing_page  294478 non-null  object
 4   converted     294478 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 11.2+ MB


In [9]:
# Count of missing values per column
test.isnull().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [10]:
# Compare how many rows are labelled "treatment" with how many show "new_page".
# This is only a net difference: mismatches in opposite directions cancel out.
treatment_rows = test.loc[test["group"].eq("treatment")]
new_page_rows = test.loc[test["landing_page"].eq("new_page")]
number_treat = len(treatment_rows)
number_new_page = len(new_page_rows)
difference = number_treat - number_new_page

In [11]:
# Net surplus of "treatment" rows over "new_page" rows
difference


37

There is a mismatch between the number of users assigned to the treatment group and the number who landed on the new page. This might indicate a problem with the data and needs further exploration.

In [12]:
# Inspect the treatment-group rows that were nevertheless served the old page
is_treatment = test["group"].eq("treatment")
saw_old_page = test["landing_page"].eq("old_page")
test.loc[is_treatment & saw_old_page]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
308,857184,34:59.8,treatment,old_page,0
327,686623,26:40.7,treatment,old_page,0
357,856078,29:30.4,treatment,old_page,0
685,666385,11:54.8,treatment,old_page,0
713,748761,47:44.4,treatment,old_page,0
...,...,...,...,...,...
293773,688144,34:50.5,treatment,old_page,1
293817,876037,15:09.0,treatment,old_page,1
293917,738357,37:55.7,treatment,old_page,0
294014,813406,25:33.2,treatment,old_page,0


In [13]:
# Rows whose group label disagrees with the page that was actually served
treated_on_old = test["group"].eq("treatment") & test["landing_page"].eq("old_page")
control_on_new = test["group"].eq("control") & test["landing_page"].eq("new_page")
test_mismatch = test.loc[treated_on_old | control_on_new]

number_mismatch = len(test_mismatch)

# Share of the raw log that is mismatched, in percent
percent_mismatch = round(number_mismatch / len(test) * 100, 2)

print(number_mismatch)  # Number of mismatched rows

print(percent_mismatch)  # Percent of mismatched rows

3893
1.32


As you can see, there are 3893 rows where treatment does not match with new_page or control does not match with old_page. For these rows we cannot be sure whether the user truly received the new or the old page.

In [14]:
# Keep only the rows where the group label and the served page agree
treated_on_new = test["group"].eq("treatment") & test["landing_page"].eq("new_page")
control_on_old = test["group"].eq("control") & test["landing_page"].eq("old_page")
df = test.loc[treated_on_new | control_on_old]

len(df)

290585

In [15]:
# Sanity check: count rows where "is treatment" and "saw new page" disagree - this should be 0
is_treatment = df["group"] == "treatment"
saw_new_page = df["landing_page"] == "new_page"
len(df[is_treatment != saw_new_page])

0

In [16]:
# Re-run the mismatch check on the cleaned frame `df`; both numbers should be 0.
test_mismatch = df[(df["group"] == "treatment") & (df["landing_page"] == "old_page")
               | (df["group"] == "control") & (df["landing_page"] == "new_page")]

number_mismatch = test_mismatch.shape[0]

# The share is taken over the frame being checked (`df`), not the raw `test`
# frame that the earlier mismatch cell used as its denominator.
percent_mismatch = round(number_mismatch / len(df) * 100, 2)

print(number_mismatch) # Number of mismatched rows

print(percent_mismatch) # Percent of mismatched rows

0
0.0


In [17]:
# Count of distinct user ids in the cleaned frame
df["user_id"].nunique()

290584

In [18]:
# Rows in excess of the number of distinct user ids, i.e. repeated ids
df.shape[0] - df["user_id"].nunique()

1

In [19]:
# Remove the repeated user id, keeping its first occurrence
df = df.drop_duplicates(subset="user_id", keep="first")

In [20]:
# Double-check that the repeated id is gone: this should now be 0
df.shape[0] - df["user_id"].nunique()

0

#  Probability

In [21]:
# Overall conversion rate in percent:
# the probability of an individual converting regardless of the page they receive
df["converted"].mean() * 100

11.959708724499627

In [22]:
# Given that an individual was in the control group, what is the probability they converted?
# Given that an individual was in the treatment group, what is the probability they converted?

# user_id is an identifier, not a quantity, so it is stored as a string; the
# later merge with `countries` also joins on string ids.
df["user_id"] = df["user_id"].astype(str)

# Average only the `converted` column. A bare groupby(...).mean() also tries to
# average the string columns, which raises a TypeError on pandas >= 2.0.
df.groupby("group")[["converted"]].mean() * 100

Unnamed: 0_level_0,converted
group,Unnamed: 1_level_1
control,12.03863
treatment,11.880807


In [23]:
# What is the probability that an individual received the new page?
page_share_percent = df["landing_page"].value_counts(normalize=True) * 100
page_share_percent.to_frame()

Unnamed: 0,landing_page
new_page,50.006194
old_page,49.993806


The probability that an individual received the new page is 50%

The probability of an individual converting regardless of the page they receive is 11.96%

Given that an individual was in the control group, the probability they converted is 12.04%

Given that an individual was in the treatment group, the probability they converted is 11.88%

# A/B Test

In [24]:
# Row masks reused for the four counts below
has_converted = df["converted"] == 1
on_old_page = df["landing_page"] == "old_page"
on_new_page = df["landing_page"] == "new_page"

# Unique users who converted, per page
convert_old = df.loc[has_converted & on_old_page, "user_id"].nunique()
convert_new = df.loc[has_converted & on_new_page, "user_id"].nunique()

# Unique users who visited each page, whether or not they converted
n_old = df.loc[on_old_page, "user_id"].nunique()
n_new = df.loc[on_new_page, "user_id"].nunique()

In [25]:
# Two-sample z-test for proportions, with the new page listed first.
# alternative="larger" is one-sided: it tests whether the first proportion
# (new page) exceeds the second (old page).
successes = np.array([convert_new, convert_old])
trials = np.array([n_new, n_old])
z_score, p_value = sm.stats.proportions_ztest(count=successes, nobs=trials, alternative="larger")

In [26]:
# Display the z-score and p-value computed by the proportions z-test above
z_score, p_value 

(-1.3109241984234394, 0.9050583127590245)

Using the test statistic and p-value, we reach the same conclusion: we cannot reject the null hypothesis (the new page does not convert better than the old page).

# Regression

Since each row is either a conversion or no conversion, we will use logistic regression to see if there is a significant difference in conversion based on which page a customer receives. However, we first need to create in df a column for the intercept, and a dummy variable column for which page each user received.

In [27]:
# Add a constant column of ones to serve as the regression intercept
df = df.assign(intercept=1)
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,intercept
0,851104,11:48.6,control,old_page,0,1
1,804228,01:45.2,control,old_page,0,1
2,661590,55:06.2,treatment,new_page,0,1
3,853541,28:03.1,treatment,new_page,0,1
4,864975,52:26.2,control,old_page,1,1


In [28]:
# Create ab_page column, which is 1 when an individual receives the treatment and 0 if control.
# A vectorized comparison replaces the per-row Python lambda; the explicit
# int64 cast keeps the same integer dtype the lambda produced.
df["ab_page"] = (df["group"] == "treatment").astype("int64")
df.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,intercept,ab_page
0,851104,11:48.6,control,old_page,0,1,0
1,804228,01:45.2,control,old_page,0,1,0
2,661590,55:06.2,treatment,new_page,0,1,1
3,853541,28:03.1,treatment,new_page,0,1,1
4,864975,52:26.2,control,old_page,1,1,0


In [29]:
# Logistic regression of conversion on the page indicator plus an intercept
predictors = df[["intercept", "ab_page"]]
model = sm.Logit(endog=df["converted"], exog=predictors)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.366118
         Iterations 6


0,1,2,3
Dep. Variable:,converted,No. Observations:,290584.0
Model:,Logit,Df Residuals:,290582.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 25 Dec 2023",Pseudo R-squ.:,8.077e-06
Time:,11:01:49,Log-Likelihood:,-106390.0
converged:,True,LL-Null:,-106390.0
Covariance Type:,nonrobust,LLR p-value:,0.1899

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-1.9888,0.008,-246.669,0.000,-2.005,-1.973
ab_page,-0.0150,0.011,-1.311,0.190,-0.037,0.007


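The p-value is 0.190. It differs from the one obtained in the z-test above because the hypotheses differ: the regression tests the two-sided alternative that the ab_page coefficient is non-zero, whereas the z-test used the one-sided alternative that the new page converts better. The two results are consistent: 1 - 0.190/2 ≈ 0.905.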
Along with testing whether the conversion rate varies for different pages, we'll also add an effect based on the user's country of residence.

In [30]:
# Look at the country lookup again before merging it into the experiment data
countries.head(n=5)

Unnamed: 0,id,country
0,834778,UK
1,928468,US
2,822059,UK
3,711597,UK
4,710616,UK


In [31]:
# Merge the countries dataframe with df

# Rename the key by name (not by position) so it matches df's key column.
countries = countries.rename(columns={"id": "user_id"})
# df stores user_id as a string, so the lookup key must be a string as well.
countries["user_id"] = countries["user_id"].astype(str)

# Left join keeps every experiment row; users missing from `countries` get NaN.
# validate="many_to_one" raises if `countries` repeats a user_id, which would
# otherwise silently duplicate experiment rows and inflate the sample.
comb = df.merge(countries, on="user_id", how="left", validate="many_to_one")
comb.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,intercept,ab_page,country
0,851104,11:48.6,control,old_page,0,1,0,US
1,804228,01:45.2,control,old_page,0,1,0,US
2,661590,55:06.2,treatment,new_page,0,1,1,US
3,853541,28:03.1,treatment,new_page,0,1,1,US
4,864975,52:26.2,control,old_page,1,1,0,US


In [32]:
# creating dummies for country and landing_page columns

# dtype=int gives 0/1 integers. pandas >= 2.0 returns booleans by default,
# which turn the regression design matrix into an object array.
country_dummies = pd.get_dummies(comb["country"], dtype=int)
page_dummies = pd.get_dummies(comb["landing_page"], dtype=int)

# Assign each dummy under the name get_dummies derived from the data, instead
# of pairing the output positionally with a hard-coded list of names. A
# hard-coded list mislabels columns if the set or order of categories changes.
comb[list(country_dummies.columns)] = country_dummies
comb[list(page_dummies.columns)] = page_dummies
comb.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted,intercept,ab_page,country,CA,UK,US,new_page,old_page
0,851104,11:48.6,control,old_page,0,1,0,US,0,0,1,0,1
1,804228,01:45.2,control,old_page,0,1,0,US,0,0,1,0,1
2,661590,55:06.2,treatment,new_page,0,1,1,US,0,0,1,1,0
3,853541,28:03.1,treatment,new_page,0,1,1,US,0,0,1,1,0
4,864975,52:26.2,control,old_page,1,1,0,US,0,0,1,0,1


In [33]:
# Conversion rate per country in percent, highest first
country_rates = comb.pivot_table(index="country", values="converted")
country_rates.sort_values(by="converted", ascending=False) * 100

Unnamed: 0_level_0,converted
country,Unnamed: 1_level_1
UK,12.059449
US,11.95468
CA,11.53183


Country seems to have very little effect on conversion. We'll see its true impact alongside the other features in the regression.

In [34]:
# Logistic regression with country added as a predictor.
# The 'CA' dummy is left out, so Canada is the baseline category.
predictors = comb[["intercept", "ab_page", "UK", "US"]]
model = sm.Logit(endog=comb["converted"], exog=predictors)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.366113
         Iterations 6


0,1,2,3
Dep. Variable:,converted,No. Observations:,290584.0
Model:,Logit,Df Residuals:,290580.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 25 Dec 2023",Pseudo R-squ.:,2.323e-05
Time:,11:01:53,Log-Likelihood:,-106390.0
converged:,True,LL-Null:,-106390.0
Covariance Type:,nonrobust,LLR p-value:,0.176

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-2.0300,0.027,-76.249,0.000,-2.082,-1.978
ab_page,-0.0149,0.011,-1.307,0.191,-0.037,0.007
UK,0.0506,0.028,1.784,0.074,-0.005,0.106
US,0.0408,0.027,1.516,0.130,-0.012,0.093


In [35]:
# Exponentiate the fitted coefficients to interpret them as odds ratios
result.params.pipe(np.exp)

intercept    0.131332
ab_page      0.985168
UK           1.051944
US           1.041599
dtype: float64

None of the coefficients except the intercept is statistically significant at the 5% level.
This is in line with the conclusions reached in the earlier analyses.


# Conclusion
All methods led to the same conclusion: there is no statistically significant evidence that the treatment (the new page) has an effect on conversion.