In [1]:
from scipy import stats
import numpy as np
import pandas as pd

In [11]:
# Allow up to 500 rows to be rendered when a DataFrame is displayed.
pd.set_option("display.max_rows", 500)

# Short variant key -> name of the export folder (also embedded in the CSV file name).
file_names = {
    "interact": "Homepage Version 1 - Interact, 5-29-2013",
    "connect": "Homepage Version 2 - Connect, 5-29-2013",
    "learn": "Homepage Version 3 - Learn, 5-29-2013",
    "help": "Homepage Version 4 - Help, 5-29-2013",
    "services": "Homepage Version 5 - Services, 5-29-2013",
}

path = '../data/CrazyEgg/'


def _element_list_csv(variant):
    """Return the path of the "Element list" CSV for the given variant key."""
    folder = file_names[variant]
    return f"{path}{folder}/Element list {folder}.csv"


# One DataFrame per homepage variant, each read from that variant's element-list CSV.
interact_df = pd.read_csv(_element_list_csv('interact'))
connect_df = pd.read_csv(_element_list_csv('connect'))
learn_df = pd.read_csv(_element_list_csv('learn'))
help_df = pd.read_csv(_element_list_csv('help'))
services_df = pd.read_csv(_element_list_csv('services'))

In [116]:
# Click count of the button under test in each variant: the 'No. clicks' value of the
# first row whose Name matches the button text (raises IndexError if no row matches).
interact_clicks = interact_df.loc[interact_df.Name=='INTERACT', 'No. clicks'].values[0]
connect_clicks = connect_df.loc[connect_df.Name=='CONNECT', 'No. clicks'].values[0]
learn_clicks = learn_df.loc[learn_df.Name=='LEARN', 'No. clicks'].values[0]
help_clicks = help_df.loc[help_df.Name=='HELP', 'No. clicks'].values[0]
services_clicks = services_df.loc[services_df.Name=='SERVICES', 'No. clicks'].values[0]

# Home page visit counts per variant. These are hard-coded here rather than read from
# the CSVs -- presumably taken from the course material; TODO confirm the source.
interact_home_page_visits = 10283
connect_home_page_visits = 2742
learn_home_page_visits = 2747
help_home_page_visits = 3180
services_home_page_visits = 2064

# The three lists below must stay in the same variant order as `button_texts`.
button_texts = ['Interact', 'Connect', 'Learn', 'Help', 'Services']
home_page_visits = [interact_home_page_visits, connect_home_page_visits, learn_home_page_visits, help_home_page_visits, services_home_page_visits]
clicks_throughs = [interact_clicks, connect_clicks, learn_clicks, help_clicks, services_clicks]

# Build the frame from the raw counts only. The previous version also passed
# `"ctr": ctr`, but `ctr` is never defined in this notebook, so the cell raised
# NameError on a fresh kernel; the column is derived just below instead.
ctr_results = pd.DataFrame({
    "home_page_visits": home_page_visits,
    "click_throughs": clicks_throughs,
}, index=button_texts)

# Click-through rate = clicks on the button / visits to that home page variant.
ctr_results['ctr'] = ctr_results.click_throughs/ctr_results.home_page_visits

ctr_results.sort_values('ctr', ascending=False)

Unnamed: 0,home_page_visits,click_throughs,ctr
Services,2064,45,0.021802
Connect,2742,53,0.019329
Help,3180,38,0.01195
Learn,2747,21,0.007645
Interact,10283,42,0.004084


In [99]:
# Variants become columns; the derived 'ctr' row is dropped and the two count rows
# are relabelled. At this point 'No-click' still holds the total visit counts.
contingency_table = (
    ctr_results.T
    .drop('ctr')
    .rename({'click_throughs': 'Click', 'home_page_visits': 'No-click'})
)

# Turn total visits into non-clicking visits by subtracting the clicks.
visits_without_click = contingency_table.loc['No-click'] - contingency_table.loc['Click']
contingency_table.loc['No-click'] = visits_without_click
contingency_table

Unnamed: 0,Interact,Connect,Learn,Help,Services
No-click,10241.0,2689.0,2726.0,3142.0,2019.0
Click,42.0,53.0,21.0,38.0,45.0


In [112]:
# Chi-squared test of independence between variant and click / no-click, all variants.
chisq, pvalue, df, expected = stats.chi2_contingency(contingency_table)

# Assemble the report first, then emit it in one go (same text as line-by-line prints).
report = [
    f"{contingency_table}",
    "\n",
    f"chisq: {chisq}",
    f"pvalue: {pvalue}",
    f"df: {df}",
    f"expected:\n {expected}",
]
print("\n".join(report))

          Interact  Connect   Learn    Help  Services
No-click   10241.0   2689.0  2726.0  3142.0    2019.0
Click         42.0     53.0    21.0    38.0      45.0


chisq: 96.7432353798328
pvalue: 4.852334301093838e-20
df: 4
expected:
 [[10185.6305196   2716.03606776  2720.98872288  3149.88865626
   2044.4560335 ]
 [   97.3694804     25.96393224    26.01127712    30.11134374
     19.5439665 ]]


In [113]:
# Repeat the test without the 'Interact' column (a missing column is silently ignored).
contingency_table_2 = contingency_table.drop(columns='Interact', errors='ignore')
chisq, pvalue, df, expected = stats.chi2_contingency(contingency_table_2)

# Assemble the report first, then emit it in one go (same text as line-by-line prints).
report = [
    f"{contingency_table_2}",
    "\n",
    f"chisq: {chisq}",
    f"pvalue: {pvalue}",
    f"df: {df}",
    f"expected:\n {expected}",
]
print("\n".join(report))

          Connect   Learn    Help  Services
No-click   2689.0  2726.0  3142.0    2019.0
Click        53.0    21.0    38.0      45.0


chisq: 22.450979530401828
pvalue: 5.25509870228566e-05
df: 3
expected:
 [[2701.89061772 2706.8174788  3133.48364856 2033.80825491]
 [  40.10938228   40.1825212    46.51635144   30.19174509]]


In [114]:
# Repeat the test again after also removing the 'Learn' column
# (a missing column is silently ignored).
contingency_table_3 = contingency_table_2.drop(columns='Learn', errors='ignore')
chisq, pvalue, df, expected = stats.chi2_contingency(contingency_table_3)

# Assemble the report first, then emit it in one go (same text as line-by-line prints).
report = [
    f"{contingency_table_3}",
    "\n",
    f"chisq: {chisq}",
    f"pvalue: {pvalue}",
    f"df: {df}",
    f"expected:\n {expected}",
]
print("\n".join(report))

          Connect    Help  Services
No-click   2689.0  3142.0    2019.0
Click        53.0    38.0      45.0


chisq: 8.576830710947847
pvalue: 0.013726659948517534
df: 2
expected:
 [[2695.30428249 3125.84522915 2028.85048835]
 [  46.69571751   54.15477085   35.14951165]]


## Bonferroni Adjustment

When the overall test does not single out a clear winner, we run a chi-squared test on every pair of variants and use the Bonferroni adjustment to account for making multiple comparisons.

In this case we make the p-value threshold stricter by dividing it by the number of comparisons; each pairwise p-value is then compared against this adjusted threshold.

In [117]:
# Re-display the full five-variant table that the pairwise comparisons below select columns from.
contingency_table

Unnamed: 0,Interact,Connect,Learn,Help,Services
No-click,10241.0,2689.0,2726.0,3142.0,2019.0
Click,42.0,53.0,21.0,38.0,45.0


In [133]:
# NOTE: despite its name, 0.95 is used as the confidence level here; the p-value
# threshold (alpha) for a single test is its complement.
significance_level = 0.95
p_value_threshold = 1 - significance_level

# Enumerate every unordered pair of button texts. The count is derived from
# `button_texts` rather than hard-coded so it cannot drift from the variant list.
num_comparisons = 0
print(f'What is the number of possible pair comparisons of the {len(button_texts)} button choices?')
for i, first_button in enumerate(button_texts):
    for second_button in button_texts[i + 1:]:
        print(first_button, second_button)
        num_comparisons += 1

print('\n')
print("num_comparisons:", num_comparisons)

# Bonferroni adjustment: divide the per-test threshold by the number of comparisons.
# The exact value is kept for the decisions made later; rounding it (as before) would
# shift the cut-off, so rounding is applied to the displayed value only.
post_hoc_p_value_threshold = p_value_threshold / num_comparisons
print("New p-value:", round(post_hoc_p_value_threshold, 4))


What is the number of possible pair comparisons of the 5 button choices?
Interact Connect
Interact Learn
Interact Help
Interact Services
Connect Learn
Connect Help
Connect Services
Learn Help
Learn Services
Help Services


num_comparisons: 10
New p-value: 0.005


In [135]:
# Each pairwise comparison has the null hypothesis that the difference between the
# two variants' results is due to chance.
#
# Every pair is a 2x2 table (1 degree of freedom), for which scipy's
# chi2_contingency applies Yates' continuity correction by default.

# All unordered pairs, derived from `button_texts` instead of a hard-coded range(5)
# so the comparisons always match the variant list.
variant_pairs = [
    (first_button, second_button)
    for i, first_button in enumerate(button_texts)
    for second_button in button_texts[i + 1:]
]

bonferroni_comparisons = [
    first_button + ' vs ' + second_button
    for first_button, second_button in variant_pairs
]
# Index 1 of the chi2_contingency result is the p-value.
bonferroni_p_values = [
    stats.chi2_contingency(contingency_table[[first_button, second_button]])[1]
    for first_button, second_button in variant_pairs
]
# A pair is significant only if it beats the Bonferroni-adjusted threshold.
better_than_p_value_threshold = [
    p_value < post_hoc_p_value_threshold for p_value in bonferroni_p_values
]

bonferroni_results = pd.DataFrame({
    "comparison": bonferroni_comparisons,
    "p_values": bonferroni_p_values,
    "Rejects null hypothesis": better_than_p_value_threshold
})
bonferroni_results.sort_values('p_values')

Unnamed: 0,comparison,p_values,Rejects null hypothesis
3,Interact vs Services,5.719451e-18,True
0,Interact vs Connect,2.225033e-16,True
2,Interact vs Help,9.036e-07,True
8,Learn vs Services,5.0541e-05,True
4,Connect vs Learn,0.0002767888,True
9,Help vs Services,0.007370912,False
1,Interact vs Learn,0.02541982,False
5,Connect vs Help,0.02808815,False
7,Learn vs Help,0.1251275,False
6,Connect vs Services,0.6188771,False


## How do we decide which button is best?

Services and Connect show the best CTR values. 
For both, the pairwise test against the original Interact button rejects the null hypothesis that the difference in CTR is due to chance.

However, the difference between the CTR values of Services and Connect is not significant, so we cannot say with confidence which performs better.

To decide which button is better we must turn to the other rates which were supplied in the course material.
Services has a far lower Drop-Off Rate and Homepage Return Rate. 
Therefore, Services is the best choice of button text.