# MSU Library Project

In [1]:
import pandas as pd
from scipy.stats import chi2_contingency
#from scipy import stats
import seaborn as sns

## What was the click-through rate for each version?  

In [2]:
# 10283 visits - 42 clicks on INTERACT
# 2742 visits - 53 for CONNECT
# 2747 visits - 21  clicks on LEARN
# 3180 vistis - 38 clicks on HELP
# 2064 visits - 45 clicks on SERVICES

In [20]:
data = {
    "Version": ['Interact', 'Connect', 'Learn', 'Help', 'Services'],
    "Clicks": [42, 53, 21, 38, 45],
    "Visits": [10283, 2742, 2747, 3180, 2064] 
}

observed_clicks_CTR = pd.DataFrame(data)
observed_clicks_CTR['CTR'] = observed_clicks_CTR['Clicks'] / observed_clicks_CTR['Visits']

observed_clicks_CTR.sort_values('CTR', ascending=False)

Unnamed: 0,Version,Clicks,Visits,CTR
4,Services,45,2064,0.021802
1,Connect,53,2742,0.019329
3,Help,38,3180,0.01195
2,Learn,21,2747,0.007645
0,Interact,42,10283,0.004084


## Which version was the winner?  

Null Hypothesis: All versions have the same CTR.  
Alternative Hypothesis: There is a difference in the CTR for the different versions.

In [21]:
alpha = 0.1

In [22]:
data = {
    "Clicks": [42, 53, 21, 38, 45],
    "Visits": [10283, 2742, 2747, 3180, 2064]
}

observed_clicks = pd.DataFrame(data)
observed_clicks['No_click'] = observed_clicks['Visits'] - observed_clicks['Clicks']

observed_clicks = observed_clicks[['Clicks', 'No_click']].T
observed_clicks.columns = ['Interact', 'Connect', 'Learn', 'Help', 'Services']

observed_clicks


Unnamed: 0,Interact,Connect,Learn,Help,Services
Clicks,42,53,21,38,45
No_click,10241,2689,2726,3142,2019


In [23]:
chisq, pvalue, df, expected = chi2_contingency(observed_clicks)

In [38]:
pvalue

0.007370912499282061

In [39]:
if abs(pvalue) < abs(alpha):
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

Reject the null hypothesis


### Perform a Post-hoc-test

In [40]:
num_comparisons = 10
alpha_pht = 0.1 / num_comparisons

Null Hypothesis: Both versions have the same CTR.  
Alternative Hypothesis: There is a difference in the CTR between the two versions.

In [30]:
observed_clicks_1 = observed_clicks.iloc[:, :2]
observed_clicks_1

Unnamed: 0,Interact,Connect
Clicks,42,53
No_click,10241,2689


In [31]:
chisq, pvalue, df, expected = chi2_contingency(observed_clicks_1)

In [32]:
if abs(pvalue) < abs(alpha_pht):
    print("Reject the null hypothesis")
else:
    print("Fail to reject the null hypothesis")

Reject the null hypothesis


In [33]:
columns = observed_clicks.columns

column_combinations = []

for i in range(len(columns)):
    for j in range(i + 1, len(columns)):
        column_combinations.append((columns[i], columns[j]))

column_combinations

[('Interact', 'Connect'),
 ('Interact', 'Learn'),
 ('Interact', 'Help'),
 ('Interact', 'Services'),
 ('Connect', 'Learn'),
 ('Connect', 'Help'),
 ('Connect', 'Services'),
 ('Learn', 'Help'),
 ('Learn', 'Services'),
 ('Help', 'Services')]

In [34]:
for col_pair in column_combinations:
    contingency_table = observed_clicks[list(col_pair)]
    chisq, pvalue, df, expected = chi2_contingency(contingency_table)

    if pvalue < alpha_pht:
        print(f"Reject the null hypothesis for {col_pair[0]} vs {col_pair[1]}, p-value: {pvalue}")
    else:
        print(f"Fail to reject the null hypothesis for {col_pair[0]} vs {col_pair[1]}, p-value: {pvalue}")


Reject the null hypothesis for Interact vs Connect, p-value: 2.2250331654688293e-16
Fail to reject the null hypothesis for Interact vs Learn, p-value: 0.025419824342152637
Reject the null hypothesis for Interact vs Help, p-value: 9.03599988558687e-07
Reject the null hypothesis for Interact vs Services, p-value: 5.719451224375125e-18
Reject the null hypothesis for Connect vs Learn, p-value: 0.00027678881264505827
Fail to reject the null hypothesis for Connect vs Help, p-value: 0.02808815288948292
Fail to reject the null hypothesis for Connect vs Services, p-value: 0.6188771123975272
Fail to reject the null hypothesis for Learn vs Help, p-value: 0.12512753088691322
Reject the null hypothesis for Learn vs Services, p-value: 5.0540996583731365e-05
Reject the null hypothesis for Help vs Services, p-value: 0.007370912499282061


There is a statistically significant difference between:  

Interact vs Connect  
Interact vs Help  
Interact vs Services  
Connect vs Learn  
Learn vs Services   
Help vs Services  

In [37]:
observed_clicks_CTR.sort_values('CTR', ascending=False)

Unnamed: 0,Version,Clicks,Visits,CTR
4,Services,45,2064,0.021802
1,Connect,53,2742,0.019329
3,Help,38,3180,0.01195
2,Learn,21,2747,0.007645
0,Interact,42,10283,0.004084


Services performs statistically better than Help however we cannot say it performs statistically better than Connect. 
Connect performs statistically better than Learn however we cannot say it performs statistically better than Help.  
So the winner are Services and Connect??

Looking a drop-out Rates as well - the winner is Services