In [2]:
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.stats.api as sms
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Widen pandas' text rendering so wide frames are not elided or wrapped early.
pd.options.display.max_columns = 100  # show up to 100 columns before truncating
pd.options.display.width = 120        # allow 120 characters per output line

In [17]:
# Location of the raw events CSV on the author's machine.
data_path = r"C:\Users\pc1\ab-project-sta\events.csv"

# low_memory=False: parse the file in one pass instead of in chunks.
df = pd.read_csv(data_path, low_memory=False)

# Visual divider printed between the individual inspection outputs.
separator = "*" * 45

# Quick structural overview: schema, shape, dtypes, then missing-value counts.
df.info()
print(separator)
print(df.shape)
print(separator)
print(df.dtypes)
print(separator)
display(df.isnull().sum())

# Seeded sample so the preview rows are the same on every run.
print(separator)
display(df.sample(n=5, random_state=56))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   timestamp      int64  
 1   visitorid      int64  
 2   event          object 
 3   itemid         int64  
 4   transactionid  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB
*********************************************
(2756101, 5)
*********************************************
timestamp          int64
visitorid          int64
event             object
itemid             int64
transactionid    float64
dtype: object
*********************************************


timestamp              0
visitorid              0
event                  0
itemid                 0
transactionid    2733644
dtype: int64

*********************************************


Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
1004238,1440312696369,38970,view,361281,
1680937,1431649595137,499099,view,392734,
2450941,1437323778248,1236389,view,249432,
1273797,1441643899278,81392,view,227332,
1127973,1440906928835,869572,view,344131,


In [19]:
# Rebind to a fresh copy before adding/overwriting columns.
df = df.copy()

# Convert epoch milliseconds to pandas datetimes. 'coerce' (previously
# misspelled as 'corece') turns unparseable values into NaT so that the
# dropna on 'timestamp' below can remove them instead of the cell raising.
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='ms', errors='coerce')

# Discard rows without a usable timestamp or item id, then order chronologically.
df = df.dropna(subset=['timestamp'])
df = df.dropna(subset=['itemid'])
df = df.sort_values('timestamp')

In [20]:
# Seeded (same seed as the raw-data preview) so Restart & Run All shows the same rows.
display(df.sample(10, random_state=56))

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
2720923,2015-07-30 15:34:45.164,822537,view,64926,
1206956,2015-09-03 13:50:21.960,443885,view,218770,
1950411,2015-05-27 02:29:17.457,720494,view,164941,
1400217,2015-09-13 16:20:00.064,326834,view,303861,
1932652,2015-05-24 21:06:16.622,1291242,view,93367,
603388,2015-06-30 16:13:49.183,860070,view,77901,
595210,2015-07-01 03:52:09.574,236395,addtocart,161623,
1997271,2015-05-29 03:09:35.653,2847,view,288528,
104079,2015-06-05 18:09:14.559,804884,view,133907,
1677764,2015-05-13 10:46:58.878,1299147,view,436846,


In [21]:
# Frequency of each event type, most common first.
df.loc[:, 'event'].value_counts()

event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64

In [27]:
# Fraction of all event rows (not of visitors) whose event is 'addtocart':
# the mean of a boolean mask is the proportion of True values.
add_to_cart_rate = (df['event'] == 'addtocart').mean()
add_to_cart_rate

np.float64(0.02515582701795036)

In [28]:
# Fraction of all event rows whose event is 'transaction'.
# NOTE(review): this is an event-level share, not a per-visitor conversion rate.
conversion_rate = (df['event'] == 'transaction').mean()
conversion_rate

np.float64(0.008148104877143472)

In [32]:
# Per-visitor summary: number of events, plus flags for whether the visitor
# ever added to cart / ever completed a transaction.
# The flags are built as vectorised boolean columns first so the groupby can
# use the built-in 'any' reduction rather than a Python lambda per visitor.
user = (
    df.assign(
        _is_add=df['event'].eq('addtocart'),
        _is_txn=df['event'].eq('transaction'),
    )
    .groupby('visitorid')
    .agg(
        event_count=('event', 'count'),
        had_add=('_is_add', 'any'),
        had_txn=('_is_txn', 'any'),
    )
    .reset_index()
)

# Event counts split by whether the visitor ever added an item to the cart.
# 'had_add' is already boolean, so it is used directly as the mask.
group_add = user.loc[user['had_add'], 'event_count']
group_view = user.loc[~user['had_add'], 'event_count']

In [33]:
# t-based confidence intervals for the mean event count of each visitor group.
# tconfint_mean() is called with its defaults (presumably a two-sided 95%
# interval -- confirm against the statsmodels documentation).
add_stats = sms.DescrStatsW(group_add)
view_stats = sms.DescrStatsW(group_view)

ci_add = add_stats.tconfint_mean()
ci_view = view_stats.tconfint_mean()

display(ci_add, ci_view)

(np.float64(10.241043685881179), np.float64(11.722956102040987))

(np.float64(1.703776446397503), np.float64(1.7153195474938348))

In [36]:
# Two-sample t-test on per-visitor event counts; equal_var=False selects
# Welch's variant, which does not assume the two groups share a variance.
t_stat, p_value = stats.ttest_ind(group_add, group_view, equal_var=False)

# The p-value is printed in scientific notation: a fixed 10-decimal format
# rounds very small p-values to 0.0000000000 and hides their magnitude.
print(f"t-test is: {t_stat: .10f}   ,   p-value is : {p_value : .3e}")

t-test is:  24.5273652780   ,   p-value is :  0.0000000000


In [54]:
def cohens_d(x, y):
    """Return Cohen's d for two independent samples using the pooled standard deviation.

    Parameters
    ----------
    x, y : array-like of numbers
        One-dimensional samples (pandas Series, NumPy array, list, ...).
        Values are converted to float arrays; NaNs are not skipped, so drop
        missing values before calling.

    Returns
    -------
    numpy.float64
        ``(mean(x) - mean(y)) / pooled_std``, where the pooled standard
        deviation is built from the unbiased (ddof=1) sample variances.
    """
    # Coerce to ndarrays so the variance convention is explicit and identical
    # for every input type (pandas defaults to ddof=1, NumPy to ddof=0).
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    nx, ny = len(x), len(y)
    pooled_var = ((nx - 1) * x.var(ddof=1) + (ny - 1) * y.var(ddof=1)) / (nx + ny - 2)
    return (x.mean() - y.mean()) / np.sqrt(pooled_var)

# Standardised difference in mean event count: add-to-cart visitors vs. the rest.
effect = cohens_d(x=group_add, y=group_view)
effect

np.float64(0.7423272550505853)

In [60]:
# Power of the two-sample t-test for the observed effect size and the
# actual group sizes (ratio = nobs2 / nobs1).
# alpha is the significance level: 0.05, not 0.5 -- a 50% type-I error rate
# is not a meaningful threshold and inflates the computed power.
power = sms.TTestIndPower().power(
    effect_size = effect,
    nobs1 = len(group_add),
    ratio = len(group_view)/len(group_add),
    alpha = 0.05
)
power

np.float64(1.0)