In [35]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats as ss
import plotly.express as px
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, chi2
import ast
from scipy.stats import ttest_ind
from statsmodels.stats.proportion import proportions_ztest

users = pd.read_csv('/Users/siarheimasliankou/Documents/Work/DA_Practice/Statistics/Task_10/ab_users_data.csv')
orders = pd.read_csv('/Users/siarheimasliankou/Documents/Work/DA_Practice/Statistics/Task_10/ab_orders.csv')
products = pd.read_csv('/Users/siarheimasliankou/Documents/Work/DA_Practice/Statistics/Task_10/ab_products.csv')

In [9]:
data = orders.merge(users, on='order_id')

In [138]:
#Преобразуем данные:

data['product_ids'] = data['product_ids'].astype(str)
data['product_ids'] = data['product_ids'].apply(lambda x: list(ast.literal_eval(x)))
exploded_df = data.explode('product_ids').reset_index(drop=True)

In [8]:
df = exploded_df.rename(columns={'product_ids': 'product_id'})

Unnamed: 0,order_id,creation_time,product_id,user_id,action,time,date,group
0,1255,2022-08-26 00:00:19.000000,75,964,create_order,2022-08-26 00:00:19.000000,2022-08-26,0
1,1255,2022-08-26 00:00:19.000000,84,964,create_order,2022-08-26 00:00:19.000000,2022-08-26,0
2,1255,2022-08-26 00:00:19.000000,53,964,create_order,2022-08-26 00:00:19.000000,2022-08-26,0
3,1255,2022-08-26 00:00:19.000000,22,964,create_order,2022-08-26 00:00:19.000000,2022-08-26,0
4,1256,2022-08-26 00:02:21.000000,56,965,create_order,2022-08-26 00:02:21.000000,2022-08-26,1
...,...,...,...,...,...,...,...,...
14272,59487,2022-09-08 23:29:06.000000,9,1461,create_order,2022-09-08 23:29:06.000000,2022-09-08,0
14273,59487,2022-09-08 23:29:06.000000,77,1461,create_order,2022-09-08 23:29:06.000000,2022-09-08,0
14274,59487,2022-09-08 23:29:06.000000,62,1461,create_order,2022-09-08 23:29:06.000000,2022-09-08,0
14275,59533,2022-09-08 23:41:24.000000,17,1376,create_order,2022-09-08 23:41:24.000000,2022-09-08,0


In [13]:
#Объединяем датафреймы:

dfr = df.merge(products, on = 'product_id')

In [15]:
dfr['creation_time'] = pd.to_datetime(dfr['creation_time'])
dfr['time'] = pd.to_datetime(dfr['time'])

In [19]:
#Определяем метрики - cредний чек, cреднее время в приложении (в секундах):

metrics = dfr.groupby('group').agg({
    'price': 'mean', 
    'order_id': 'count'
    'time': lambda x: (x.max() - x.min()).seconds / len(x)
}).reset_index()

metrics = metrics.rename(columns={'price': 'avg_revenue', 'order_id': 'num_purchases', 'time': 'avg_session_time'})

# Вычисление конверсии

In [100]:
total_users = users.groupby('group')['user_id'].nunique().reset_index(name='total_users')
total_users

Unnamed: 0,group,total_users
0,0,515
1,1,502


In [93]:
purchase_users = users[users['action'] == 'create_order'].groupby('user_id').first().reset_index()

In [107]:
conv = purchase_users.merge(users, on='user_id')
conv = conv.groupby('group_x')['user_id'].count().reset_index(name='purchase_users')
conv.rename(columns={'group_x': 'group'})

Unnamed: 0,group,purchase_users
0,0,1691
1,1,2646


In [109]:
#conversion = conv.merge(total_users, on='group')
#conversion['conversion'] = conversion['purchase_users'] / conversion['total_users']


In [121]:
metrics

Unnamed: 0,group,avg_revenue,num_purchases,avg_session_time,conversion
0,0,113.571726,5567,15.316149,3.283495
1,1,110.097474,8710,9.614696,5.270916


In [123]:
#Конверсия

cvr = metrics.loc[:,['group', 'conversion']]
cvr

Unnamed: 0,group,conversion
0,0,3.283495
1,1,5.270916


In [134]:
# Количество покупок:

np = metrics.loc[:,['group', 'num_purchases']]
np

Unnamed: 0,group,num_purchases
0,0,5567
1,1,8710


# Результаты

Cтатистически значимые различия в группах зафиксированы в метрике "Время в приложении".

Значимых различий в группах по таким метрикам как "Средний чек", "Конверсия" и "Количество покупок" не найдены.

In [135]:
group_0 = dfr[dfr['group'] == 0]
group_1 = dfr[dfr['group'] == 1]

# Средний чек:
t_stat_revenue, p_val_revenue = ttest_ind(group_0['price'], group_1['price'])


# Среднее время, проведенное в приложении:
t_stat_session_time, p_val_session_time = ttest_ind(group_0.groupby('user_id')['time']\
                                        .apply(lambda x: (x.max() - x.min()).seconds), group_1.groupby('user_id')['time']\
                                        .apply(lambda x: (x.max() - x.min()).seconds))


# Конверсия:
stat, p, dof, expected = chi2_contingency(cvr)


# Количество покупок:
stat_np, p_np, dof_np, expected_np = chi2_contingency(np)




# Результаты:

results = {
    "metric": ["Средний чек", "Конверсия", "Количество покупок", "Время в приложении"],
    "t_stat": [t_stat_revenue, stat, stat_np, t_stat_session_time],
    "p_value": [p_val_revenue, p, p_np, p_val_session_time]
}

results_df = pd.DataFrame(results)
results_df["significant"] = results_df["p_value"] < 0.05
results_df

Unnamed: 0,metric,t_stat,p_value,significant
0,Средний чек,1.883615,0.059637,False
1,Конверсия,0.0,1.0,False
2,Количество покупок,0.0,1.0,False
3,Время в приложении,-2.120601,0.034197,True
