# Import libraries

In [1]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# !pip install statsmodels
import statsmodels.stats.api as sms
from scipy.stats import ttest_1samp, shapiro, levene, ttest_ind, mannwhitneyu, \
    pearsonr, spearmanr, kendalltau, f_oneway, kruskal
from statsmodels.stats.proportion import proportions_ztest
import warnings
# Install a global "ignore" filter so that warnings raised by the libraries
# used below are not shown in the cell outputs.
# NOTE(review): this also hides any warnings the statistical tests may emit;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Notebook-wide pandas display settings:
#   - show every column (no column truncation),
#   - show at most 10 rows of a frame,
#   - render floats with the fixed-point '%5f' format.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 10),
    ("display.float_format", lambda value: "%5f" % value),
):
    pd.set_option(option_name, option_value)

# Preparing and Analyzing Data

In [3]:
# Load the A/B test results CSV from the Kaggle input directory.
# `df_` is kept as the untouched, as-loaded frame.
df_ = pd.read_csv("/kaggle/input/ab-test-data/AB_Test_Results.csv")

In [4]:
# Work on a copy so the loaded frame `df_` is not modified by later cells.
df = df_.copy()

In [5]:
# Preview the first rows: one row per record with USER_ID, VARIANT_NAME
# ("control" / "variant") and REVENUE.
df.head()

Unnamed: 0,USER_ID,VARIANT_NAME,REVENUE
0,737,variant,0.0
1,2423,control,0.0
2,9411,control,0.0
3,7311,control,0.0
4,6174,variant,0.0


In [6]:
# Summary statistics of the numeric columns, transposed so that each column
# (USER_ID, REVENUE) becomes one row of the table.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
USER_ID,10000.0,4981.0802,2890.590115,2.0,2468.75,4962.0,7511.5,10000.0
REVENUE,10000.0,0.099447,2.318529,0.0,0.0,0.0,0.0,196.01


In [7]:
# (number of rows, number of columns) of the working frame.
df.shape

(10000, 3)

In [8]:
# Rows whose VARIANT_NAME is "control" (the control group).
is_control = df["VARIANT_NAME"].eq("control")
control = df.loc[is_control]

In [9]:
# Rows whose VARIANT_NAME is "variant" (the test group).
is_variant = df["VARIANT_NAME"].eq("variant")
test = df.loc[is_variant]

## Step 1 - Formulate Hypotheses:

* Null Hypothesis (H0): μ1 = μ2 (There is no statistically significant difference in terms of revenue between the Control and Variant Groups)
* Alternative Hypothesis (H1): μ1 ≠ μ2 (There is a statistically significant difference between the Control and Variant Groups in terms of revenue)

## Step 2 - Assumption Checks:

* Normality Assumption
* Homogeneity of Variance

## Step 3 - Normality Assumption Hypothesis:

* H0: The normal distribution assumption is satisfied.
* H1: The normal distribution assumption is violated.

## Step 4 - Assumption of Homogeneity of Variance:

* H0: Variances are homogeneous.
* H1: Variances are not homogeneous.

## Normality Assumption

### The hypothesis regarding the normality assumption is as follows:

* H0: The normal distribution assumption holds.
* H1: The normal distribution assumption is not met.

In [10]:
# Shapiro-Wilk normality test on the control group's revenue.
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
# inaccurate for N > 5000; the group size is not shown here — confirm.
control_revenue = control["REVENUE"]
test_stat, p_value = shapiro(control_revenue)
print('Test Stat=%.4f,p-value=%.4f' % (test_stat, p_value))

Test Stat=0.0183,p-value=0.0000


### Notes:

### If the p-value is less than 0.05, we reject the null hypothesis (H0)

In [11]:
# Shapiro-Wilk normality test on the variant group's revenue.
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be
# inaccurate for N > 5000; the group size is not shown here — confirm.
variant_revenue = test["REVENUE"]
test_stat, p_value = shapiro(variant_revenue)
print('Test Stat=%.4f,p-value=%.4f' % (test_stat, p_value))

Test Stat=0.0270,p-value=0.0000


## Normality Assumption Result 

### H0 has been rejected for both groups (p-value < 0.05), indicating that the data does not follow a normal distribution. As a result, there is no need to assess the homogeneity of variances. We can proceed directly to non-parametric statistical analysis, specifically the Mann-Whitney U test.

### Non-parametric Test

In [12]:
# Mann-Whitney U test comparing REVENUE between the control and variant groups.
# `alternative` is passed explicitly so the test is two-sided, matching the
# stated H1 (μ1 ≠ μ2), instead of relying on SciPy's version-dependent default
# (older releases defaulted to a deprecated one-sided mode).
test_stat, pvalue = mannwhitneyu(control["REVENUE"],
                                 test["REVENUE"],
                                 alternative="two-sided")
# H0 is rejected only when the printed p-value is below 0.05.
print('Test stat = %.4f,p-value=%.4f' % (test_stat, pvalue))

Test stat = 12521564.0000,p-value=0.4783


# Result 

### Since the p-value (0.4783) is greater than 0.05, we fail to reject the Null Hypothesis (H0): μ1 = μ2 (There is no statistically significant difference in terms of revenue between the Control and Variant Groups)