# Import libs

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import warnings

from statsmodels.stats.proportion import proportions_ztest, proportion_confint

In [None]:
warnings.filterwarnings("ignore")

# Load and Prepare Data

In [None]:
# Load the A/B test dataset from the CSV file (path is relative to the working directory).
ab_df = pd.read_csv(filepath_or_buffer='data/ab_data.csv')

In [None]:
# One single-column ('converted') frame per group, each renumbered from 0.
control_df = ab_df.loc[ab_df['group'] == 'control', ['converted']].reset_index(drop=True)
experiment_df = ab_df.loc[ab_df['group'] == 'treatment', ['converted']].reset_index(drop=True)

# Look at Data

In [None]:
# Preview the first five rows of the loaded data.
ab_df.head(n=5)

In [None]:
# Number of rows in each experiment group.
ab_df.loc[:, 'group'].value_counts()

# There are some problems in your data

- These problems will affect your A/B test results, so clean the data before continuing

In [None]:
# remove duplicated user_ids
# Possible solution: count the rows per user_id and drop every user that appears
# more than once (a repeated user may have been exposed to both variants).
session_counts = ab_df['user_id'].value_counts()
users_to_drop = session_counts[session_counts > 1].index

ab_df = ab_df[~ab_df['user_id'].isin(users_to_drop)]
print(f'The updated dataset now has {ab_df.shape[0]} entries')

# Take a random sample

- The sample size should be equal for each group: take 4720 samples per group.
- To make the sampling reproducible, set the random number generator seed (`random_state`) to 22

- Read about sampling method from Pandas:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sample.html

In [None]:
# IPython help syntax: the trailing `?` displays the docstring of DataFrame.sample.
ab_df.sample?

In [None]:
# Possible solution: draw 4720 rows from each group, seeding the sampler with 22
# so that re-running the cell yields the same sample.
control_sample = ab_df[ab_df['group'] == 'control'].sample(n=4720, random_state=22)
treatment_sample = ab_df[ab_df['group'] == 'treatment'].sample(n=4720, random_state=22)

# Stack both samples and renumber the rows from 0 (chained instead of inplace=True).
ab_test_df = pd.concat([control_sample, treatment_sample], axis=0).reset_index(drop=True)

In [None]:
# Preview the first five rows of the sampled data.
ab_test_df.head(n=5)

In [None]:
# Number of sampled rows in each experiment group.
ab_test_df.loc[:, 'group'].value_counts()

In [None]:
# Bar height is seaborn's default estimator (the mean) of 'converted' within each group.
fig, ax = plt.subplots(figsize=(8, 6))

# ci=None is the documented way to switch error bars off. ci=False is not None,
# so seaborn would still take the bootstrap path and treat it as a numeric level.
sns.barplot(x='group', y='converted', data=ab_test_df, ci=None, ax=ax)

ax.set_ylim(0, 0.17)
ax.set_title('Conversion rate by group', pad=20)
ax.set_xlabel('Group', labelpad=15)
ax.set_ylabel('Converted (proportion)', labelpad=15);

# Basic Statistics: Is Experiment better or worse than Control?

1. Start by calculating: mean, standard deviation and standard error (SE) for each group

**Tips**
Learn more about python library scipy, in particular **stats**

*import scipy.stats as stats*

- Signature: **stats.sem**(a, axis=0, ddof=1, nan_policy='propagate')
    - Docstring: Compute standard error of the mean.
    - Set `ddof=0` (zero delta degrees of freedom) so the result matches the standard error of a proportion, $\sqrt{p(1-p)/n}$

In [None]:
# IPython help syntax: the trailing `?` displays the docstring of scipy.stats.sem.
stats.sem?

In [None]:
# Possible solution
def std_p(x):
    """Return the standard deviation of x computed with ddof=0."""
    return np.std(x, ddof=0)


def se_p(x):
    """Return the standard error of the mean of x computed with ddof=0."""
    return stats.sem(x, ddof=0)


# Named aggregation (output_column=function) replaces the renaming-dict form, which
# pandas >= 1.0 rejects on a SeriesGroupBy with SpecificationError
# ("nested renamer is not supported"). The keywords already name the output columns,
# so no separate `.columns = [...]` assignment is needed.
conversion_rates = ab_test_df.groupby('group')['converted'].agg(
    conversion_rate='mean',
    std_deviation=std_p,
    std_error=se_p,
)

In [None]:
# Render the summary table with every value shown to three decimal places.
conversion_rates.style.format(formatter='{:.3f}')

### What can you conclude from those results?

    - Write down your comments here

# Testing the hypothesis

- Explore library:
    - from statsmodels.stats.proportion import proportions_ztest, proportion_confint
    - see documentation: https://www.statsmodels.org/stable/generated/statsmodels.stats.proportion.proportion_confint.html

In [None]:
# Possible solution: pull the 'converted' outcomes of each group out of the sampled frame.
control_results = ab_test_df[ab_test_df['group'] == 'control']['converted']
treatment_results = ab_test_df[ab_test_df['group'] == 'treatment']['converted']

# Observations per group (count() skips missing values).
total_control = control_results.count()
total_treatment = treatment_results.count()

# Successes and sample sizes, always ordered control first, treatment second.
total_successes_for_each = [control_results.sum(), treatment_results.sum()]
total_observations = [total_control, total_treatment]

- Learn what the methods `proportions_ztest` and `proportion_confint` return

- Run proportions_ztest method and proportion_confint with your data
    - significance level should be 0.05

In [None]:
# Possible solution: z-test comparing the two groups' conversion proportions.
z_stat, pval = proportions_ztest(total_successes_for_each, nobs=total_observations)

# Confidence interval per group at significance level alpha=0.05 (i.e. 95%).
# proportion_confint returns the lower bounds first, then the upper bounds.
(lower_con, lower_treat), (upper_con, upper_treat) = proportion_confint(
    total_successes_for_each, nobs=total_observations, alpha=0.05
)

In [None]:
# Report the test statistic, the p-value and both confidence intervals, one per line.
print(
    f'z statistic: {z_stat:.2f}',
    f'p-value: {pval:.3f}',
    f'ci 95% for control group: [{lower_con:.3f}, {upper_con:.3f}]',
    f'ci 95% for treatment group: [{lower_treat:.3f}, {upper_treat:.3f}]',
    sep='\n',
)

# Conclusions

Write your conclusions