In [2]:
!pip install pymc3

Collecting pymc3
  Downloading pymc3-3.11.6-py3-none-any.whl.metadata (15 kB)
Collecting deprecat (from pymc3)
  Downloading deprecat-2.1.3-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting fastprogress>=0.2.0 (from pymc3)
  Downloading fastprogress-1.0.3-py3-none-any.whl.metadata (5.6 kB)
Collecting numpy<1.22.2,>=1.15.0 (from pymc3)
  Downloading numpy-1.22.1.zip (11.4 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.4/11.4 MB 43.4 MB/s eta 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
INFO: pip is looking at multiple versions of pymc3 to determine which version is compatible with other requirements. This could take a while.
Collecting pymc3
  Downloading pymc3-3.11.5-py3-none-any.whl.metadata (14 kB)
  Downloading pymc3-3.11.4-py3-none-any.whl.metadata (14 kB)
Collecting theano-pymc==1.1.2 (

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import beta
import pymc3 as pm

# Load the Cookie Cats CSV; the analysis below reads its `version` and
# `retention_*` columns from the module-level `df`.
DATA_URL = "https://github.com/dustywhite7/Econ8310/raw/master/AssignmentData/cookie_cats.csv"
df = pd.read_csv(DATA_URL)

def _plot_retention_posteriors(samples_A, samples_B, samples_diff, retention_type):
    """Show a two-panel figure: per-gate posteriors (left) and their difference (right)."""
    fig, (ax_rates, ax_diff) = plt.subplots(1, 2, figsize=(12, 6))

    # Posterior distributions of the two retention rates
    sns.kdeplot(samples_A, label='Gate 30 (p_A)', color='blue', ax=ax_rates)
    sns.kdeplot(samples_B, label='Gate 40 (p_B)', color='green', ax=ax_rates)
    ax_rates.set_title(f'{retention_type}-Day Retention Posterior Distributions')
    ax_rates.set_xlabel('Retention Rate')
    ax_rates.set_ylabel('Density')
    ax_rates.legend()

    # Posterior distribution of the difference, with a reference line at zero
    sns.kdeplot(samples_diff, label='p_B - p_A', color='purple', ax=ax_diff)
    ax_diff.axvline(0, color='red', linestyle='--', label='Zero Difference')
    ax_diff.set_title('Posterior Distribution of Difference')
    ax_diff.set_xlabel('Difference in Retention Rates')
    ax_diff.set_ylabel('Density')
    ax_diff.legend()

    fig.tight_layout()
    plt.show()


def bayesian_retention_analysis(retention_type, random_seed=None):
    """
    Bayesian analysis of retention rates for 1-day or 7-day retention.

    Reads the module-level ``df``, splits it into the 'gate_30' (A) and
    'gate_40' (B) versions, fits a Beta(1, 1) prior with a Binomial likelihood
    to each group, samples the posterior with PyMC3, shows a two-panel figure
    of the posteriors, and returns summary statistics.

    Parameters:
    retention_type (str): '1' for 1-day retention, '7' for 7-day retention;
        selects the column named ``retention_{retention_type}``.
    random_seed (int, optional): Seed passed to ``pm.sample``. When omitted,
        a notebook-level ``RANDOM_SEED`` is used if one is defined, otherwise 42.

    Returns:
    dict: Posterior summary with keys 'p_A_mean', 'p_B_mean' (posterior means),
        'hdi_A', 'hdi_B', 'hdi_diff' (95% HDI bounds as 2-tuples) and
        'prob_B_greater_A' (share of posterior draws where p_B > p_A).

    Raises:
    KeyError: if ``df`` has no ``retention_{retention_type}`` column.
    """
    column = f'retention_{retention_type}'
    if column not in df.columns:
        raise KeyError(
            f"Column {column!r} not found in df; retention_type should be '1' or '7'"
        )

    if random_seed is None:
        # The original code read a global RANDOM_SEED that this cell never
        # defines; honour it when present so existing notebooks keep their
        # seed, and fall back to a fixed value instead of raising NameError.
        random_seed = globals().get('RANDOM_SEED', 42)

    # Data preparation
    retention_A = df.loc[df['version'] == 'gate_30', column]
    retention_B = df.loc[df['version'] == 'gate_40', column]

    # Summarize data: retained players and group size per version
    count_A, nobs_A = retention_A.sum(), len(retention_A)
    count_B, nobs_B = retention_B.sum(), len(retention_B)

    # Bayesian inference
    with pm.Model():
        # Priors
        p_A = pm.Beta('p_A', alpha=1, beta=1)
        p_B = pm.Beta('p_B', alpha=1, beta=1)

        # Likelihood (variables register themselves with the enclosing model)
        pm.Binomial('obs_A', n=nobs_A, p=p_A, observed=count_A)
        pm.Binomial('obs_B', n=nobs_B, p=p_B, observed=count_B)

        # Difference in proportions, tracked so it appears in the trace
        pm.Deterministic('difference', p_B - p_A)

        # Posterior sampling
        trace = pm.sample(2000, return_inferencedata=False, random_seed=random_seed)

    # Summarize results
    # NOTE(review): pm.summary may round the table it returns, so these HDI
    # bounds can be coarser than the raw draws — confirm the precision wanted.
    summary = pm.summary(trace, hdi_prob=0.95)
    hdi_columns = ['hdi_2.5%', 'hdi_97.5%']
    hdi_A = summary.loc['p_A', hdi_columns]
    hdi_B = summary.loc['p_B', hdi_columns]
    hdi_diff = summary.loc['difference', hdi_columns]

    samples_A = trace['p_A']
    samples_B = trace['p_B']
    samples_diff = trace['difference']

    # Posterior probability that B > A
    prob_B_greater_A = (samples_diff > 0).mean()

    # Visualization
    _plot_retention_posteriors(samples_A, samples_B, samples_diff, retention_type)

    return {
        'p_A_mean': samples_A.mean(),
        'p_B_mean': samples_B.mean(),
        'hdi_A': tuple(hdi_A),
        'hdi_B': tuple(hdi_B),
        'hdi_diff': tuple(hdi_diff),
        'prob_B_greater_A': prob_B_greater_A
    }

# Run the Bayesian analysis for each retention window: 1-day first, then 7-day
results_1day_bayes = bayesian_retention_analysis(retention_type='1')
results_7day_bayes = bayesian_retention_analysis(retention_type='7')

# Print results
def print_bayesian_results(results, retention_type):
    """
    Print a plain-text report of one retention analysis.

    Parameters:
    results (dict): Must provide 'p_A_mean', 'hdi_A', 'p_B_mean', 'hdi_B',
        'hdi_diff' and 'prob_B_greater_A'.
    retention_type (str): Label used in the report heading (e.g. '1' or '7').
    """
    print(f"\n{retention_type}-Day Bayesian Retention Analysis:")

    # (label, key in `results`, format spec); an empty spec prints the value as-is
    report_rows = (
        ("Gate 30 Retention Rate (mean)", 'p_A_mean', ".4f"),
        ("Gate 30 95% HDI", 'hdi_A', ""),
        ("Gate 40 Retention Rate (mean)", 'p_B_mean', ".4f"),
        ("Gate 40 95% HDI", 'hdi_B', ""),
        ("95% HDI of Difference (p_B - p_A)", 'hdi_diff', ""),
        ("Posterior Probability Gate 40 > Gate 30", 'prob_B_greater_A', ".2%"),
    )
    for label, key, spec in report_rows:
        print(f"{label}: {format(results[key], spec)}")

# Report the 1-day results first, then the 7-day results
print_bayesian_results(results=results_1day_bayes, retention_type='1')
print_bayesian_results(results=results_7day_bayes, retention_type='7')


  self.ctor = getattr(np, o_type.dtype)


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations