In [None]:
# Import needed libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load the marketing A/B test data set from disk.
data = pd.read_csv('marketing_AB.csv')

# Preview the first 20 rows to get a feel for the columns and values.
print(data.head(20))

# Count rows whose 'user id' repeats an earlier row. Each user should appear
# only once in this test, so any non-zero count points to a data issue.
poss_dups = data['user id'].duplicated().sum()
duplicate_report = 'The number of duplicates:{poss_dups}'.format(poss_dups=poss_dups)
print(duplicate_report)

# The exported index column and the user id are not needed for the
# experiment itself, so remove them from the frame in place.
data.drop(columns=['Unnamed: 0', 'user id'], inplace=True)

# Preview the first 25 rows after the drop to confirm the remaining columns.
print(data.head(25))
## Bivariate Analysis Spellcheck

#### Next, we want to look at the conversion rate of the control and experiment group
# Column names still present in the frame (shown when run as a notebook cell).
data.columns

# Each crosstab below is normalised by row (normalize='index'), so every row
# holds the share of non-converted / converted users for one category of the
# variable. The shares are printed and then drawn as stacked bars.
converted_flags = data['converted']

# Conversion share per test group.
ct_conversion_test_grp_table = pd.crosstab(
    index=data['test group'], columns=converted_flags, normalize='index'
)
print(ct_conversion_test_grp_table)
ct_conversion_test_grp_table.plot.bar(stacked=True)

# Conversion share per day on which the most ads were seen.
ct_conversion_day_table = pd.crosstab(
    index=data['most ads day'], columns=converted_flags, normalize='index'
)
print(ct_conversion_day_table)
ct_conversion_day_table.plot.bar(stacked=True)

# Conversion share per hour in which the most ads were seen.
ct_hours_table = pd.crosstab(
    index=data['most ads hour'], columns=converted_flags, normalize='index'
)
print(ct_hours_table)
ct_hours_table.plot.bar(stacked=True)

# Distribution of 'total ads' by conversion outcome, restricted to rows with
# fewer than 50 total ads so the boxes stay readable.
sns.boxplot(data=data.loc[data['total ads'] < 50], x='converted', y='total ads')
##### The median number of ads seen by those who DID NOT convert is 10, while those who did convert have a median of 25 ads shown. Notice that the minimum for the converted group is about 20 ads, showing a rather tight spread (standard deviation).
## Statistical Test
from scipy.stats import chi2_contingency

# Chi-squared test of independence between every remaining column and
# 'converted'. A p-value below alpha is reported as a significant difference.
alpha = 0.05  # significance level; constant, so it is set once outside the loop

for variable in data.columns:
    if variable == 'converted':
        # 'converted' is the outcome itself, so it is not tested against itself.
        continue

    # Contingency table of counts: one row per 'converted' value, one column
    # per distinct value of the variable.
    # NOTE: the variable name keeps its original spelling in case later cells
    # refer to it.
    contingecy_table = pd.crosstab(data['converted'], data[variable])

    # chi2_contingency returns (statistic, p-value, dof, expected table);
    # only the statistic and the p-value are reported here.
    chi2, p, _, _ = chi2_contingency(contingecy_table)

    print("\nChi-squared test for {variable} vs converted:".format(variable=variable))
    print("Chi2 Statistic: {chi2}".format(chi2=chi2))
    print("P-value: {p}".format(p=p))

    # Interpret the p-value against the significance level.
    if p < alpha:
        print("The difference in conversion rates across {variable} is significant".format(variable=variable))
    else:
        # The original called `.formar(...)` here, which raised AttributeError
        # whenever a variable turned out to be non-significant.
        print("The difference in conversion rates across {variable} is NON-significant".format(variable=variable))
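The p-value is the probability of seeing a difference at least as large as the one in our data if there were truly no relationship between the variable and conversion (the null hypothesis).
    Example: p = .25 means that, if the null hypothesis were true, a difference this large would still show up about 25% of the time by chance alone.
    Note, we want this to be as small as possible: at 5% or less (alpha = 0.05) we treat
    the difference as statistically significant rather than a chance occurrence.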

    Above we see extremely small numbers for the p-value, which makes the outcomes of this test significant across all variable columns. This means the number of ads shown, the timing of the ads, and the type of ad shown all make a difference.

    Ranking the p-value, we can see which factors are contributing the most.
    0. Total Ads Shown
    1. Most Ads Days
    2. Most Ads Hours
    3. Type of Ad Shown
from scipy.stats import ttest_ind, shapiro, levene, mannwhitneyu

# Double-check the normality assumption with a Shapiro-Wilk test on
# 'total ads', run separately for converted and non-converted users.
# NOTE(review): scipy documents that the Shapiro-Wilk p-value may be inaccurate
# for samples larger than 5000 observations - confirm the group sizes here.
total_ads_converted = data.loc[data['converted'] == True, 'total ads']
total_ads_not_converted = data.loc[data['converted'] == False, 'total ads']

shapiro_stat_true, shapiro_p_value_true = shapiro(total_ads_converted)
shapiro_stat_false, shapiro_p_value_false = shapiro(total_ads_not_converted)

# Report the p-value of each group.
true_group_report = "Shapiro-Wilk test for normailty (True group): p-value= {shapiro_p_value_true}".format(
    shapiro_p_value_true=shapiro_p_value_true
)
false_group_report = "Shapiro-Wilk test for normailty (False group): p-value= {shapiro_p_value_false}".format(
    shapiro_p_value_false=shapiro_p_value_false
)
print(true_group_report)
print(false_group_report)
from scipy.stats import chi2_contingency

# Small worked example of chi2_contingency on a hand-written 2 x 3 table of
# observed counts.
observed = [
    [10, 10, 20],
    [20, 20, 20],
]

# The call returns the test statistic, its p-value, the degrees of freedom
# and the table of expected counts.
chi2, p, dof, expected = chi2_contingency(observed)

# Print each result next to its label, in the order they were returned.
for label, value in (
    ("Chi-square statistic:", chi2),
    ("p-value:", p),
    ("Degrees of freedom:", dof),
    ("Expected table:", expected),
):
    print(label, value)
