In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
import json
import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this presumably also hides SciPy/NumPy runtime warnings that would
# explain NaN test results further down — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Use seaborn's "darkgrid" theme for all subsequent plots.
sns.set_theme(style="darkgrid")

In [None]:
# Colab-only step: mount Google Drive under /content/drive so the dataset can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the marketing CSV from the mounted Drive into `df`, the frame every later cell reads.
df = pd.read_csv('/content/drive/MyDrive/initial/notebook/marketing.csv')

In [None]:
# Column names, non-null counts and dtypes — shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2010 entries, 0 to 2009
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   CustomerID           2010 non-null   int64  
 1   Age                  2010 non-null   int64  
 2   Gender               2010 non-null   object 
 3   Income               1931 non-null   float64
 4   CampaignChannel      1856 non-null   object 
 5   CampaignType         2010 non-null   object 
 6   AdSpend              2010 non-null   object 
 7   ClickThroughRate     2010 non-null   float64
 8   ConversionRate       2010 non-null   float64
 9   WebsiteVisits        2010 non-null   int64  
 10  PagesPerVisit        2010 non-null   float64
 11  TimeOnSite           1931 non-null   float64
 12  SocialShares         2010 non-null   int64  
 13  EmailOpens           2010 non-null   int64  
 14  EmailClicks          2010 non-null   int64  
 15  PreviousPurchases    2010 non-null   i

<H1>Statistics

In [None]:
#non_parametric

def numeric_nonparm_test(test_type="one-sample", *groups, alpha=0.05, popmedian=None):
    """Run a non-parametric hypothesis test with SciPy and print a JSON report.

    Parameters
    ----------
    test_type : str
        Which test to run:
        "one-sample"        -> Wilcoxon signed-rank test on ``groups[0]``.
        "two-independent"   -> two-sided Mann-Whitney U test (exactly two groups).
        "multi-independent" -> Kruskal-Wallis H test (two or more groups).
        "variance-test"     -> Friedman chi-square test (three or more groups).
                               Despite the key name, this is the call that is made;
                               it is not a variance-homogeneity test.
    *groups : array-like
        The samples to test, one positional argument per group.
    alpha : float, keyword-only
        Significance level used for the decision.
    popmedian : float, optional, keyword-only
        Hypothesised median for the one-sample test. When omitted, the sample's
        own median is used, which keeps the previous behaviour.

    Returns
    -------
    None
        The report (test type, test name, statistic, p-value, decision) is printed.
        The decision is "Rejected" when p < alpha, "Failed" (to reject) otherwise,
        and "Undetermined" when the p-value is NaN.

    Raises
    ------
    ValueError
        If ``test_type`` is unknown or the number of groups does not fit the test.
    """
    result = {"test type":test_type, "test": None, "statistic": None, "p-value": None, "decision": None}

    if test_type == "one-sample":
        if len(groups) < 1:
            raise ValueError("this needs one group!")
        data = np.asarray(groups[0])
        # Wilcoxon tests whether the differences are centred on zero, so shift by the reference median.
        center = np.median(data) if popmedian is None else popmedian
        stat, p_value = stats.wilcoxon(data - center)
        result["test"] = "Wilcoxon"

    elif test_type == "two-independent":
        if len(groups) != 2:
            raise ValueError("this is only for two groups!")
        group1, group2 = groups
        stat, p_value = stats.mannwhitneyu(group1, group2, alternative="two-sided")
        result["test"] = "Man-Whitney-U"

    elif test_type == "multi-independent":
        if len(groups) < 2:
            raise ValueError("this needs at least two groups!")
        stat, p_value = stats.kruskal(*groups)
        result["test"] = "Kruskal"

    elif test_type == "variance-test":
        # SciPy's Friedman test refuses fewer than three samples, so fail early with a clear message.
        if len(groups) < 3:
            raise ValueError("this needs at least three groups!")
        stat, p_value = stats.friedmanchisquare(*groups)
        result["test"] = "Friedman"

    else:
        # Previously an unknown type fell through with p-value 0 and was reported as "Rejected".
        raise ValueError(f"unknown test_type: {test_type!r}")

    result["statistic"] = stat
    result["p-value"] = p_value

    if np.isnan(p_value):
        # No valid test was computed (e.g. NaNs in the input); do not present this as a test outcome.
        result["decision"] = "Undetermined"
    elif p_value < alpha:
        result["decision"] = "Rejected"
    else:
        result["decision"] = "Failed"

    separator = '=' * 48
    print(separator)
    print(json.dumps(result, indent=3, sort_keys=False, default=str)) # using json.dumps for prettier printing
    print(separator)

In [None]:
# One-sample Wilcoxon run on the customers' ages.
data = df.loc[:, "Age"].to_numpy()
numeric_nonparm_test("one-sample", data)

{
   "test type": "one-sample",
   "test": "Wilcoxon",
   "statistic": 955122.5,
   "p-value": 0.9721126195922359,
   "decision": "Failed"
}


In [None]:
# Ages split by conversion outcome: group 1 converted, group 2 did not.
data_group1 = df.loc[df["Conversion"] == 1, "Age"].to_numpy()
data_group2 = df.loc[df["Conversion"] == 0, "Age"].to_numpy()

# Compare the two independent samples without assuming normality.
numeric_nonparm_test("two-independent", data_group1, data_group2)

{
   "test type": "two-independent",
   "test": "Man-Whitney-U",
   "statistic": 219266.5,
   "p-value": 0.9275457449394919,
   "decision": "Failed"
}


The function also supports comparisons across more than two groups when needed:

pass `"multi-independent"` (Kruskal–Wallis) or `"variance-test"` (Friedman) as the `test_type`.

In [None]:
#parametric
def numeric_parametric_test(test_type="one-sample", *groups, alpha=0.05, popmean=None):
    """Run a parametric hypothesis test with SciPy and print a JSON report.

    Parameters
    ----------
    test_type : str
        Which test to run:
        "one-sample"        -> one-sample t-test on ``groups[0]``.
        "two-independent"   -> independent two-sample t-test, equal variances assumed.
        "paired"            -> paired (related-samples) t-test on two equally long groups.
        "non-equal-var"     -> Welch's t-test (``equal_var=False``).
        "multi-independent" -> one-way ANOVA (two or more groups).
    *groups : array-like
        The samples to test, one positional argument per group.
    alpha : float, keyword-only
        Significance level used for the decision.
    popmean : float, optional, keyword-only
        Hypothesised population mean for the one-sample test. When omitted, the
        sample's own mean is used (previous behaviour), which always yields
        statistic 0 and p-value 1.

    Returns
    -------
    None
        The report (test type, test name, statistic, p-value, decision) is printed.
        The decision is "Rejected" when p < alpha, "Failed" (to reject) otherwise,
        and "Undetermined" when the p-value is NaN.

    Raises
    ------
    ValueError
        If ``test_type`` is unknown or the number of groups does not fit the test.
    """
    result = {"test type":test_type, "test": None, "statistic": None, "p-value": None, "decision": None}

    if test_type == "one-sample":
        if len(groups) < 1:
            raise ValueError("this needs one group!")
        data = np.asarray(groups[0])
        reference = np.mean(data) if popmean is None else popmean
        stat, p_value = stats.ttest_1samp(data, reference)
        result["test"] = "Ttest"

    elif test_type == "two-independent":
        if len(groups) != 2:
            raise ValueError("this is only for two groups!")
        group1, group2 = groups
        stat, p_value = stats.ttest_ind(group1, group2, equal_var=True)
        result["test"] = "Ttest"

    elif test_type == "paired":
        if len(groups) != 2:
            raise ValueError("this is only for two groups!")
        group1, group2 = groups
        # Compare the first sample with the second one; the old code passed group1 twice,
        # so every difference was zero and the result was always NaN.
        stat, p_value = stats.ttest_rel(group1, group2)
        result["test"] = "Ttest"

    elif test_type == "non-equal-var":
        if len(groups) != 2:
            raise ValueError("this is only for two groups!")
        group1, group2 = groups
        stat, p_value = stats.ttest_ind(group1, group2, equal_var=False)
        result["test"] = "Welch"

    elif test_type == "multi-independent":
        if len(groups) < 2:
            raise ValueError("this needs at least two groups!")
        stat, p_value = stats.f_oneway(*groups)
        result["test"] = "One-Way ANOVA"

    else:
        # Previously an unknown type fell through with p-value 0 and was reported as "Rejected".
        raise ValueError(f"unknown test_type: {test_type!r}")

    result["statistic"] = stat
    result["p-value"] = p_value

    if np.isnan(p_value):
        # No valid test was computed (e.g. NaNs in the input); do not present this as a test outcome.
        result["decision"] = "Undetermined"
    elif p_value < alpha:
        result["decision"] = "Rejected"
    else:
        result["decision"] = "Failed"

    separator = '=' * 48
    print(separator)
    print(json.dumps(result, indent=3, sort_keys=False, default=str)) # using json.dumps for prettier printing
    print(separator)

In [None]:
# One-sample t-test run on the customers' ages.
data = df.loc[:, "Age"].to_numpy()
numeric_parametric_test("one-sample", data)

{
   "test type": "one-sample",
   "test": "Ttest",
   "statistic": 0.0,
   "p-value": 1.0,
   "decision": "Failed"
}


In [None]:
# Ages split by conversion outcome: group 1 converted, group 2 did not.
data_group1 = df.loc[df["Conversion"] == 1, "Age"].to_numpy()
data_group2 = df.loc[df["Conversion"] == 0, "Age"].to_numpy()

# Independent two-sample t-test (equal variances assumed by the helper).
numeric_parametric_test("two-independent", data_group1, data_group2)

{
   "test type": "two-independent",
   "test": "Ttest",
   "statistic": 0.09012017542473374,
   "p-value": 0.9282007047314531,
   "decision": "Failed"
}


In [None]:
# A paired t-test needs two numeric measurements taken on the same rows.
# 'CampaignChannel' is a categorical (object) column, so it cannot serve as the second sample.
# TODO(review): 'PagesPerVisit' is only a numeric stand-in; swap in the real "after" measurement.
# Rows with a missing value in either column are dropped together, so the pairs stay aligned
# and the NaNs in 'TimeOnSite' do not turn the p-value into NaN.
paired = df[["TimeOnSite", "PagesPerVisit"]].dropna()
before = paired["TimeOnSite"].values
after  = paired["PagesPerVisit"].values

numeric_parametric_test("paired", before, after)

{
   "test type": "paired",
   "test": "Ttest",
   "statistic": NaN,
   "p-value": NaN,
   "decision": "Failed"
}


In [None]:
# Ages split by conversion outcome: group 1 converted, group 2 did not.
data_group1 = df.loc[df["Conversion"] == 1, "Age"].to_numpy()
data_group2 = df.loc[df["Conversion"] == 0, "Age"].to_numpy()

# Same comparison as the two-sample t-test, but without assuming equal variances.
numeric_parametric_test("non-equal-var", data_group1, data_group2)

{
   "test type": "non-equal-var",
   "test": "Welch",
   "statistic": 0.08777979604930926,
   "p-value": 0.9301074137259193,
   "decision": "Failed"
}


In [None]:
# List the distinct campaign types, used to pick the groups for the ANOVA cell below.
df["CampaignType"].unique()

array(['Consideration', 'Awareness', 'Retention', 'Conversion'],
      dtype=object)

In [None]:
# Build one age sample per campaign type and compare them with a one-way ANOVA.
# NOTE(review): the 'Conversion' campaign type is not part of this comparison — confirm that is intended.
campaign_types = ['Consideration', 'Awareness', 'Retention']
groups = [df.loc[df["CampaignType"] == campaign, "Age"].to_numpy() for campaign in campaign_types]
group1, group2, group3 = groups
numeric_parametric_test("multi-independent", *groups)

{
   "test type": "multi-independent",
   "test": "One-Way ANOVA",
   "statistic": 1.867051584635893,
   "p-value": 0.1549456257893048,
   "decision": "Failed"
}
