## Tests on Mean
---

In [33]:
import pandas as pd
import numpy as np
from scipy import stats 
import statsmodels.api as sm
from scipy.stats import chi2, f, norm, t

In [34]:
# Import data in a data frame
df = pd.read_csv(r"C:\Users\bahad\Desktop\Courses\Data Analysis\Advertising_Data.csv")
df.head()

Unnamed: 0,TV,Billboards,Google_Ads,Social_Media,Influencer_Marketing,Affiliate_Marketing,Product_Sold
0,281.42,538.8,123.94,349.3,242.77,910.1,7164.0
1,702.97,296.53,558.13,180.55,781.06,132.43,5055.0
2,313.14,295.94,642.96,505.71,438.91,464.23,6154.0
3,898.52,61.27,548.73,240.93,278.96,432.27,5480.0
4,766.52,550.72,651.91,666.33,396.33,841.93,9669.0


## 1 sample T test
---

### Q: *" 1 sample T test : Is average product sales equal to 7000 because of advertisement?"*


In [35]:
#mean on product sales (mu0) = 7000

mu0 = 7000

# Level of significance
a = 0.05

# Number of values
n = len(df)

# Sample mean
tmp =  df.mean()
ybar= tmp['Product_Sold'] #7031

# Sample std
tmp = df.std(ddof=1)
Sy = tmp['Product_Sold']  #1703

print(f'Data set composed of n = {n} values')
print(f'Sample mean = {ybar:.2f}, Sample std = {Sy:.2f}')

Data set composed of n = 300 values
Sample mean = 7031.52, Sample std = 1703.61


In [36]:
# Compute value of Test Statistoc (TS)
tbar = (ybar - mu0) / (Sy/np.sqrt(n))

# Compute p-value = 2*P(T>|tbar|)
pvalue = 2*(1 - t.cdf(np.abs(tbar), df=n-1))

In [37]:
# Print results
print(f' TS value     T = {tbar:.3f}')
print(f'        p-value = {pvalue:.3f}')
print(f'sig. lev. alpha = {a:.3f}')

 TS value     T = 0.320
        p-value = 0.749
sig. lev. alpha = 0.050


#### Q: *"Is average product sold equal to 7000 units?"*
#### A: *"We failed to reject our null hypothesis(H_0), at level of significance $\alpha = 0.05$ (p-value=0.749), Hence average product sold is 7000 units" *

## 2 sample T test
---

### Q: *" 2 Sample T Test: Is average amount spend on  of Social Media Advertisement is same as that of TV?"*
---

In [38]:
# Compute the correlation matrix
correlation = df[['TV', 'Social_Media']].corr()
print(correlation)

                    TV  Social_Media
TV            1.000000     -0.038993
Social_Media -0.038993      1.000000


#### we can conclude that TV and Social_Media are not correlated.
#### We have 300 data, hence as per Central limit theorem, TV and Social Media are independent. 
____
#### So we can run the 2 sample T test on these two.

In [39]:
# H0: mean of social media (mu_sm) = mean of tv (mu_tv)

# Level of significance
a = 0.05

# Sample mean
tmp = df.mean()
sm_bar = tmp['Social_Media']
tv_bar = tmp['TV']

# Sample std
tmp = df.std(ddof=1)
Std_sm = tmp['Social_Media']
Std_tv = tmp['TV']

print(f'Data set composed of n = {n} values')
print(f'Social Media: Sample mean = {sm_bar:.2f}, Sample std = {Std_sm:.2f}')
print(f'TV: Sample mean = {tv_bar:.2f}, Sample std = {Std_tv:.2f}')

# pooled sample var
SV_p = ((n-1)*Std_sm**2 + (n-1)*Std_tv**2) / ((2*n)-2) 

# pooled sample std
Std_p = np.sqrt(SV_p)

Data set composed of n = 300 values
Social Media: Sample mean = 489.80, Sample std = 273.88
TV: Sample mean = 517.43, Sample std = 288.11


In [40]:
# Compute value of Test Statistoc (TS)
tbar2 = (sm_bar - tv_bar) / (Std_p * np.sqrt( 1/n + 1/n))

# Compute p-value = 2*P(T>|tbar|)
pvalue2 = 2*(1 - t.cdf(np.abs(tbar2), df=2*n-2))

print(f'        p-value = {pvalue2}')
print(f'sig. lev. alpha = {a:.3f}')

        p-value = 0.2290984136361578
sig. lev. alpha = 0.050


#### Q: *"Is average amount spend on  of Social Media Advertisement is different from that of TV?"*
#### A: *"We failed to reject our null hypothesis(H_0), at level of significance $\alpha = 0.05$ (p-value=0.229), Hence the average amount spend on  of Social Media Advertisement is same as that of TV? *

## Tests on Variance
---

## 1 Sample Test: Chi Squared Method
---

### Q: *"Is variance of product sold less than 2000000 unit$^2$?"*
#### 1 Sample Test: chi Squared Method
---

In [41]:
# Null hypothesis H0: sigma2 <= 2000000  H1: sigma2 > 2000000

var0 = 2000000

# Level of significance
a = 0.05

# Number of values
n = len(df)

# Sample mean
tmp =  df.mean()
ybar= tmp['Product_Sold'] #7031

# Sample std
tmp = df.var(ddof=1)
Sy = tmp['Product_Sold']  #1703

print(f'Data set composed of n = {n} values')
print(f'Sample mean = {ybar:.2f}, Sample var = {Sy:.2f}')

Data set composed of n = 300 values
Sample mean = 7031.52, Sample var = 2902303.90


In [42]:
# Compute value of Test Statistic (TS)
c = (n - 1) * Sy / var0

# Compute p-value = P(chi2>c) with n-1 degrees of freedom
pvalue = 1 - chi2.cdf(c, df=n-1)

In [43]:
# Print results
print(f'TS value   chi = {c:7.3f}')
print(f'p-value = {pvalue: 7.5f}')
print(f'sig. lev. alpha = {a:7.3f}')

TS value   chi = 433.894
p-value =  0.00000
sig. lev. alpha =   0.050


#### *" Hence, we reject Null Hypothesis at level of significance $\alpha = 0.05$ and conclude that the variance of product sold is greater than 1000000 unit$^2$?"*

##  2 Sample Chi Squared Test: F-Test 
---

### Q: *"Is variance of amount spent on social_media different from that of TV?"*
---

In [44]:
# H0: Variance of amount spend on social media (varx) = Variance of amount spend on TV (vary) ;
# H1:  Variance of amount spend on social media (varx) != Variance of amount spend on TV (vary)

# Level of significance
a = 0.05

# Number of values:
n = len(df)

tmp = df.var(ddof=1)

# Sample var of Social_Media
sv_sm = tmp['Social_Media']

# Sample var of TV
sv_tv = tmp['TV']

print(f'Data set composed of n = {n}')
print(f'Social_Media: Sample var = {sv_sm:.2f}')
print(f'TV: Sample var = {sv_tv:.2f}')

Data set composed of n = 300
Social_Media: Sample var = 75012.40
TV: Sample var = 83010.13


In [45]:
# Compute value of Test Statistic (TS)
v = sv_sm/sv_tv

# Compute P(F<v)
p = f.cdf(v, n-1, n-1)

# Compute p-value = 2 * min{P(F<v), 1-P(F<v)) with n-1,m-1 degrees of freedom
pvalue = 2*min([p, 1-p])

In [46]:
# Print results
print(f'TS value : T = {v:.3f}')
print(f'p-value = {pvalue:.6f}')
print(f'sig. lev. alpha = {a:.3f}')

TS value : T = 0.904
p-value = 0.381584
sig. lev. alpha = 0.050


#### Q: *"Is variance of amount spent on social_media different from that of TV??"*
#### A: *"Hence, we fail to reject Null Hypthesis at level of significance $\alpha = 0.05$ (p-value=0.381584)"*
#### Interpretation: The variance of amount spent on social media is same as that of TV. 
---

## Proportionality test : Z-test
---

### Q: *"If amount spent on social media > 500 then product sold is at least 7000 " ?*
---

In [47]:
# Ho: (amount spent on social media > 500 ) px <= py  H1: px > py (amount spent on social media < 500 )

# Take only data related to Product_Sold >= 7000

PS_more_than_7000 = df[df['Product_Sold'] >= 7000]

# Take only data related to social media <= 500 within Product_Sold >= 7000

SM_less_than500 = PS_more_than_7000[PS_more_than_7000['Social_Media'] <= 500]

# Take only data related to social media > 500 within Product_Sold >= 7000

SM_more_than500 = PS_more_than_7000[PS_more_than_7000['Social_Media'] > 500]

# Number of Product_Sold>7000 in data set
ps_n = len(PS_more_than_7000)

# Number of Social_Media>500 within Product_Sold>7000 in data set
sm_more_n = len(SM_more_than500)

# Number of Social_Media<500 within Product_Sold>7000 in data set
sm_less_n = len(SM_less_than500)


# Estimate prob of Social_Media>500 with Product_Sold>7000 in data set
xbar = sm_more_n/ps_n


# Estimate prob of Social_Media<500 with Product_Sold>7000 in data set
ybar = sm_less_n/ps_n

# Estimate pooled sample proportion
phat = (sm_more_n*xbar + sm_less_n*ybar) / (sm_more_n+sm_less_n)

print(f'Data set composed of  Social_Media>500 in data set = {sm_more_n} ')
print(f'and   Social_Media<500 in data set = {sm_less_n} ')
print(f'prob Social_Media>500 with Product_Sold>7000 : xbar = {xbar:.3f}')
print(f'prob Social_Media<500 with Product_Sold>7000: ybar = {ybar:.3f}')

Data set composed of  Social_Media>500 in data set = 97 
and   Social_Media<500 in data set = 59 
prob Social_Media>500 with Product_Sold>7000 : xbar = 0.622
prob Social_Media<500 with Product_Sold>7000: ybar = 0.378


In [48]:
# Compute value of Test Statistic (TS)
c = (xbar - ybar)/np.sqrt(phat * (1-phat) * (1/sm_more_n + 1/sm_less_n)) 

# Compute p-value = P(Z > c)
pvalue3 = 2*(1 - norm.cdf(np.abs(c)))

# Print results
print(f'TS value : Z = {c:.3f}')
print(f'p-value = {pvalue3:.5f}')
print(f'sig. lev. alpha = {a:.3f}')

TS value : Z = 2.956
p-value = 0.00312
sig. lev. alpha = 0.050


### Q: *"If amount spent on social media > 500 then product sold is at least 7000?"*
### A: *"No, We reject Null Hypothesis at level of significance $\alpha = 0.05$ (p-value=0.0.00312)"*
### Interpretation: The data allow us to conclude that if we spend more than 500 in social media, it doesn't mean that product sold will be more than 7000.
---