<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#SAMPLING-QUESTION-3" data-toc-modified-id="SAMPLING-QUESTION-3-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>SAMPLING QUESTION 3</a></span><ul class="toc-item"><li><span><a href="#Proportional-allocation" data-toc-modified-id="Proportional-allocation-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Proportional allocation</a></span></li><li><span><a href="#Neymann-allocation" data-toc-modified-id="Neymann-allocation-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Neymann allocation</a></span></li><li><span><a href="#Optimal-allocation" data-toc-modified-id="Optimal-allocation-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Optimal allocation</a></span></li></ul></li><li><span><a href="#SAMPLING-QUESTION-4" data-toc-modified-id="SAMPLING-QUESTION-4-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>SAMPLING QUESTION 4</a></span></li><li><span><a href="#SAMPLING-QUESTION-5" data-toc-modified-id="SAMPLING-QUESTION-5-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>SAMPLING QUESTION 5</a></span></li></ul></div>

In [1]:
import pandas as pd
import scipy.stats as stats
import seaborn as sns

In [2]:
data = pd.read_csv("./biostat data.csv")

In [3]:
data.head()

Unnamed: 0,book,review,state,price
0,R Made Easy,Excellent,TX,19.99
1,R For Dummies,Fair,NY,15.99
2,R Made Easy,Excellent,NY,19.99
3,R Made Easy,Poor,FL,19.99
4,Secrets Of R For Advanced Students,Great,Texas,50.0


In [4]:
data.price.mean()

31.287030000000005

In [5]:
def bootstrap_ttest(orig, data: pd.Series, pop_mean, frac = 0.1, boos = 500) -> list:
    
    print("BOOTSTRAPING T TEST".center(55))
    print("-"*55, "\n")
    pop_mean = data.mean()
    info_str = ""
    lst_t = list()
    lst_p = list()
    lst_r = list()
    lst_sm = list()
    lst_sd = list()
    print("No. of BootStraps : ", boos)
    print("Population size   : ", data.count())
    print("Population Mean   : ", round(pop_mean, 5))
    print("Sample Fraction   : ", int(frac*100), "%")
    print("Sample size       : ", int(data.count()*frac))
    print()
    
    for i in range(boos):
        sam = data.sample(frac=frac)
        t_stat, p_val = stats.mannwhitneyu(sam, data)
        lst_sm.append(sam.mean())
        lst_sd.append(sam.std())
        lst_t.append(t_stat)
        lst_p.append(p_val)
        # print(stats.shapiro(sam))
    info_str = info_str + "\033[1m" + "Sam Mean".rjust(10) + "Sam Std".rjust(9) + "T Val".rjust(9) + "   "  +"P Value".rjust(8) + "Result".rjust(9) + "\033[0m"
    for i in range(boos):
        result = "Fail" if lst_p[i] <= 0.05 else "Pass"
        lst_r.append(result)
        info_str = info_str + str( f"{lst_sm[i]:3.4f}").rjust(10) + str(f"{lst_sd[i]:3.4f}").rjust(10) + str(f"{lst_t[i]:3.4f}").rjust(10) + "   " + str(f"{lst_p[i]:3.4f}").rjust(6) + f"{result}".rjust(8)    
        info_str = info_str + "\n" + "\033[1m" + "Result counts" + "\033[0m"
    print(stats.mannwhitneyu(lst_sm, orig))
    print(pd.value_counts(lst_r))
    
    return([list(lst_t), list(lst_p), list(lst_r)])

In [6]:
t_vals, p_vals, r_vals = bootstrap_ttest(orig = data.price, data = data.sample(frac=0.10).price, pop_mean=31.287030000000005, frac=0.1, boos=500)

                  BOOTSTRAPING T TEST                  
------------------------------------------------------- 

No. of BootStraps :  500
Population size   :  200
Population Mean   :  32.65245
Sample Fraction   :  10 %
Sample size       :  20

MannwhitneyuResult(statistic=562840.0, pvalue=1.0917908601248771e-05)
Pass    486
Fail     14
Name: count, dtype: int64


# SAMPLING QUESTION 3

In [7]:
import numpy as np

In [8]:
Ni = np.array([500, 400, 300, 200, 100])
ymean = np.array([80, 100, 120, 150, 200])
Si = np.array([5.25, 10.75, 15.5, 20.5, 25])
Ci = np.array([100, 100, 150, 175, 200])
C0 = 10000
C = 50000
N = sum(Ni)
Wi = Ni/N

In [9]:
Ni

array([500, 400, 300, 200, 100])

In [10]:
Ci

array([100, 100, 150, 175, 200])

## Proportional allocation

In [11]:
Ni*Ci/N

array([33.33333333, 26.66666667, 30.        , 23.33333333, 13.33333333])

In [12]:
sum(Ni*Ci/N)

126.66666666666666

In [13]:
n = np.floor(40000/sum(Ni*Ci/N), )
n = int(n)
n

315

In [14]:
ni = n*Ni/N # sample sizes for proportional allocation
ni

array([105.,  84.,  63.,  42.,  21.])

In [15]:
(Wi[1]**2) * ( (1/ni[1]) - (1/Ni[1]) ) * (Si[1]**2)

0.07728624338624338

In [16]:
var = np.sum([Wi[i]**2*((1/ni[i]) - (1/Ni[i]))*Si[i]**2 for i in range(len(Ni))])
var # Variance

0.465859656084656

## Neymann allocation

In [17]:
# for neymann allocation
Ni*Si

array([2625., 4300., 4650., 4100., 2500.])

In [18]:
sum(Ni*Si)

18175.0

In [19]:
Ni*Si*Ci

array([262500., 430000., 697500., 717500., 500000.])

In [20]:
sum(Ni*Si*Ci)

2607500.0

In [21]:
n = 40000*sum(Ni*Si) / (sum(Ni*Si*Ci))
n

278.8111217641419

In [22]:
ni = n*Ni*Si/sum(Ni*Si) # sample sizes for neymann allocation
ni

array([40.26845638, 65.96356663, 71.33269415, 62.89549377, 38.35091083])

In [23]:
var = np.sum([Wi[i]**2*((1/ni[i]) - (1/Ni[i]))*Si[i]**2 for i in range(len(Ni))])
var # Variance

0.4027340277777778

## Optimal allocation

In [24]:
# for optimal allocation
Ni*Si

array([2625., 4300., 4650., 4100., 2500.])

In [25]:
Ni*Si/np.sqrt(Ci)

array([262.5       , 430.        , 379.67091013, 309.93086787,
       176.7766953 ])

In [26]:
sum(Ni*Si/np.sqrt(Ci))

1558.878473295596

In [27]:
Ni*Si*np.sqrt(Ci)

array([26250.        , 43000.        , 56950.63651971, 54237.90187682,
       35355.33905933])

In [28]:
sum(Ni*Si*np.sqrt(Ci))

215793.87745586038

In [29]:
n = 40000 * sum(Ni*Si/np.sqrt(Ci)) / sum(Ni*Si*np.sqrt(Ci))
n

288.95694199933126

In [30]:
ni = ( n * Ni * Si / np.sqrt(Ci) ) / sum(Ni*Si/np.sqrt(Ci)) # sample sizes for optimum allocation
ni

array([48.6575436 , 79.70569046, 70.37658614, 57.44942749, 32.76769432])

In [31]:
var = np.sum([Wi[i]**2*((1/ni[i]) - (1/Ni[i]))*Si[i]**2 for i in range(len(Ni))])
var # Variance

0.3935749727492765

# SAMPLING QUESTION 4

In [32]:
Ni = [1411, 4705, 2558, 14997]
N = sum(Ni)
n1 = [43,84,98,0,10,44,0,124,13,0]
n2 = [50, 147, 62, 87, 84, 158, 170, 104, 56, 160]
n3 = [228, 262, 110, 232, 139, 178, 334, 0, 63, 220]
n4 = [17, 34, 25, 34, 36, 0, 25, 7, 15, 31]

In [33]:
ymean = np.array([np.mean(n1), np.mean(n2), np.mean(n3), np.mean(n4)])
ymean

array([ 41.6, 107.8, 176.6,  22.4])

In [34]:
sub_pop_tot = Ni*ymean
sub_pop_tot

array([ 58697.6, 507199. , 451742.8, 335932.8])

In [35]:
Ni*ymean**2

array([ 2441820.16      , 54676052.2       , 79777778.47999999,
        7524894.72      ])

In [36]:
ni = np.array([10]*4)

In [37]:
pop_tot = sum(sub_pop_tot)
pop_tot

1353572.2

In [38]:
si = np.array([np.std(i, ddof=1) for i in [n1, n2, n3, n4]])

In [39]:
var_st = sum(np.sqrt(Ni*(Ni - ni)*si**2 / ni ))
var_st

228927.89281328587

In [40]:
ymean

array([ 41.6, 107.8, 176.6,  22.4])

In [41]:
var_srs = (1/50 - 1/N) * N * (  np.sum(Ni*si**2)- np.sum(Ni*si**2/ni) + np.sum(Ni*ymean**2) - np.sum((Ni*ymean))**2  )
var_srs

-865462268663397.9

In [42]:
sd_srs = np.sqrt(var_srs)

  sd_srs = np.sqrt(var_srs)


In [43]:
sd_srs

nan

In [44]:
var_srs/var_st

-3780501615.71736

# SAMPLING QUESTION 5

In [45]:
ques5 = pd.read_excel("./question 5.xlsx")

In [46]:
ques5

Unnamed: 0,01 to 10,11 to 20,21-30,31-40,41-50,51-60,61-70,71-80
0,26,16,27,37,4,36,20,21
1,28,9,20,14,5,20,21,26
2,11,22,25,14,11,43,15,16
3,16,26,39,24,9,27,14,18
4,7,17,24,18,25,20,13,11
5,22,39,25,17,16,21,9,19
6,44,21,18,14,13,18,25,27
7,26,14,44,38,22,19,17,29
8,31,40,55,36,18,27,7,31
9,26,30,39,29,9,30,30,29


In [47]:
alldt = list()

In [48]:
ques5['01 to 10'][9]

26

In [49]:
for j in range(10):
    alldt = []
    for i in ques5.columns:
        alldt.append(ques5[i][j])
    print(str(np.mean(alldt)).ljust(10), str((np.mean(alldt) - 22.7375)).ljust(20), str((np.mean(alldt) - 22.7375)**2).ljust(20) )

23.375     0.6374999999999993   0.4064062499999991  
17.875     -4.862500000000001   23.643906250000008  
19.625     -3.1125000000000007  9.687656250000005   
21.625     -1.1125000000000007  1.2376562500000017  
16.875     -5.862500000000001   34.36890625000001   
21.0       -1.7375000000000007  3.0189062500000023  
22.5       -0.2375000000000007  0.05640625000000034 
26.125     3.3874999999999993   11.475156249999996  
30.625     7.887499999999999    62.21265624999999   
27.75      5.012499999999999    25.125156249999993  


In [50]:
alldt = []
for i in ques5.columns:
    alldt.extend(ques5[i])
Ymean = np.mean(alldt)
Ymean

22.7375

In [51]:
N = 80

In [52]:
sam = np.array(ques5.loc[9])
sam

array([26, 30, 39, 29,  9, 30, 30, 29], dtype=int64)

In [53]:
samMean = np.mean(sam)
samMean

27.75

In [54]:
ymean = list()
for i in range(10):
    print(np.mean(ques5.loc[i]))
    ymean.append(np.mean(ques5.loc[i]))
ymean

23.375
17.875
19.625
21.625
16.875
21.0
22.5
26.125
30.625
27.75


[23.375, 17.875, 19.625, 21.625, 16.875, 21.0, 22.5, 26.125, 30.625, 27.75]

In [55]:
ymean = np.array(ymean)
Ybar = np.mean(ymean)
Ybar

22.7375

In [56]:
ymean

array([23.375, 17.875, 19.625, 21.625, 16.875, 21.   , 22.5  , 26.125,
       30.625, 27.75 ])

In [57]:
ymean - Ybar

array([ 0.6375, -4.8625, -3.1125, -1.1125, -5.8625, -1.7375, -0.2375,
        3.3875,  7.8875,  5.0125])

In [58]:
[round(i, 4) for i in (ymean - Ybar)**2]

[0.4064,
 23.6439,
 9.6877,
 1.2377,
 34.3689,
 3.0189,
 0.0564,
 11.4752,
 62.2127,
 25.1252]

In [59]:
q5tb1 = pd.DataFrame({"sm mean" : ymean,
            "sm mean - pop mean" : ymean - Ybar,
              "sm mean - pop mean w sq" : [round(i, 4) for i in (ymean - Ybar)**2]})
q5tb1

Unnamed: 0,sm mean,sm mean - pop mean,sm mean - pop mean w sq
0,23.375,0.6375,0.4064
1,17.875,-4.8625,23.6439
2,19.625,-3.1125,9.6877
3,21.625,-1.1125,1.2377
4,16.875,-5.8625,34.3689
5,21.0,-1.7375,3.0189
6,22.5,-0.2375,0.0564
7,26.125,3.3875,11.4752
8,30.625,7.8875,62.2127
9,27.75,5.0125,25.1252


In [60]:
popTotal = N * samMean
popTotal

2220.0

In [61]:
varSys = 80**2 * (np.mean(q5tb1['sm mean - pop mean w sq']))
varSys

109589.12

In [62]:
seSys = np.sqrt(varSys)
seSys

331.0424746161737

In [63]:
S2 = (1/79)* (np.sum([i**2 for i in alldt]) - np.sum(alldt)**2/80)
S2

104.42389240506333

In [64]:
varSRS = 80**2 * (1/8 - 1/80)*S2
varSRS

75185.2025316456

In [65]:
re = varSRS/varSys*100
re

68.60644791348412

In [66]:
sjlst = list()
for i in range(10):
    sjlst.append(np.var(ques5.loc[i], ddof=1))
    print(np.var(ques5.loc[i], ddof=1))

115.98214285714286
63.839285714285715
113.69642857142857
88.26785714285714
39.267857142857146
75.71428571428571
99.14285714285714
109.55357142857143
205.98214285714286
71.35714285714286


In [67]:
sjlst

[115.98214285714286,
 63.839285714285715,
 113.69642857142857,
 88.26785714285714,
 39.267857142857146,
 75.71428571428571,
 99.14285714285714,
 109.55357142857143,
 205.98214285714286,
 71.35714285714286]

In [68]:
s2swt = np.sum(sjlst)
s2swt

982.8035714285716

In [69]:
k = 8

In [70]:
N**2 * (k - 1) * s2swt/(80)

550370.0000000001

In [71]:
sj = list()
for i in range(10):
    sj.append(np.var(ques5.loc[i], ddof=1))
    print(sj[i])

115.98214285714286
63.839285714285715
113.69642857142857
88.26785714285714
39.267857142857146
75.71428571428571
99.14285714285714
109.55357142857143
205.98214285714286
71.35714285714286


In [72]:
sj = list()
for i in ques5.columns:
    sj.append(np.var(ques5[i], ddof=1))

In [73]:
for i in sj:
    print(i*9)

1002.1
968.4000000000001
1296.4
918.9000000000001
439.6
616.9000000000001
450.90000000000003
398.09999999999997


In [74]:
for i in sj:
    print(i)

111.34444444444445
107.60000000000001
144.04444444444445
102.10000000000001
48.84444444444445
68.54444444444445
50.1
44.23333333333333


In [75]:
np.sum(sj)/8

84.60138888888889

In [76]:
80*9*84.601388


60912.99936

In [77]:
100*60912.99936/109588.98

55.583142903602166