# Z TEST

In [194]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 150)


In [195]:
tips = sns.load_dataset('tips')
tips.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


## Analisis data

In [196]:
# Create weekday from tips dataset
tips['weekday_weekend'] = np.where(tips['day'].isin(['Sat', 'Sun']), 'Weekend', 'Weekdays')

# Create tip group apakah diatas rata rata dan dibawah rata rata
tips['tip_given'] = np.where(tips['tip'] > tips['tip'].mean(), 'Generous', 'Stingy')

# Create portion size apakah size diatas rata rata dan dibawah rata rata
tips['portion_size'] = np.where(tips['size'] > tips['size'].mean(), 'Big', 'Small')
tips

N = len(tips)
e = 0.05
n = round(N/(1+(N*(e**2))))

tipsSample = tips.sample(n)
display(n, tipsSample.head())

152

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,weekday_weekend,tip_given,portion_size
142,41.19,5.0,Male,No,Thur,Lunch,5,Weekdays,Generous,Big
242,17.82,1.75,Male,No,Sat,Dinner,2,Weekend,Stingy,Small
106,20.49,4.06,Male,Yes,Sat,Dinner,2,Weekend,Generous,Small
147,11.87,1.63,Female,No,Thur,Lunch,2,Weekdays,Stingy,Small
53,9.94,1.56,Male,No,Sun,Dinner,2,Weekend,Stingy,Small


## Cek proporsi dari masing masing kolom pada populasi

In [197]:
# Cek proporsi dari masing masing kolom pada populasi
for i in ['sex', 'smoker', 'time', 'weekday_weekend', 'tip_given', 'portion_size']:
    display(pd.DataFrame(tips[i].value_counts(normalize=True)*100))

Unnamed: 0_level_0,proportion
sex,Unnamed: 1_level_1
Male,64.344262
Female,35.655738


Unnamed: 0_level_0,proportion
smoker,Unnamed: 1_level_1
No,61.885246
Yes,38.114754


Unnamed: 0_level_0,proportion
time,Unnamed: 1_level_1
Dinner,72.131148
Lunch,27.868852


Unnamed: 0_level_0,proportion
weekday_weekend,Unnamed: 1_level_1
Weekend,66.803279
Weekdays,33.196721


Unnamed: 0_level_0,proportion
tip_given,Unnamed: 1_level_1
Stingy,50.409836
Generous,49.590164


Unnamed: 0_level_0,proportion
portion_size,Unnamed: 1_level_1
Small,65.57377
Big,34.42623


## Sampling

In [198]:
# Sampling


In [199]:
# cek apakah banyak sample sudah cukup besar ? p_0*n > 15
0.27 * 152 

41.040000000000006

## Z-Test Single Population Proportion

`Asumsi` : Menguji apakah proporsi customer `Male` = 63%? 

`Hipotesis`

$H_{0} : p = 63$%

$H_{a} : p > 63$%

`Signifikansi`/ Tingkat Resiko ($\alpha$)

$\alpha = 0.05$

Pengambilan kesimpulan :

- p-value $\leq \alpha$ : Reject $H_{0}$
- p-value $> \alpha$ : Failed to Reject $H_{0}$ / Accept $H_{0}$

In [200]:
from statsmodels.stats.proportion import proportions_ztest
phat = tipsSample['sex'].value_counts(normalize=True).reset_index()['proportion'][0]

z_stat, p_value = proportions_ztest(phat*n, n, 0.63, alternative='larger')

if p_value < 0.05:
    print(f"P-value/2 = {p_value}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p > 63%)")
else:
    print(f"P-value = {p_value}, karena P-value > 0.05, maka kita gagal bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p <= 63%)")


P-value = 0.2342526561228287, karena P-value > 0.05, maka kita gagal bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p <= 63%)


In [201]:
# Lakukan uji proporsi yang sama untuk menduga 
# 1. Proporsi customer tidak perokok = 59%
# 2. Proporsi customer dinner = 70%
# 3. Proporsi customer weekend = 65%
# 4. Proporsi customer stingy = 50%
# 5. Proporsi customer small portion size = 60%



`Asumsi` : customer tidak perokok = 59%

`Hipotesis`

$H_{0} : p = 59%$%

$H_{a} : p \neq 59%$%

`Signifikansi`/ Tingkat Resiko ($\alpha$)

$\alpha = 0.05$

Pengambilan kesimpulan :

- p-value $\leq \alpha$ : Reject $H_{0}$
- p-value $> \alpha$ : Failed to Reject $H_{0}$ / Accept $H_{0}$

In [202]:
from statsmodels.stats.proportion import proportions_ztest
phat = tipsSample['smoker'].value_counts(normalize=True).reset_index()['proportion'][0]

z_stat, p_value = proportions_ztest(phat*n, n, 0.59, alternative='two-sided')

if p_value < 0.05:
    print(f"P-value = {p_value}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p ≠  59%)")
else:
    print(f"P-value = {p_value}, karena P-value > 0.05, maka kita gagal menolak H0 (Kita tidak punya cukup bukti untuk menolak H0 / p = 59%)")


P-value = 0.0776627646173102, karena P-value > 0.05, maka kita gagal menolak H0 (Kita tidak punya cukup bukti untuk menolak H0 / p = 59%)


In [203]:
from statsmodels.stats.proportion import proportions_ztest
phat = tipsSample['time'].value_counts(normalize=True).reset_index()['proportion'][0]

z_stat, p_value = proportions_ztest(phat*n, n, 0.70, alternative='two-sided')

if p_value < 0.05:
    print(f"P-value = {p_value}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p ≠  70%)")
else:
    print(f"P-value = {p_value}, karena P-value > 0.05, maka kita gagal menolak H0 (Kita tidak punya cukup bukti untuk menolak H0 / p =  70%)")


P-value = 0.5137658500052931, karena P-value > 0.05, maka kita gagal menolak H0 (Kita tidak punya cukup bukti untuk menolak H0 / p =  70%)


In [204]:
from statsmodels.stats.proportion import proportions_ztest
phat = tipsSample['weekday_weekend'].value_counts(normalize=True).reset_index()['proportion'][0]

z_stat, p_value = proportions_ztest(phat*n, n, 0.65, alternative='two-sided')

if p_value < 0.05:
    print(f"P-value = {p_value}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p ≠  65%)")
else:
    print(f"P-value = {p_value}, karena P-value > 0.05, maka kita gagal menolak H0 (Kita punya tidak cukup bukti untuk menolak H0 / p = 65%)")


P-value = 0.3642073766710001, karena P-value > 0.05, maka kita gagal menolak H0 (Kita punya tidak cukup bukti untuk menolak H0 / p = 65%)


In [205]:
from statsmodels.stats.proportion import proportions_ztest
phat = tipsSample['portion_size'].value_counts(normalize=True).reset_index()['proportion'][0]

z_stat, p_value = proportions_ztest(phat*n, n, 0.60, alternative='two-sided')

if p_value < 0.05:
    print(f"P-value = {p_value}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p ≠  60%)")
else:
    print(f"P-value = {p_value}, karena P-value > 0.05, maka kita gagal menolak H0 (Kita punya tidak cukup bukti untuk menolak H0 / p = 60%)")


P-value = 0.015439339230518411, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / p ≠  60%)


# Chi-Squared

`Implementasi DATASET TIPS`

`Asumsi` : Apakah ada hubungan antara `Sex` dan `Tip_given`

`Hipotesis`

$H_{0} : $ `sex` dan `tip_given` independent

$H_{a} : $ `sex` dan `tip_given` dependent

`Signifikansi`/ Tingkat Resiko ($\alpha$)

$\alpha = 0.05$

Pengambilan kesimpulan :

- p-value $\leq \alpha$ : Reject $H_{0}$
- p-value $> \alpha$ : Failed to Reject $H_{0}$ / Accept $H_{0}$

In [206]:
tips2 = sns.load_dataset('tips')

tips2['weekday_weekend'] = np.where(tips2['day'].isin(['Sat', 'Sun']), 'Weekend', 'Weekdays')

# Create tip group apakah diatas rata rata dan dibawah rata rata
tips2['tip_given'] = np.where(tips2['tip'] > tips2['tip'].mean(), 'Generous', 'Stingy')

# Create portion size apakah size diatas rata rata dan dibawah rata rata
tips2['portion_size'] = np.where(tips2['size'] > tips2['size'].mean(), 'Big', 'Small')
tips2

N = len(tips2)
e = 0.05
n = round(N/(1+(N*(e**2))))

tips2Sample = tips2.sample(n)
display(n, tipsSample.head())

152

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,weekday_weekend,tip_given,portion_size
142,41.19,5.0,Male,No,Thur,Lunch,5,Weekdays,Generous,Big
242,17.82,1.75,Male,No,Sat,Dinner,2,Weekend,Stingy,Small
106,20.49,4.06,Male,Yes,Sat,Dinner,2,Weekend,Generous,Small
147,11.87,1.63,Female,No,Thur,Lunch,2,Weekdays,Stingy,Small
53,9.94,1.56,Male,No,Sun,Dinner,2,Weekend,Stingy,Small


In [207]:
# buat jumlahan orang orang pergender dan tip give
tipsAgg = tips2Sample.groupby(['sex', 'tip_given'])['total_bill'].count().reset_index().rename(columns={'total_bill':'count'})
tipsAggPivot = tipsAgg.pivot(index='sex', columns='tip_given', values='count')
tipsAggPivot


tip_given,Generous,Stingy
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,53,50
Female,23,26


In [208]:
# Pengujian Chi Square
from scipy.stats import chi2_contingency
stat, p, dof, ex = chi2_contingency(tipsAggPivot)
print(f"chi2 = {stat}, p = {p}, dof = {dof}, ex = {ex}")

chi2 = 0.12046760451753516, p = 0.7285278871216065, dof = 1, ex = [[51.5 51.5]
 [24.5 24.5]]


In [209]:
# tampilkan hasil
print(f"Chi-squared Statistic: {stat}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies: {ex}")

Chi-squared Statistic: 0.12046760451753516
P-value: 0.7285278871216065
Degrees of Freedom: 1
Expected Frequencies: [[51.5 51.5]
 [24.5 24.5]]


`Asumsi` : Apakah ada hubungan antara `time` dan `weekday-weekend`

`Hipotesis`

$H_{0} : $ `time` dan `weekday-weekend` independent

$H_{a} : $ `time` dan `weekday-weekend` dependent

`Signifikansi`/ Tingkat Resiko ($\alpha$)

$\alpha = 0.05$

Pengambilan kesimpulan :

- p-value $\leq \alpha$ : Reject $H_{0}$
- p-value $> \alpha$ : Failed to Reject $H_{0}$ / Accept $H_{0}$

In [210]:
# Soal 2
# laki laki : apakah ada hubungan antara time dan weekday-weekend

tipsAgg = tips2Sample.groupby(['time', 'weekday_weekend']).size().reset_index(name='count')

tipsAggPivot = tips2Sample.groupby(['time', 'weekday_weekend']).size().reset_index(name='count').pivot(index='time', columns='weekday_weekend', values='count')

tipsAggPivot

weekday_weekend,Weekdays,Weekend
time,Unnamed: 1_level_1,Unnamed: 2_level_1
Lunch,40,0
Dinner,9,103


In [211]:
from scipy.stats import chi2_contingency
stat, p, dof, ex = chi2_contingency(tipsAggPivot)
print(f"chi2 = {stat}, p = {p}, dof = {dof}, ex = {ex}")

chi2 = 109.93970392595318, p = 1.0101715766381503e-25, dof = 1, ex = [[12.89473684 27.10526316]
 [36.10526316 75.89473684]]


In [212]:
print(f"Chi-squared Statistic: {stat}")

if p < 0.05:
    print(f"P-value = {p}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / time dan weekday_weekend dependent)")
else:
    print(f"P-value = {p}, karena P-value > 0.05, maka kita gagal menolak H0 (Kita punya tidak cukup bukti untuk menolak H0 / time dan weekday_weekend independent)")

Chi-squared Statistic: 109.93970392595318
P-value = 1.0101715766381503e-25, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / time dan weekday_weekend dependent)


`Asumsi` : Apakah ada hubungan antara `gender` dan `smoker`

`Hipotesis`

$H_{0} : $ `gender` dan `smoker` independent

$H_{a} : $ `gender` dan `smoker` dependent

`Signifikansi`/ Tingkat Resiko ($\alpha$)

$\alpha = 0.05$

Pengambilan kesimpulan :

- p-value $\leq \alpha$ : Reject $H_{0}$
- p-value $> \alpha$ : Failed to Reject $H_{0}$ / Accept $H_{0}$

In [213]:

# cari apakah ada hubungan antara gender dan smoker

tipsAgg2 = tips2Sample.groupby(['sex', 'smoker']).size().reset_index(name='count')

tipsAgg2Pivot = tips2Sample.groupby(['sex', 'smoker']).size().reset_index(name='count').pivot(index='sex', columns='smoker', values='count')

tipsAgg2Pivot

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,36,67
Female,19,30


In [217]:
from scipy.stats import chi2_contingency
stat, p, dof, ex = chi2_contingency(tipsAgg2Pivot)
print(f"chi2 = {stat}, p = {p}, dof = {dof}, ex = {ex}")

chi2 = 0.07727652475353967, p = 0.7810226797128352, dof = 1, ex = [[37.26973684 65.73026316]
 [17.73026316 31.26973684]]


In [219]:
print(f"Chi-squared Statistic: {stat}")

if p < 0.05:
    print(f"P-value = {p}, karena P-value <= 0.05, maka kita bisa menolak H0 (Kita punya cukup bukti untuk menolak H0 / gender dan smoker dependent)")
else:
    print(f"P-value = {p}, karena P-value > 0.05, maka kita gagal menolak H0 (Kita punya tidak cukup bukti untuk menolak H0 / gender dan smoker independent)")

Chi-squared Statistic: 0.07727652475353967
P-value = 0.7810226797128352, karena P-value > 0.05, maka kita gagal menolak H0 (Kita punya tidak cukup bukti untuk menolak H0 / gender dan smoker independent)
