# Import Library

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as st

# Read Dataset

In [8]:
# Load the e-commerce example dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='e-commerce_example_dataset.csv')

In [9]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(100, 3)

In [10]:
# Display the frame for a visual check of its columns and values.
df

Unnamed: 0,gmv,discount,user_id
0,1324911,not-discount,1
1,1625874,not-discount,2
2,1432984,not-discount,3
3,1294427,not-discount,4
4,1635693,not-discount,5
...,...,...,...
95,1477384,discount,96
96,1684541,discount,97
97,1087507,discount,98
98,1958701,discount,99


In [11]:
# Number of rows in each group of the 'discount' column.
df.loc[:, 'discount'].value_counts()

discount        65
not-discount    35
Name: discount, dtype: int64

In [12]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,gmv,discount,user_id
0,1324911,not-discount,1
1,1625874,not-discount,2
2,1432984,not-discount,3
3,1294427,not-discount,4
4,1635693,not-discount,5


# Langkah 1

Apakah rata-rata gmv discount lebih besar daripada rata-rata gmv non-discount?

H0 : rata-rata gmv discount **sama dengan** gmv non-discount <br>
H1 : rata-rata gmv discount **berbeda signifikan** dari rata-rata gmv non-discount

In [15]:
# Split the rows into the discount group and everything else, using one shared mask.
is_discount = df['discount'] == 'discount'
discount = df[is_discount]
not_discount = df[~is_discount]

In [16]:
# Preview the first five rows of the discount group.
discount.head(5)

Unnamed: 0,gmv,discount,user_id
35,1915181,discount,36
36,1481660,discount,37
37,1517240,discount,38
38,1705569,discount,39
39,1571139,discount,40


In [17]:
# Preview the first five rows of the non-discount group.
not_discount.head(5)

Unnamed: 0,gmv,discount,user_id
0,1324911,not-discount,1
1,1625874,not-discount,2
2,1432984,not-discount,3
3,1294427,not-discount,4
4,1635693,not-discount,5


# Langkah 2

In [18]:
# Mean GMV of the discount group.
discount.loc[:, 'gmv'].mean()

1624246.553846154

In [19]:
# Mean GMV of the non-discount group.
not_discount.loc[:, 'gmv'].mean()

1400420.4

In [20]:
# Observation: the discount group's mean GMV is higher than the non-discount group's.

# Langkah 3

In [22]:
# Independent two-sample t-test on GMV: discount group vs. non-discount group.
# equal_var=True is SciPy's default (pooled-variance Student's t-test); it is spelled
# out here so the equal-variance assumption is visible to the reader.
ttest = st.ttest_ind(discount['gmv'], not_discount['gmv'], equal_var=True)
p_value = ttest.pvalue
print('P-Value :', p_value)

P-Value : 0.0009942306670295464


# Langkah 4

In [25]:
# Significance level for the decision rule: H0 is rejected when p_value < alpha.
alpha = 0.05

if p_value >= alpha:
    # Failing to reject H0 is not evidence that the means are equal; it only means
    # that a difference could not be demonstrated at this significance level.
    print('Tidak cukup bukti (gagal tolak H0) untuk menyatakan bahwa rata-rata gmv discount berbeda signifikan dari gmv non-discount')
else:
    # The test is two-sided, so it only establishes that the means differ; the
    # direction stated in the message comes from comparing the two group means.
    print('Cukup bukti menyatakan (Tolak H0) bahwa rata-rata gmv discount berbeda signifikan dari gmv non-discount, maka customer yang diberi discount cenderung lebih besar gmvnya daripada yang non-discount')

Cukup bukti menyatakan (Tolak H0) bahwa rata-rata gmv discount berbeda signifikan dari gmv non-discount, maka customer yang diberi discount cenderung lebih besar gmvnya daripada yang non-discount


## MannWhitneyU Test 
Alternatif non-parametrik untuk t-test jika asumsi t-test (misalnya normalitas data) tidak terpenuhi

In [27]:
# Mann-Whitney U test on GMV: discount group vs. non-discount group.
# `alternative` is passed explicitly so the p-value does not depend on the installed
# SciPy version: releases before 1.7 defaulted to a deprecated `None` mode that reports
# half of the two-sided p-value, while later releases default to 'two-sided'.
# 'two-sided' matches the "differs" alternative hypothesis used in this notebook.
mann_whitney_test = st.mannwhitneyu(
    x=discount['gmv'],
    y=not_discount['gmv'],
    alternative='two-sided',
)
pvalue_mwu = mann_whitney_test.pvalue
print('P-Value :',pvalue_mwu)

P-Value : 1.4767329091574464e-05


In [28]:
# Significance level for the decision rule: H0 is rejected when pvalue_mwu < alpha.
alpha = 0.05

if pvalue_mwu >= alpha:
    # Failing to reject H0 is not evidence that the groups are equal; it only means
    # that a difference could not be demonstrated at this significance level.
    print('Tidak cukup bukti (gagal tolak H0) untuk menyatakan bahwa gmv discount berbeda signifikan dari gmv non-discount')
else:
    # NOTE(review): Mann-Whitney U compares the two distributions (rank-based), not the
    # means, so the "rata-rata" wording below is loose, and the word "discount" is
    # missing after the first "gmv". The string is left as-is so it still matches the
    # output already rendered for this cell; consider rewording on the next re-run.
    print('Cukup bukti menyatakan (Tolak H0) bahwa rata-rata gmv berbeda signifikan dari gmv non-discount, maka customer yang diberi discount cenderung lebih besar gmvnya daripada yang non-discount')

Cukup bukti menyatakan (Tolak H0) bahwa rata-rata gmv berbeda signifikan dari gmv non-discount, maka customer yang diberi discount cenderung lebih besar gmvnya daripada yang non-discount


# Case 2 ANOVA

In [29]:
# Load the Iris dataset and build one sub-frame per species for the ANOVA below.
df2 = pd.read_csv('Iris.csv')
df_versicolor = df2.loc[df2['Species'].eq('Iris-versicolor')]
df_setosa = df2.loc[df2['Species'].eq('Iris-setosa')]
df_virginica = df2.loc[df2['Species'].eq('Iris-virginica')]

In [31]:
# One-way ANOVA on sepal width across the three species groups.
st.f_oneway(
    *(species_df['SepalWidthCm'] for species_df in (df_versicolor, df_setosa, df_virginica))
)

F_onewayResult(statistic=47.36446140299381, pvalue=1.3279165184572242e-16)

In [32]:
# Mean sepal width per species, to see which groups drive the ANOVA result.
df2.groupby('Species')['SepalWidthCm'].agg('mean')

Species
Iris-setosa        3.418
Iris-versicolor    2.770
Iris-virginica     2.974
Name: SepalWidthCm, dtype: float64