In [1]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.stats.api as sms
from scipy.stats import ttest_1samp, shapiro, levene, ttest_ind, mannwhitneyu, \
    pearsonr, spearmanr, kendalltau, f_oneway, kruskal
from statsmodels.stats.proportion import proportions_ztest
# Load the "tips" dataset through seaborn and preview its first five rows.
df = sns.load_dataset(name="tips")
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# Mean total_bill for each day.
# The recorded result is ordered Thur, Fri, Sat, Sun rather than alphabetically,
# which indicates `day` is a categorical key. For categorical keys pandas warns
# (FutureWarning) when `observed` is left implicit, and the implicit default
# changes between major versions, so it is pinned here. observed=False keeps
# every category in the result, matching the behaviour this cell was run with.
df.groupby("day", observed=False)["total_bill"].mean()

day
Thur    17.682742
Fri     17.151579
Sat     20.441379
Sun     21.410000
Name: total_bill, dtype: float64

In [3]:
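# H0: mu_Thur = mu_Fri = mu_Sat = mu_Sun (mean total_bill is the same on all four days)
# H1: at least one day's mean total_bill differs
# Assumptions to check first: normality within each group and homogeneity of variances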

In [4]:
# Shapiro-Wilk normality test on total_bill, run separately for each day
# (days are visited in order of first appearance in the data).
for day in df["day"].unique():
    day_bills = df.loc[df["day"] == day, "total_bill"]
    pvalue = shapiro(day_bills).pvalue
    print(day, f"{pvalue:.4f}")

# p < 0.05 -> H0 (normality) is rejected: the group is not normally distributed.

Sun 0.0036
Sat 0.0000
Thur 0.0000
Fri 0.0409


In [5]:
# Per-day Shapiro-Wilk check on total_bill; only the p-value is reported.
bills = df["total_bill"]
for day in df["day"].unique():
    _stat, pvalue = shapiro(bills[df["day"] == day])
    print(day, f"{pvalue:.4f}")

# p < 0.05 -> H0 (normality) is rejected: the group is not normally distributed.

Sun 0.0036
Sat 0.0000
Thur 0.0000
Fri 0.0409


In [6]:
# Levene's test for equality of total_bill variances across the four days.
day_order = ("Sun", "Sat", "Thur", "Fri")
bill_groups = [df.loc[df["day"] == day, "total_bill"] for day in day_order]
test_stat, pvalue = levene(*bill_groups)
print(f"p value {pvalue:.4f}")

# p > 0.05 -> H0 (equal variances) cannot be rejected, so the variances are
# treated as homogeneous.

p value 0.5741


In [7]:
# Mean and median of total_bill for each day, side by side.
# `observed` is passed explicitly because the grouping key appears to be
# categorical (the recorded output is ordered Thur, Fri, Sat, Sun, not
# alphabetically). Leaving it implicit triggers a FutureWarning on recent
# pandas and the implicit default changes between major versions.
# observed=False keeps every category, matching the original behaviour.
df.groupby("day", observed=False).agg({"total_bill": ["mean", "median"]})

Unnamed: 0_level_0,total_bill,total_bill
Unnamed: 0_level_1,mean,median
day,Unnamed: 1_level_2,Unnamed: 2_level_2
Thur,17.682742,16.2
Fri,17.151579,15.38
Sat,20.441379,18.24
Sun,21.41,19.63


In [8]:
# Parametric one-way ANOVA (F-test) of total_bill across the four days.
# NOTE(review): the Shapiro-Wilk p-values recorded earlier in this notebook are
# all below 0.05, so the normality assumption behind this test is questionable;
# confirm which test result is meant to be interpreted.
anova_groups = [
    df.loc[df["day"] == day, "total_bill"]
    for day in ("Thur", "Fri", "Sat", "Sun")
]
f_oneway(*anova_groups)

F_onewayResult(statistic=2.7674794432863363, pvalue=0.04245383328951916)

In [9]:
# Non-parametric alternative: Kruskal-Wallis H-test of total_bill across days.
# p < 0.05 -> H0 is rejected: there is a statistically significant difference
# between the groups.
thur, fri, sat, sun = (
    df.loc[df["day"] == label, "total_bill"]
    for label in ("Thur", "Fri", "Sat", "Sun")
)
kruskal(thur, fri, sat, sun)
# p < 0.05 -> H0 is rejected: there is a statistically significant difference between the groups

KruskalResult(statistic=10.403076391437086, pvalue=0.015433008201041274)

In [10]:
from statsmodels.stats.multicomp import (
    MultiComparison,
    pairwise_tukeyhsd,
)

# Tukey HSD post-hoc comparison of total_bill between every pair of days,
# using the MultiComparison object interface at a 5% significance level.
comparison = MultiComparison(data=df["total_bill"], groups=df["day"])
tukey = comparison.tukeyhsd(alpha=0.05)
tukey.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
Fri,Sat,3.2898,0.4541,-2.4799,9.0595,False
Fri,Sun,4.2584,0.2371,-1.5856,10.1025,False
Fri,Thur,0.5312,0.9957,-5.4434,6.5057,False
Sat,Sun,0.9686,0.8968,-2.6088,4.546,False
Sat,Thur,-2.7586,0.2374,-6.5455,1.0282,False
Sun,Thur,-3.7273,0.0668,-7.6264,0.1719,False


In [11]:
from statsmodels.stats.multicomp import (
    MultiComparison,
    pairwise_tukeyhsd,
)

# Tukey HSD again, this time through the pairwise_tukeyhsd convenience function;
# the recorded summary matches the MultiComparison-based run.
tukey = pairwise_tukeyhsd(endog=df["total_bill"], groups=df["day"])
tukey.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
Fri,Sat,3.2898,0.4541,-2.4799,9.0595,False
Fri,Sun,4.2584,0.2371,-1.5856,10.1025,False
Fri,Thur,0.5312,0.9957,-5.4434,6.5057,False
Sat,Sun,0.9686,0.8968,-2.6088,4.546,False
Sat,Thur,-2.7586,0.2374,-6.5455,1.0282,False
Sun,Thur,-3.7273,0.0668,-7.6264,0.1719,False


In [12]:
# Pairwise follow-up: Mann-Whitney U test on total_bill, Sunday vs Thursday.
# NOTE(review): this p-value is not adjusted for multiple comparisons; confirm
# whether a correction (e.g. Bonferroni) is intended for the pairwise tests.
sun_bills = df.loc[df["day"] == "Sun", "total_bill"]
thur_bills = df.loc[df["day"] == "Thur", "total_bill"]
test_stat, pvalue = mannwhitneyu(sun_bills, thur_bills)
print(f"test stat {test_stat:.4f}, pvalue {pvalue:.4f}")

test stat 3007.0000, pvalue 0.0054


In [13]:
# Pairwise follow-up: Mann-Whitney U test on total_bill, Saturday vs Thursday.
# NOTE(review): this p-value is not adjusted for multiple comparisons; confirm
# whether a correction (e.g. Bonferroni) is intended for the pairwise tests.
samples = [
    df["total_bill"][df["day"].eq(label)]
    for label in ("Sat", "Thur")
]
test_stat, pvalue = mannwhitneyu(*samples)
print(f"test stat {test_stat:.4f}, pvalue {pvalue:.4f}")

test stat 3213.5000, pvalue 0.0469
