In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

In [2]:
df = pd.read_csv('./data/welch.csv', header=None)
df = df.T
df.columns = df.iloc[0]
df = df.drop(df.index[0])
df

Unnamed: 0,Hokkaido_Tohoku,Kanto,Chubu,Kinki,Chugoku,Shikoku,Kyushu_Okinawa
1,2.21,2.68,2.77,2.59,2.71,2.52,2.35
2,2.61,2.65,2.79,2.69,2.66,2.49,2.8
3,2.69,2.61,2.58,2.31,2.52,2.37,2.57
4,2.56,2.5,2.86,2.28,2.36,2.3,2.57
5,2.71,2.44,2.58,2.44,2.36,,2.41
6,2.94,2.03,2.66,2.63,,,2.4
7,2.76,2.33,2.78,2.5,,,2.27
8,,,2.65,,,,2.63
9,,,2.49,,,,


In [4]:
data = {}
for col in df.columns:
    data[col] = df[col].values
    tmp = data[col].astype(np.float32)
    mask = ~np.isnan(tmp)
    data[col] = tmp[mask]    
data

{'Hokkaido_Tohoku': array([2.21, 2.61, 2.69, 2.56, 2.71, 2.94, 2.76], dtype=float32),
 'Kanto': array([2.68, 2.65, 2.61, 2.5 , 2.44, 2.03, 2.33], dtype=float32),
 'Chubu': array([2.77, 2.79, 2.58, 2.86, 2.58, 2.66, 2.78, 2.65, 2.49],
       dtype=float32),
 'Kinki': array([2.59, 2.69, 2.31, 2.28, 2.44, 2.63, 2.5 ], dtype=float32),
 'Chugoku': array([2.71, 2.66, 2.52, 2.36, 2.36], dtype=float32),
 'Shikoku': array([2.52, 2.49, 2.37, 2.3 ], dtype=float32),
 'Kyushu_Okinawa': array([2.35, 2.8 , 2.57, 2.57, 2.41, 2.4 , 2.27, 2.63], dtype=float32)}

### 分散が未知で等しい場合　ｔ検定

In [5]:
def t_test(x,y,alpha=0.05):
    #片側検定（両側の場合はalpha/2を入れる）
    #不偏分散
    vx = x.var(ddof=1) #ddof=0で標本分散
    vy = y.var(ddof=1)
    m = x.shape[0]
    n = y.shape[0]
    #プールした分散
    var = (m*vx + n*vy) / (m + n -2)
    #t値
    t_value = abs(x.mean()-y.mean()) / (1/m + 1/n)**(1/2) / var
    t_alpha = st.t.ppf(1-alpha,20)
    #検定結果 0：帰無仮説を受容 ,1：帰無仮説を棄却
    if t_value > t_alpha:
        result = 1#
    else:
        result = 0
    return t_value, t_alpha, result

In [6]:
x = data["Kanto"]
y = data["Chubu"]
t_test(x,y,alpha=0.05)

(12.377595450992377, 1.7247182429207857, 1)

### 分散が未知で異なる場合 welchの検定

In [7]:
def welchs_test(x,y,alpha=0.05):
    #片側検定（両側の場合はalpha/2を入れる）
    #不偏分散
    vx = x.var(ddof=1) #ddof=0で標本分散
    vy = y.var(ddof=1)
    m = x.shape[0]
    n = y.shape[0]
    #自由度
    gx = vx / m
    gy = vy / n
    f = (gx + gy)**2 / (gx**2/(m-1) + gy**2/(n-1))
    #t値
    t_value = abs(x.mean()-y.mean()) / (vx**2/m + vy**2/n)**(1/2)
    t_alpha = st.t.ppf(1-alpha,f)
    #検定結果 0：帰無仮説を受容 ,1：帰無仮説を棄却
    if t_value > t_alpha:
        result = 1
    else:
        result = 0
    return t_value, t_alpha, result

In [8]:
x = data["Kanto"]
y = data["Chubu"]
welchs_test(x,y,alpha=0.05)

(10.968809890137853, 1.8409971867531796, 1)

### 分散が等しいかどうかの判断基準（母分散の比の検定）2級p153

In [9]:
def tests_for_homogeneity_of_variance(x,y,alpha=0.05):
    #両側検定
    vx = x.var(ddof=1)
    vy = y.var(ddof=1)
    m = x.shape[0]
    n = y.shape[0]
    f = vx /vy
    f_alpha_low = st.f.ppf(alpha/2, m-1, n-1)
    f_alpha_up = st.f.ppf(1-alpha/2, m-1, n-1)
    #検定結果 0：帰無仮説を受容 ,1：帰無仮説を棄却
    if f >= f_alpha_low and f <= f_alpha_up:
        result = 0
    else:
        result = 1
    return f, f_alpha_low, f_alpha_up, result    

In [11]:
x = data["Kanto"]
y = data["Chubu"]
tests_for_homogeneity_of_variance(x, y, alpha=0.05)

(3.4578228, 0.17858345090364028, 4.651695537300463, 0)

### メモ

In [13]:
st.ttest_ind(x, y)

Ttest_indResult(statistic=-2.5072662252652482, pvalue=0.025111562671710143)

### メモ：welchの検定を使うかどうかでどれくらい結果に違いがでるのかも確認したい