<a href="https://colab.research.google.com/github/ArcWaterCash/statistics/blob/main/01_%E8%A8%98%E8%BF%B0%E7%B5%B1%E8%A8%88%E5%AD%A6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. いろいろな平均

In [1]:
# Twelve sample electricity bill amounts used by the mean examples below.
electric_bills = [
    2200, 2700, 5800, 7500, 11500, 7300,
    2700, 3100, 2500, 4300, 2200, 4100,
]

## 算術平均
def arithmetic_mean(lst):
    """Return the arithmetic mean of the values in ``lst``.

    The values are added one by one in input order and the total is
    divided by ``len(lst)``.

    Raises:
        ZeroDivisionError: if ``lst`` is empty.
    """
    total = 0
    for value in lst:
        total += value
    count = len(lst)
    return total / count

# Show the arithmetic mean of the sample electricity bills.
print('算術平均:', arithmetic_mean(electric_bills))

## 幾何平均
def geometric_mean(lst):
    """Return the geometric mean of the values in ``lst``.

    The mean is computed in the log domain, ``exp(mean(log(x)))``, rather
    than by multiplying every value together. A running product overflows
    quickly (a large int product cannot be converted to float, and a float
    product silently becomes ``inf``), while the sum of logarithms stays
    small.

    Args:
        lst: a sequence of non-negative numbers.

    Returns:
        The geometric mean as a float; ``0.0`` if any value is zero.

    Raises:
        ZeroDivisionError: if ``lst`` is empty.
        ValueError: if ``lst`` contains a negative value (raised by
            ``math.log``); the geometric mean is not defined there.
    """
    import math

    # A single zero makes the whole product zero; log(0) is undefined,
    # so handle it before taking logarithms.
    if any(e == 0 for e in lst):
        return 0.0
    log_sum = math.fsum(math.log(e) for e in lst)
    return math.exp(log_sum / len(lst))

# Show the geometric mean of the sample electricity bills.
# The label previously read '算術平均:' (arithmetic mean), which mislabeled this value.
print('幾何平均:', geometric_mean(electric_bills))

## 調和平均
def harmonic_mean(lst):
    """Return the harmonic mean of the values in ``lst``.

    This is the number of values divided by the sum of their reciprocals.

    Raises:
        ZeroDivisionError: if ``lst`` is empty or contains a zero.
    """
    reciprocal_total = 0
    for value in lst:
        reciprocal_total += 1 / value
    count = len(lst)
    return count / reciprocal_total

# Show the harmonic mean of the sample electricity bills.
print('調和平均:', harmonic_mean(electric_bills))

算術平均: 4658.333333333333
算術平均: 4016.0866258766096
調和平均: 3551.8060868841694


## 2. データのバラツキ①

In [2]:
# Sample electricity bill amounts, sorted ascending so that list positions
# correspond to quantile positions.
# NOTE(review): the third value is 4800 here but 5800 in the first cell's
# list -- confirm which one is intended.
electric_bills = [
    2200, 2700, 4800, 7500, 11500, 7300,
    2700, 3100, 2500, 4300, 2200, 4100,
]
electric_bills.sort()

# Quartiles: each one is the midpoint of the two neighbouring values at the
# cut position. The index arithmetic lands exactly on the cuts because the
# list length (12) is a multiple of 4.
n_quantile = len(electric_bills) // 4

i = n_quantile - 1
quantile_1 = sum(electric_bills[i:i + 2]) / 2
print('第一四分位数', quantile_1)

i = 2 * n_quantile - 1
quantile_2 = sum(electric_bills[i:i + 2]) / 2
print('第二四分位数（中央値）', quantile_2)

i = 3 * n_quantile - 1
quantile_3 = sum(electric_bills[i:i + 2]) / 2
print('第三四分位数', quantile_3)

# Interquartile range: third quartile minus first quartile.
diff_quantile_3_1 = quantile_3 - quantile_1
print('四分位範囲', diff_quantile_3_1)

# 偏差
def deviation(lst):
    """Return the absolute deviation of every value from the arithmetic mean.

    The result has one entry per element of ``lst``, in the same order.
    Each entry is ``abs(value - mean)``, so the sign of the deviation is
    not kept.
    """
    center = arithmetic_mean(lst)
    return [abs(value - center) for value in lst]
# Show the absolute deviation of each (sorted) bill from the mean.
print('偏差:', deviation(electric_bills))

# 分散
def variance(lst):
    """Return the population variance of ``lst``.

    This is the mean of the squared differences from the arithmetic mean;
    the sum is divided by ``len(lst)`` (not ``len(lst) - 1``).
    """
    center = arithmetic_mean(lst)
    squared_total = 0
    for value in lst:
        offset = value - center
        squared_total += offset ** 2
    return squared_total / len(lst)
# Show the population variance of the bills.
print('分散:', variance(electric_bills))

def standard_deviation(lst):
    """Return the population standard deviation: the square root of ``variance(lst)``."""
    spread_squared = variance(lst)
    return spread_squared ** 0.5
# Show the population standard deviation of the bills.
print('標準偏差:', standard_deviation(electric_bills))

# 外れ値
# Report as "outlier" every bill whose absolute deviation from the mean is
# the largest. deviation() keeps input order, so index i lines up with
# electric_bills[i].
electric_bills_deviations = deviation(electric_bills)
# Compute the maximum once; calling max() inside the loop rescanned the whole
# list on every iteration.
max_deviation = max(electric_bills_deviations, default=None)
for i, e in enumerate(electric_bills_deviations):
    if e == max_deviation:
        print('外れ値:', electric_bills[i])

第一四分位数 2600.0
第二四分位数（中央値） 3600.0
第三四分位数 6050.0
四分位範囲 3450.0
偏差: [2375.0, 2375.0, 2075.0, 1875.0, 1875.0, 1475.0, 475.0, 275.0, 225.0, 2725.0, 2925.0, 6925.0]
分散: 7423541.666666667
標準偏差: 2724.61771018737
外れ値: 11500


## 3. データのバラツキ②

In [3]:
## 変動係数
# Two price series compared via their means and standard deviations.
beef_prices = [256, 260, 266, 269, 257, 257, 266, 267, 264, 266, 262, 260]
pork_prices = [194, 195, 195, 202, 196, 193, 200, 192, 191, 191, 195, 196]

# Each line prints the statistic for beef first, then pork.
print('算術平均', *(arithmetic_mean(prices) for prices in (beef_prices, pork_prices)))
print('標準偏差', *(standard_deviation(prices) for prices in (beef_prices, pork_prices)))

def cv(lst):
    """Return the coefficient of variation: standard deviation divided by the arithmetic mean.

    Raises:
        ZeroDivisionError: if ``lst`` is empty or its mean is zero.
    """
    spread = standard_deviation(lst)
    center = arithmetic_mean(lst)
    return spread / center

# Show the coefficient of variation for beef first, then pork.
print('変動係数', *(cv(prices) for prices in (beef_prices, pork_prices)))

算術平均 262.5 195.0
標準偏差 4.252450274057691 3.1885210782848317
変動係数 0.016199810567838823 0.01635139014505042


## 4. 変数の関連性①

In [4]:
# Two small paired samples used by the correlation examples.
x = [1, 2, 5, 6]
y = [2, 1, 4, 3]

x_mean, y_mean = arithmetic_mean(x), arithmetic_mean(y)
print('平均:', x_mean, y_mean)

# Signed difference of every value from its own sample mean.
x_mean_diff = [value - x_mean for value in x]
y_mean_diff = [value - y_mean for value in y]
print('平均との差:', x_mean_diff, y_mean_diff)

# ピアソンの積率相関係数
def product_moment_correlation_coefficient(x_list, y_list):
    """Return Pearson's product-moment correlation coefficient of two samples.

    The numerator is the sum of the products of paired deviations from the
    means; the denominator is the product of the square roots of each
    sample's sum of squared deviations.

    Args:
        x_list: first sample.
        y_list: second sample, paired element-wise with ``x_list``.

    Raises:
        ZeroDivisionError: if either sample is empty, or if either sample has
            no spread (all values equal), which makes the denominator zero.
    """
    x_center = arithmetic_mean(x_list)
    y_center = arithmetic_mean(y_list)

    x_offsets = [value - x_center for value in x_list]
    y_offsets = [value - y_center for value in y_list]

    def root_sum_squares(offsets):
        # Square root of the sum of squared deviations.
        total = 0
        for offset in offsets:
            total += offset ** 2
        return total ** 0.5

    # Sum of cross products of the paired deviations.
    cross_product_sum = 0
    for x_off, y_off in zip(x_offsets, y_offsets):
        cross_product_sum += x_off * y_off

    scale = root_sum_squares(x_offsets) * root_sum_squares(y_offsets)
    return cross_product_sum / scale

"""
1:正の相関
-1:負の相関
0:相関がない
"""
# Show Pearson's correlation for the x/y samples
# (1: positive correlation, -1: negative correlation, 0: no correlation).
print('ピアソンの積率相関係数', product_moment_correlation_coefficient(x,y))

平均: 3.5 2.5
平均との差: [-2.5, -1.5, 1.5, 2.5] [-0.5, -1.5, 1.5, 0.5]
ピアソンの積率相関係数 0.7592566023652966


## 5. 変数の関連性②

In [5]:
def rank(lst):
    """Return the 0-based rank of every element of ``lst``.

    An element's rank is the index of its first occurrence in the ascending
    sorted copy of ``lst``. Tied values therefore share the same (lowest)
    rank, and the ranks after a tie skip ahead, e.g. ``[5, 5, 1]`` gives
    ``[1, 1, 0]``.

    Args:
        lst: a list of mutually comparable values.

    Returns:
        A list of int ranks, one per input element, in input order.
    """
    from bisect import bisect_left

    lst_sorted = sorted(lst)
    # bisect_left returns the leftmost position of the value in the sorted
    # list, i.e. the index of its first occurrence. This replaces a linear
    # scan per element (O(n^2) overall) with a binary search.
    return [bisect_left(lst_sorted, value) for value in lst]

def rank_correlation_coefficient(x_list, y_list):
    """Return Spearman's rank correlation coefficient of two paired samples.

    Spearman's coefficient is Pearson's product-moment correlation applied to
    the ranks of the values, so this delegates to
    ``product_moment_correlation_coefficient`` instead of repeating its body.

    Note: ``rank`` gives tied values the same lowest rank rather than an
    averaged rank, so results for data with ties follow that convention.

    Args:
        x_list: first sample.
        y_list: second sample, paired element-wise with ``x_list``.

    Raises:
        ZeroDivisionError: if either sample is empty or all of its ranks are
            equal.
    """
    return product_moment_correlation_coefficient(rank(x_list), rank(y_list))

# Show Spearman's rank correlation for the x/y samples.
print('スピアマンの順位相関係数', rank_correlation_coefficient(x,y))

def rank_correlation_coefficient2(x_list, y_list):
    """Return Kendall's rank correlation coefficient (tau-a) of two paired samples.

    Every pair of observations is classified as concordant (both variables
    move in the same direction), discordant (opposite directions), or tied
    (equal in at least one variable). The result is
    ``(concordant - discordant) / (n * (n - 1) / 2)``.

    Tied pairs count as neither concordant nor discordant. Previously they
    fell into the discordant branch, which pushed the coefficient down
    whenever the data contained equal values.

    Args:
        x_list: first sample (a sequence supporting ``len`` and indexing).
        y_list: second sample, paired element-wise with ``x_list``.

    Raises:
        ZeroDivisionError: if ``x_list`` has fewer than two elements.
        IndexError: if ``y_list`` is shorter than ``x_list``.
    """
    n = len(x_list)
    n_match = 0
    n_unmatch = 0
    for i in range(n - 1):
        for j in range(i + 1, n):
            # Only the ordering within each pair matters, so comparing the
            # raw values is equivalent to comparing their ranks.
            x_order = (x_list[i] > x_list[j]) - (x_list[i] < x_list[j])
            y_order = (y_list[i] > y_list[j]) - (y_list[i] < y_list[j])
            agreement = x_order * y_order
            if agreement > 0:
                n_match += 1
            elif agreement < 0:
                n_unmatch += 1
            # agreement == 0: tied pair, contributes to neither count.

    return (n_match - n_unmatch) / (n * (n - 1) / 2)

# Show Kendall's rank correlation for the x/y samples.
print('ケンドールの順位相関係数', rank_correlation_coefficient2(x,y))

スピアマンの順位相関係数 0.5999999999999999
ケンドールの順位相関係数 0.3333333333333333
