## 卡方检验

In [34]:
# 导入第三方库
import pandas as pd
import numpy as np
from pprint import pprint
from scipy.stats import chi2_contingency

In [35]:
# 加载数据
excelpath = "excel1_encoding.xlsx"
date_cul = pd.read_excel(excelpath, sheet_name="Sheet1")

分析 **表面风化-玻璃类型、表面风化-纹饰、表面风化-颜色** 这三组的相关性，根据结果判断表面风化和哪个变量

In [36]:
# 首先为了进行卡方检验需要构建列联表
# 对于表面风化-玻璃类型
type_windroses_table = pd.crosstab(date_cul['表面风化'], date_cul['类型'])

# 对于表面风化-纹饰
decoration_windroses_table = pd.crosstab(date_cul['表面风化'], date_cul['纹饰'])

# 对于表面风化-颜色
color_windroses_table = pd.crosstab(date_cul['表面风化'], date_cul['颜色'])

In [51]:
# 进行卡方检验
def perform_chi_square(test_table):
    chi2, p, dof, expected = chi2_contingency(test_table,correction=False)
    pprint(f"卡方值: {chi2:.2f}, p值: {p:.2f}, 自由度: {dof}")
    print(f"期望值:\n{expected}")
    # 检查期望值
    expected_no_suit = expected[expected < 5]  # 期望值小于5的需要进行校正
    pprint(f"{len(expected_no_suit)}个单元格的期望计数小于5")
    print("-"*20)
    return chi2, p, dof,expected

# 执行检验
perform_chi_square(type_windroses_table)
perform_chi_square(decoration_windroses_table)
perform_chi_square(color_windroses_table)

'卡方值: 6.88, p值: 0.01, 自由度: 1'
期望值:
[[16.55172414  7.44827586]
 [23.44827586 10.55172414]]
'0个单元格的期望计数小于5'
--------------------
'卡方值: 4.96, p值: 0.08, 自由度: 2'
期望值:
[[ 9.10344828  2.48275862 12.4137931 ]
 [12.89655172  3.51724138 17.5862069 ]]
'2个单元格的期望计数小于5'
--------------------
'卡方值: 6.86, p值: 0.44, 自由度: 7'
期望值:
[[ 1.24137931  9.10344828  2.89655172  0.82758621  2.48275862  0.4137931
   6.20689655  0.82758621]
 [ 1.75862069 12.89655172  4.10344828  1.17241379  3.51724138  0.5862069
   8.79310345  1.17241379]]
'12个单元格的期望计数小于5'
--------------------


(6.858976317799849,
 0.443709570750047,
 7,
 array([[ 1.24137931,  9.10344828,  2.89655172,  0.82758621,  2.48275862,
          0.4137931 ,  6.20689655,  0.82758621],
        [ 1.75862069, 12.89655172,  4.10344828,  1.17241379,  3.51724138,
          0.5862069 ,  8.79310345,  1.17241379]]))

In [52]:
# 进行Yates校正
def perform_chi_square(test_table):
    chi2, p, dof, expected = chi2_contingency(test_table,correction=True)
    pprint(f"卡方值: {chi2:.2f}, p值: {p:.2f}, 自由度: {dof}")
    return chi2, p, dof,expected

perform_chi_square(decoration_windroses_table)
perform_chi_square(color_windroses_table)

'卡方值: 4.96, p值: 0.08, 自由度: 2'
'卡方值: 6.86, p值: 0.44, 自由度: 7'


(6.858976317799849,
 0.443709570750047,
 7,
 array([[ 1.24137931,  9.10344828,  2.89655172,  0.82758621,  2.48275862,
          0.4137931 ,  6.20689655,  0.82758621],
        [ 1.75862069, 12.89655172,  4.10344828,  1.17241379,  3.51724138,
          0.5862069 ,  8.79310345,  1.17241379]]))