<a href="https://colab.research.google.com/github/ByungjunKim/CRKMLS/blob/main/%EA%B9%80%EC%9C%A4%EC%8B%9D_%EC%B9%B4%EC%9D%B4%EC%A0%9C%EA%B3%B1%EA%B2%80%EC%A0%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 연구자 인구사회학적 정보에 따른 김윤식 인용 여부 카이제곱검정

In [None]:
# Colab 등에서 필요한 경우 주석해제 후 실행
# !git clone https://github.com/ByungjunKim/CRKMLS.git
# cd CRKMLS

In [None]:
# 필요시 패키지 설치
# !pip install -U scipy statsmodels pandas

In [None]:
# 필요 패키지 로드
import pandas as pd # Pandas 
from scipy.stats import chi2_contingency # 카이제곱 검정
from statsmodels.stats.multitest import multipletests # 사후 검정
from itertools import combinations
import scipy.stats

In [None]:
# Load the study data set from the Excel workbook into a DataFrame.
# NOTE(review): presumably one row per paper, judging by the file name — confirm.
df = pd.read_excel('./data/한국현대문학_연구대상_논문.xlsx')
# Bare expression: displays the frame as the cell output.
df

In [None]:
# Kim Yun-sik citation flag (1 if cited, 0 otherwise): count of rows per value
df['김윤식_인용'].value_counts()

### 1. 성별

In [None]:
# Frequency of each gender value (value_counts skips missing values by default)
df['gender'].value_counts()

In [None]:
# Build the contingency table: gender (rows) x Kim Yun-sik citation flag (columns).
# The column relabeling assumes the flag takes exactly two values that sort as
# 0 (not cited) then 1 (cited); any other number of columns raises a length error.
gender_kim = pd.crosstab(index=df['gender'], columns=df['김윤식_인용']).set_axis(
    ['김윤식_비인용', '김윤식_인용'], axis=1
)
gender_kim

In [None]:
# Chi-square test of independence between gender and the citation flag.
# The displayed result holds the test statistic, p-value, degrees of freedom
# and the expected frequencies.
# Reference: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html
chi2_contingency(gender_kim)

### 2. 최종 졸업학교

In [None]:
# Share of each final graduating school (normalize=True returns proportions instead of counts)
df['grad'].value_counts(normalize=True)

In [None]:
# Indicator column: 1.0 if the final school is Seoul National University, else 0.0.
# Stored as float to match what the two masked assignments used to produce.
# NOTE(review): rows with a missing `grad` are coded 0 (non-SNU), because a
# missing value never compares equal to the school name — confirm this is intended.
df['snu'] = (df['grad'] == '서울대학교').astype(float)

In [None]:
# SNU graduates vs. non-graduates: count of rows per flag value
df['snu'].value_counts()

In [None]:
# Build the contingency table: SNU flag (rows) x Kim Yun-sik citation flag (columns).
# Both relabelings are positional: rows are expected to sort as 0 (non-SNU) then
# 1 (SNU), columns as 0 (not cited) then 1 (cited).
snu = (
    pd.crosstab(index=df['snu'], columns=df['김윤식_인용'])
    .set_axis(['비서울대', '서울대'], axis=0)
    .set_axis(['김윤식_비인용', '김윤식_인용'], axis=1)
)
snu

In [None]:
# Chi-square test of independence between SNU graduation and the citation flag.
# NOTE: the table is 2x2, so SciPy's default continuity (Yates) correction applies.
chi2_contingency(snu)

### 3. 세대

In [None]:
# Turn the stored birth year into a four-digit year by adding 1900 (e.g. 65 -> 1965).
# Rows with a missing birth year are skipped and stay NaN.
# Assumes `birth` is recorded as an offset from 1900 — TODO confirm against the data file.
# Values that are already >= 1900 are kept unchanged, so re-running this cell
# (which mutates df['birth'] in place) does not add 1900 a second time.
df.loc[df['birth'].notna(), 'birth'] = (
    df.loc[df['birth'].notna(), 'birth']
    .map(lambda year: int(year) if int(year) >= 1900 else 1900 + int(year))
)
df['birth']

In [None]:
# Bucket the birth year into generation labels (only rows that have a birth year).
# right=False makes each bin left-closed / right-open, e.g. [1940, 1950) -> '40년대생'.
# Years outside [1900, 2000) match no bin and are left missing.
df.loc[df['birth'].notna(), 'birth_gen'] = pd.cut(
    df.loc[df['birth'].notna(), 'birth'],
    bins=[1900, 1940, 1950, 1960, 1970, 1980, 1990, 2000],
    labels=['40년생 이전', '40년대생', '50년대생', '60년대생', '70년대생', '80년대생', '90년생 이후'],
    right=False,
)

In [None]:
# Number of rows in each birth generation
df['birth_gen'].value_counts()

In [None]:
# Build the contingency table: birth generation (rows) x Kim Yun-sik citation flag (columns).
# The first row is dropped by POSITION before relabeling the columns.
# NOTE(review): which generation ends up first depends on how the crosstab orders
# its index — presumably the earliest one ('40년생 이전'); confirm on the displayed table.
birth = pd.crosstab(index=df['birth_gen'], columns=df['김윤식_인용']).iloc[1:]
birth = birth.set_axis(['김윤식_비인용', '김윤식_인용'], axis=1)
birth

In [None]:
# Chi-square test of independence between birth generation and the citation flag
chi2_contingency(birth)

In [None]:
# Post-hoc analysis: pairwise chi-square tests between birth generations,
# followed by a Bonferroni correction for multiple comparisons.
# Reference: https://www.kaggle.com/code/satyads/anova-tukey-chi-square-bonferroni-pearson-lasso/notebook#Chi-Square-Test-of-Significance
# NOTE: the 'pair of months' column label is kept as-is, but it holds pairs of
# birth generations (taken from birth.index); the wording presumably comes from
# the referenced example.

# Every unordered pair of generations present in the `birth` contingency table
pairs_of_birth = list(combinations(birth.index.tolist(), 2))

# Uncorrected p-value of the chi-square test for each pair of generations
p_vals_chi = []
for gen_a, gen_b in pairs_of_birth:
    # Keep only the rows that belong to one of the two generations being compared
    pair_rows = df[df['birth_gen'].isin([gen_a, gen_b])]
    pair_table = pd.crosstab(pair_rows['birth_gen'], pair_rows['김윤식_인용'])
    # Element 1 of the result is the p-value
    p_vals_chi.append(chi2_contingency(pair_table)[1])

# Bonferroni adjustment at alpha=0.01: element 0 holds the reject / fail-to-reject
# decisions, element 1 the corrected p-values.
multi_test_results_bonferroni = multipletests(p_vals_chi, method='bonferroni', alpha=0.01)

# One row per pair: raw p-value, corrected p-value and the test decision
bonferroni_results = pd.DataFrame({
    'pair of months': pairs_of_birth,
    'original p value': p_vals_chi,
    'corrected p value': multi_test_results_bonferroni[1],
    'Reject Null?': multi_test_results_bonferroni[0],
})
bonferroni_results