<a href="https://colab.research.google.com/github/ByungjunKim/NationStateChosun/blob/main/%5BColab%5DDBE_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DBE 분석결과 정리

In [None]:
# Install the dynamic_bernoulli_embeddings package from its GitHub repository
!pip install git+https://github.com/ByungjunKim/dynamic_bernoulli_embeddings.git

In [None]:
# Install (or upgrade to the latest) gdown, used to fetch the model files from Google Drive
!pip install -U gdown

In [None]:
import torch
import pandas as pd
from dynamic_bernoulli_embeddings.analysis import DynamicEmbeddingAnalysis
from dynamic_bernoulli_embeddings.training import train_model
from gensim.corpora import Dictionary
from tqdm import tqdm
from collections import Counter
import math
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from itertools import chain
import gdown
import pickle
!mkdir results # results 폴더 생성

In [None]:
# Download the trained embedding models
## Chosun Ilbo
# https://drive.google.com/file/d/1UmsDoOrA6ys0ic-xpEbGFBA-NaqHjdLu/view?usp=sharing
gdown.download(id='1UmsDoOrA6ys0ic-xpEbGFBA-NaqHjdLu',output='230222_chosun_emb.pkl')
## Dong-A Ilbo
# https://drive.google.com/file/d/1UqBehgGa_Ts9sGGLvVq-zw3f4njNNewt/view?usp=sharing
gdown.download(id='1UqBehgGa_Ts9sGGLvVq-zw3f4njNNewt',output='230128_donga_emb.pkl')

In [None]:
# Load the two pickled embedding objects from the working directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('./230222_chosun_emb.pkl', 'rb') as f:
    chosun_emb = pickle.load(f)
with open('./230128_donga_emb.pkl', 'rb') as f:
    donga_emb = pickle.load(f)

In [None]:
# Set the start year (min_year) and the period length (max_time)
min_year = 1920
max_time = 20

In [None]:
# Extract each year's most similar (high-context) words as a table
def time_neigh(emb, target, min_year, max_time, top_n):
    """Return a DataFrame of the target word's neighbours, one column per year.

    Args:
        emb: object exposing ``neighborhood(word, time_step, n)``.
        target: word whose neighbours are looked up.
        min_year: calendar year used to label time step 0.
        max_time: last time step to query (steps 0..max_time inclusive).
        top_n: number of neighbours requested per time step.

    Returns:
        pandas.DataFrame whose columns are the year labels (as strings) and
        whose cells hold whatever ``emb.neighborhood`` returns per neighbour.
    """
    neighbours_by_year = {
        str(min_year + step): emb.neighborhood(target, step, top_n)
        for step in range(max_time + 1)
    }
    return pd.DataFrame(neighbours_by_year)

In [None]:
# Quick check: yearly top-10 neighbours of '민족' from chosun_emb
time_neigh(chosun_emb,'민족',1920,20,10)

In [None]:
# Target words (the vocabulary under analysis)
target_words = [
    '민족', '국민', '국가', '독립', '문화', '문명',
    '인민', '신민', '노동자', '사회', '지주', '경제', '계급', '운동',
    '청년', '생활', '태도', '문학',
]

### 1. 조선일보

##### a. 단어별 부침(drift)

In [None]:
# Tabulate absolute_drift(n=100000) as columns 'drift' and '단어' (word), then export to Excel.
# NOTE(review): presumably n caps the number of words returned — confirm against the library.
chosun_drift = pd.DataFrame(chosun_emb.absolute_drift(n=100000),columns=['drift','단어'])
chosun_drift.to_excel('./results/chosun_drift.xlsx',index=None)

In [None]:
# Number of rows in the drift table
len(chosun_drift)

In [None]:
# Summary statistics of the drift table
chosun_drift.describe()

In [None]:
# Histogram of the drift values
chosun_drift['drift'].hist()

##### b. 주요 단어 맥락 변화

In [None]:
# Export each target word's yearly top-100 neighbours to one workbook, one sheet per word.
# The tables are collected first and written through a single writer: choosing 'w'/'a'
# from the loop index breaks when the first word is missing (the next word would append
# to a file that does not exist yet, or to a stale one from an earlier run).
chosun_neigh_tables = {}
for target_word in tqdm(target_words):
    try:
        chosun_neigh_tables[target_word] = time_neigh(chosun_emb, target_word, min_year, max_time, 100)
    except KeyError:
        # Word is not in the model's vocabulary: report it and move on.
        print(f'{target_word} 없음')

# A workbook with no sheets cannot be saved, so only write when something was found.
if chosun_neigh_tables:
    with pd.ExcelWriter('./results/chosun_time_neigh.xlsx', mode='w') as writer:
        for target_word, neigh_table in chosun_neigh_tables.items():
            neigh_table.to_excel(writer, sheet_name=target_word, index=None)

In [None]:
# Yearly top-30 neighbours of '민족' from chosun_emb
df = time_neigh(chosun_emb,'민족',1920,20,30)

In [None]:
# Neighbour words that differ between 1923 and 1924
words_1923 = {entry[0] for entry in df['1923']}
words_1924 = {entry[0] for entry in df['1924']}

# present only in 1923
print(words_1923 - words_1924)

# present only in 1924
print(words_1924 - words_1923)

In [None]:
# Neighbour words that differ between 1924 and 1925
words_1924 = {entry[0] for entry in df['1924']}
words_1925 = {entry[0] for entry in df['1925']}

# present only in 1924
print(words_1924 - words_1925)

# present only in 1925
print(words_1925 - words_1924)

In [None]:
# Neighbour words that differ between 1935 and 1936
words_1935 = {entry[0] for entry in df['1935']}
words_1936 = {entry[0] for entry in df['1936']}

# present only in 1935
print(words_1935 - words_1936)

# present only in 1936
print(words_1936 - words_1935)

##### 두 단어의 시계열에 따른 유사도 추이

In [None]:
def time_similarity(main_word, alter_words, max_time, emb=None, top_n=10000):
    """Track the similarity between ``main_word`` and each comparison word over time.

    Every returned list has exactly ``max_time + 1`` entries, one per time step.
    When a comparison word is not among the ``top_n`` neighbours at some step,
    that slot holds NaN. Dropping the entry instead would shift every later
    value to the wrong year once the lists are turned into a table.

    Args:
        main_word: word whose neighbourhood is queried.
        alter_words: iterable of words to compare against ``main_word``.
        max_time: last time step to query (steps 0..max_time inclusive).
        emb: object exposing ``neighborhood(word, time_step, n)``; defaults to
            the notebook-level ``chosun_emb`` for backward compatibility.
        top_n: number of neighbours requested per time step.

    Returns:
        dict mapping each comparison word to its list of per-step scores.
    """
    if emb is None:
        emb = chosun_emb  # original behaviour: always use the Chosun model

    simil = {word: [] for word in alter_words}
    for step in range(max_time + 1):
        # One neighbourhood query per time step, shared by all comparison words.
        scores = {}
        for neighbour in emb.neighborhood(main_word, step, top_n):
            scores.setdefault(neighbour[0], neighbour[1])
        for word, series in simil.items():
            series.append(scores.get(word, float('nan')))
    return simil

In [None]:
# Yearly similarity between '민족' and each comparison word (rows: years, columns: words).
nation = pd.DataFrame.from_dict(
    time_similarity(
        '민족',
        ['인종','국가','국민','국어','언어','민중','인민','백성','신민','동포','나라','조선','독립','사회주의','자본주의','문학','문명','문화','근대'],
        max_time,
    ),
    orient='index',
).T
# Derive the year labels from the same settings that determine the number of rows,
# so the index cannot fall out of step with max_time.
nation.index = list(range(min_year, min_year + max_time + 1))
# nation.to_excel('./results/민족_time_similarity.xlsx')
nation

In [None]:
# Summary statistics for each comparison word's similarity series
nation.describe()

In [None]:
# Yearly similarity between '국민' and each comparison word (rows: years, columns: words).
nation_국민 = pd.DataFrame.from_dict(
    time_similarity(
        '국민',
        ['인종','국가','민족','국어','언어','민중','인민','백성','신민','동포','나라','조선','독립','사회주의','자본주의','문학','문명','문화','근대'],
        max_time,
    ),
    orient='index',
).T
# Derive the year labels from the same settings that determine the number of rows,
# so the index cannot fall out of step with max_time.
nation_국민.index = list(range(min_year, min_year + max_time + 1))
nation_국민.to_excel('./results/국민_time_similarity.xlsx')
nation_국민

In [None]:
# Yearly similarity between '국가' and each comparison word (rows: years, columns: words).
state = pd.DataFrame.from_dict(
    time_similarity(
        '국가',
        ['인종','국민','민족','국어','언어','민중','인민','백성','신민','동포','나라','조선','독립','사회주의','자본주의','문학','문명','문화','근대'],
        max_time,
    ),
    orient='index',
).T
# Derive the year labels from the same settings that determine the number of rows,
# so the index cannot fall out of step with max_time.
state.index = list(range(min_year, min_year + max_time + 1))
state.to_excel('./results/국가_time_similarity.xlsx')
state

##### c. 단어 변곡점

In [None]:
# Change-point table for chosun_emb with columns '연도' (year), '단어' (word) and 'change'.
chosun_change_point = pd.DataFrame(
    chosun_emb.change_points(n=1000000),
    columns=['연도', '단어', 'change'],
)
# Shift the raw '연도' values by 1920 so they read as calendar years.
chosun_change_point['연도'] = chosun_change_point['연도'].add(1920)
chosun_change_point
# chosun_change_point.to_excel('./results/chosun_change_point.xlsx',index=None)

In [None]:
# Change points of the target words, sorted by word then change (both descending)
chosun_change_point_target = (
    chosun_change_point.loc[chosun_change_point['단어'].isin(target_words)]
    .sort_values(by=['단어', 'change'], ascending=False)
    .reset_index(drop=True)
)
chosun_change_point_target.to_excel('./results/chosun_change_point_target.xlsx', index=None)

In [None]:
# @title
# Check that every word has 20 change points
chosun_change_point_target.value_counts('단어')

In [None]:
# Change points of '태도', ordered by year
chosun_change_point_target[chosun_change_point_target['단어']=='태도'].sort_values(by=['연도'])

In [None]:
# Export the full target-word change-point table to its own workbook
chosun_change_point_target.to_excel('./results/chosun_change_point_target_all.xlsx',index=None)

In [None]:
# Write each target word's change points, ordered by year, to its own sheet.
# One writer is kept open for the whole loop; reopening the workbook in append mode
# for every word reloads and rewrites the whole file each time.
# NOTE(review): this path is also written by an earlier cell (flat table); this loop replaces that file.
with pd.ExcelWriter('./results/chosun_change_point_target.xlsx', mode='w') as writer:
    for target_word in tqdm(target_words):
        word_points = chosun_change_point_target[chosun_change_point_target['단어'] == target_word]
        word_points.sort_values(by=['연도']).to_excel(writer, sheet_name=target_word, index=None)

In [None]:
# Target-word change points: the row with the largest 'change' for each word, ordered by year.
# idxmax() returns index labels, so the rows are selected with label-based .loc;
# positional .iloc only gives the same rows while the index happens to be 0..n-1.
chosun_change_max_idx = chosun_change_point_target.groupby(['단어'])['change'].idxmax().tolist()
chosun_change_point_target.loc[chosun_change_max_idx].sort_values('연도')

In [None]:
# Mean change per year across all words: exported to Excel, then displayed
chosun_yearly_mean_change = chosun_change_point.groupby(['연도'])['change'].mean()
chosun_yearly_mean_change.to_excel('./results/chosun_change_point_모든단어 연도별 평균.xlsx')
chosun_yearly_mean_change

In [None]:
# Per-word summary statistics of 'change'
chosun_change_point_des = chosun_change_point.groupby(['단어'])['change'].describe()

In [None]:
# Coefficient of variation (std / mean)
chosun_change_point_des['cv'] = chosun_change_point_des['std'] / chosun_change_point_des['mean']

In [None]:
# Words ranked by coefficient of variation ('cv'), largest first: exported, then displayed
chosun_change_point_des_by_cv = chosun_change_point_des.sort_values(by=['cv'], ascending=False)
chosun_change_point_des_by_cv.to_excel('./results/chosun_change_point_des_all.xlsx')
chosun_change_point_des_by_cv

In [None]:
# chosun_change_point_target.groupby(['단어'])['change'].describe().to_excel('./results/chosun_change_point_des.xlsx')
# Summary statistics of 'change' for the target words, sorted by standard deviation (descending)
chosun_change_point_target.groupby(['단어'])['change'].describe().sort_values(by=['std'],ascending=False)

### 2. 동아일보

##### a. 단어별 부침(drift)

In [None]:
# Tabulate absolute_drift(n=5000) as columns 'drift' and '단어' (word), then export to Excel.
# NOTE(review): n differs from the Chosun cell (100000) — confirm this is intentional.
donga_drift = pd.DataFrame(donga_emb.absolute_drift(n=5000),columns=['drift','단어'])
donga_drift.to_excel('./results/donga_drift.xlsx',index=None)

##### b. 주요 단어 맥락 변화

In [None]:
# Export each target word's yearly top-100 neighbours to one workbook, one sheet per word.
# The tables are collected first and written through a single writer: choosing 'w'/'a'
# from the loop index breaks when the first word is missing (the next word would append
# to a file that does not exist yet, or to a stale one from an earlier run).
donga_neigh_tables = {}
for target_word in tqdm(target_words):
    try:
        donga_neigh_tables[target_word] = time_neigh(donga_emb, target_word, min_year, max_time, 100)
    except KeyError:
        # Word is not in the model's vocabulary: report it and move on.
        print(f'{target_word} 없음')

# A workbook with no sheets cannot be saved, so only write when something was found.
if donga_neigh_tables:
    with pd.ExcelWriter('./results/donga_time_neigh.xlsx', mode='w') as writer:
        for target_word, neigh_table in donga_neigh_tables.items():
            neigh_table.to_excel(writer, sheet_name=target_word, index=None)

##### c. 단어 변곡점

In [None]:
# Change-point table for donga_emb with columns '연도' (year), '단어' (word) and 'change'.
donga_change_point = pd.DataFrame(
    donga_emb.change_points(n=150000),
    columns=['연도', '단어', 'change'],
)
# Shift the raw '연도' values by 1920 so they read as calendar years.
donga_change_point['연도'] = donga_change_point['연도'].add(1920)
donga_change_point.to_excel('./results/donga_change_point.xlsx', index=None)

In [None]:
# Change points of the target words, sorted by word then change (both descending)
donga_change_point_target = (
    donga_change_point.loc[donga_change_point['단어'].isin(target_words)]
    .sort_values(by=['단어', 'change'], ascending=False)
    .reset_index(drop=True)
)
donga_change_point_target.to_excel('./results/donga_change_point_target.xlsx', index=None)

In [None]:
# Target-word change points: the row with the largest 'change' for each word, ordered by year.
# idxmax() returns index labels, so the rows are selected with label-based .loc;
# positional .iloc only gives the same rows while the index happens to be 0..n-1.
donga_change_max_idx = donga_change_point_target.groupby(['단어'])['change'].idxmax().tolist()
donga_change_point_target.loc[donga_change_max_idx].sort_values('연도')