In [40]:
import pandas as pd
import numpy as np

# merge data from media
data = pd.DataFrame({'url':[], 'title':[], 'date':[], 'press':[], 'article':[]})
file_list = ['joongang', 'hk', 'SBS', 'KBS', 'MBC', 'chosun', 'hani', 'khan', 'donga', 'mk']
for name in file_list:
    data = pd.concat([data, pd.read_csv(name + '.csv', engine='python', encoding='utf8')], sort=True)

# drop duplicates and filter too short articles
data = data.dropna(subset=['article', 'date', 'press']).drop_duplicates()
data = data.drop_duplicates(subset=['article'], keep='first')
data = data[data['article'].str.len() > 50].reset_index(drop=True)

In [41]:
data['date'] = data['date'].str.replace(' ', '').str.replace('-', '.')

data['article'] = data['article'].str.replace('아무개', '모')
data['article'] = data['article'].str.replace('[가-힣]{2,3}\s{1,3}기자', '')
data['article'] = data['article'].str.replace(r'\s[김이박최정강조윤장임한오서신권황안송전홍류고문양손배조'
                                              '백허유남심노하A-Z]{1}\s{0,1}[모]{0,1}\s{0,'
                                              '1}\({1}.{1,10}\){1}\s{0,1}(?P<name>[양군씨])', ' 김\g<1> ')
data['article'] = data['article'].str.replace(r'\s[김이박최정강조윤장임한오서신권황안송전홍류고문양손배조'
                                              '백허유남심노하A-Z]{1}\s{0,1}\({1}.{1,'
                                              '10}\){1}\s{0,1}[모]{0,1}\s{0,1}(?P<name>[양군씨])', ' 김\g<1> ')
data['article'] = data['article'].str.replace(r'\s[김이박최정강조윤장임한오서신권황안송전홍류고문양손배조'
                                              '백허유남심노하A-Z]{1}\s{0,1}\s{0,1}[모]{0,1}\s{0,1}(?P<name>[양군씨])', ' 김\g<1> ')

data['article'] = data['article'].str.replace(r'\s[김이박최정강조윤장임한오서신권황안송전홍류고문양손배조'
                                              '백허유남심노하A-Z]\s{0,1}모\s{0,1}씨{1}', ' 김모씨 ')
data['article'] = data['article'].str.replace(r'\s[김이박최정강조윤장임한오서신권황안송전홍류고문양손배조'
                                              '백허유남심노하A-Z]\s{0,1}모\s{0,1} ', ' 김모씨 ')

data['article'] = data['article'].str.replace(r'[연합뉴스] <저작권자 ⓒ 1980-2018 ㈜연합뉴스. 무단 전재 재배포 금지.>', ' ')
data['article'] = data['article'].str.replace(r'연합뉴스 모바일', ' ')
data['pos'] = data['article']
data.to_csv('merged.csv', encoding='utf8', index=False)

In [3]:
data = pd.read_csv('merged.csv')

In [42]:
data = data.dropna(subset=['title', 'article']).reset_index(drop=True)

In [43]:
data = data[(data.title.str.contains('성범죄|성폭행|강간|성폭력|성추행|성희롱')) | 
         (data.article.str.count('성범죄|성폭행|강간|성폭력|성추행|성희롱|추행|미투') >= 1)].reset_index(drop=True)

In [44]:
data['article'] = data['article'].str.replace(' [김은진주윤양현이최박권배날신엄원곽'
                                        '연구변우조오송석민천전정홍강임장한유서'
                                        '선황고노공차안나함모방성]\s{0,1}모{0,1}'
                                        '\s{0,1}씨 ', ' 김모씨 ')
data['article'] = data['article'].str.replace(' [김은진주윤양현이최박권배날신엄원곽'
                                        '연구변우조오송석민천전정홍강임장한유서'
                                        '선황고노공차안나함모방성]\s{0,1}모{0,1}'
                                        '\s{0,1}양 ', ' 김모양 ')
data['article'] = data['article'].str.replace(' [김은진주윤양현이최박권배날신엄원곽'
                                        '연구변우조오송석민천전정홍강임장한유서'
                                        '선황고노차안나함모방성]\s{0,1}모{0,1}'
                                        '\s{0,1}군 ', ' 김모군 ')

In [45]:
from konlpy.tag import Komoran
import re

pos_tagger = Komoran()

def pos(doc): 
    
    stopwords = ['기자', '대한', '위해', '이후', '이나', '에서', '로부터', '면서', 
             '상대로', '하자', '중이', '로서', '대로', '이기', '구독',
            '구독','글방', '한수진', '사회자', '무단', '전재', '복제',
             '콘텐츠', '허브', '배포', '저작권', '베스트', '추천', '포토', '연합뉴스',
             '래빗', '뉴스래빗', '기사', '중앙',  '한경', '한경닷컴', '한겨레', '경향',
             '동아', '동아일보', '조선', '조선일보', '중앙일보', '매일경제', '매일',
             '뉴스', '앵커', '있다', '있는', '하는', 
             '입니', '있었', '아니', '없다', '이런', '않았', '하지', '하기', '하게', 
             '하다', '이어', '하려', '관한', '있어', '이를', '않고', '않은', 
             '어떤', '많은', '하며', '어떻', '있지', '있던', '아닌', '있으', '있고', 
             '있도', '하거', '하던', '하도', '그러', '겁니', '보여', '있거']
    doc = re.sub('|'.join(stopwords), '', doc)
    return ['/'.join(t) for t in pos_tagger.pos(doc) 
            if re.search('NNG|NNP|VV|VA|MM', t[1]) != None and re.search('[0-9]', t[0]) == None]

In [48]:
data['pos'] = [pos(i) for i in data['pos']]

In [49]:
data['pos'] = [' '.join(i) for i in data['pos']]

In [50]:
data.pos = data.pos.str.replace('/VV|/VA', '다/VV').str.replace('/[A-Z]{2,3}', '')

In [51]:
for i in data[data.pos.str.contains('김 모 군') & data.pos.str.contains('육군|병장|상병|군대|헌병대|군기|부대|김군기')].index:
    data.loc[i, 'pos'] = re.sub(data.loc[i, 'pos'], '김 모 군', '육군')

In [52]:
data.pos = data.pos.str.replace(' 김 모양 ', ' 김양 ')
data.pos = data.pos.str.replace(' 김 모 군 ', ' 김군 ')
data.pos = data.pos.str.replace(' 김 모씨 | 김모씨 ', ' 김씨 ')
data.pos = data.pos.str.replace('[김은진주윤양현이최박권배신엄원곽'
                                        '연구변우조오송석민천전정홍강임장한유서'
                                        '황노차나방성]\s{0,1}모{0,1}'
                                        '\s{0,1}(?P<name>[양군씨])', '김\g<1>')
data.pos = data.pos.str.replace(' 양 ', ' 김양 ')
data.pos = data.pos.str.replace(' 씨 ', ' 김씨 ')

In [53]:
data['pos'] = data['pos'].str.replace('갈리아', '메갈리아')
data['pos'] = data['pos'].str.replace('드 루킹', '드루킹')
data['pos'] = data['pos'].str.replace('더불 민주', '더불어민주당')
data['pos'] = data['pos'].str.replace('와인 스티|와인 스타', '와인스타인')
data['pos'] = data['pos'].str.replace('스트로스 칸', '스트로스칸')
data['pos'] = data['pos'].str.replace('양 예원', '양예원')
data['pos'] = data['pos'].str.replace('승우 |여가 ', '')
data['pos'] = data['pos'].str.replace(' 여신 도 | 여신[도]{1,4} | 여신 ', ' 여신도 ')
data['pos'] = data['pos'].str.replace('피의자|용의자', '가해자')
data['pos'] = data['pos'].str.replace('\s{2,}', '\s')

In [54]:
for i in data[data.pos.str.contains('김군') & data.pos.str.contains('육군|병장|상병|군대|헌병대|군기|부대|김군기')].index:
    data.loc[i, 'pos'] = re.sub(data.loc[i, 'pos'], '김군', '육군')

In [61]:
data.pos = data.pos.str.replace('\s[김임최류황송정박강신염민허진윤권오노유엄홍우심문표옥추손]\s', ' 김씨 ')
data.pos = data.pos.str.replace(' 한 | 이 | 말 | 전 | 뒤 | 그 | 때 | 고 | 일 | 성 | 게 | 두 | 이 '
                                '| 하 | 록 | 후 | 안 | 앞 | 모 | 라 | 을 | 의 | 달 | 으 | 주 | 범 | 방 | 곳 | 다 |'
                                ' 면 | 당 | 직 | 날 | 미 | 위 | 반 | 도 | 부 | 약 | 단 | 끝 | 장 | 차 | 세 | 뒤 | 한 | 현 | 급 '
                                ' 사 | 실 | 뜻 | 시 | 요 | 올 | 인 | 새 | 옆 | 르 | 하다 | 되다 | 없다 | 가다 | 같다 ', ' ')
data.pos = data.pos.str.replace(r'\s{2,}', ' ')

In [62]:
import nltk
from nltk import FreqDist

tokens = [j for i in data['pos'] for j in i.split(' ')]
text = nltk.Text(tokens)
freq = FreqDist(text)
# sorted(freq.items(), key=lambda item: item[1], reverse=True)

In [63]:
[i for i in sorted(freq.items(), key=lambda item: item[1], reverse=True) if len(i[0])==1]

[('법', 31074),
 ('집', 30046),
 ('술', 18829),
 ('딸', 15779),
 ('글', 13965),
 ('돈', 13558),
 ('속', 13411),
 ('팀', 12517),
 ('몸', 10185),
 ('여', 9888),
 ('배', 9836),
 ('길', 9556),
 ('첫', 9301),
 ('밤', 8429),
 ('형', 8100),
 ('죄', 7772),
 ('말', 7673),
 ('군', 7668),
 ('남', 7453),
 ('입', 7203),
 ('총', 7132),
 ('사', 6806),
 ('책', 6734),
 ('몇', 6527),
 ('급', 6331),
 ('눈', 6212),
 ('밖', 6083),
 ('삶', 5786),
 ('답', 5706),
 ('목', 5649),
 ('힘', 5579),
 ('옷', 5436),
 ('각', 5170),
 ('발', 5113),
 ('물', 4874),
 ('귀', 4757),
 ('비', 4595),
 ('국', 4473),
 ('예', 4341),
 ('냐', 4283),
 ('병', 4241),
 ('로', 4215),
 ('지', 4183),
 ('관', 3981),
 ('피', 3953),
 ('토', 3839),
 ('해', 3830),
 ('선', 3801),
 ('순', 3744),
 ('영', 3661),
 ('과', 3653),
 ('상', 3613),
 ('감', 3578),
 ('기', 3487),
 ('잠', 3270),
 ('애', 3257),
 ('테', 3246),
 ('학', 3241),
 ('역', 3194),
 ('소', 3179),
 ('동', 3165),
 ('갑', 3137),
 ('불', 3085),
 ('금', 2961),
 ('탓', 2820),
 ('네', 2773),
 ('칸', 2700),
 ('드', 2618),
 ('극', 2560),
 ('꿈', 2549),
 ('처', 2534

날짜 단위로 잘라서 저장

In [64]:
data['date'] = [i[:7] for i in data['date']]

In [65]:
for i in data[data['date'].str.len() == 4]['date'].index:
    data.loc[i, 'date'] = data['date'][i] + '.01.01'
data = data[data['date'].str.len() >= 7].reset_index(drop=True)
data = data[data['date'].str.contains('[^\d.]') == False].reset_index(drop=True)

In [66]:
year = 2002
month = 1

In [69]:
import numpy as np
data['dindex'] = [np.nan for _ in range(0, len(data['url']))]
data['dindex'] = [12*(int(i.split('.')[0]) - year) + int(i.split('.')[1]) - month for i in data['date']]

In [70]:
data = data[data['date'] > '2002.01']

In [71]:
data['date'] = [i[:7] for i in data['date']]

In [72]:
data = data.dropna(subset=['article', 'date', 'press', 'pos'])

In [73]:
data.to_csv('processed_2002.csv', encoding='utf8', index=False)

In [50]:
import sys
import csv
import pandas as pd
data = pd.read_csv('processed_2002.csv', encoding='utf8', engine='python')