In [7]:
import pandas as pd
from collections import Counter, defaultdict
import pickle

In [2]:
data = pd.read_csv('data/raw/MK_2018_No_0_to_10000.csv')

with open('stock_name_ls.pickle', 'rb') as f:
    stock_name_ls = pickle.load(f)

In [4]:
def drop_uselsess_data(data):
    useless_keyword_ls = ['신년사]','[인사]','[포토]','포토','MK포토']
    
    index_ls = data.index
    title_ls = data['Title'].tolist()
    
    drop_index_ls = []
    for idx, title in zip(index_ls, title_ls):
        if any(keyword in title for keyword in useless_keyword_ls):
            drop_index_ls.append(idx)
            
    return data.drop(drop_index_ls)

def reclassify_categories(data, input_category, output_category):
    '''
    input_category에 해당하는 카테고리를 output_category로 변환하는 함수입니다.
    
    inputs
    =================================
    data : pandas.DataFrame
        크롤링을 마친 raw data 상태의 DataFrame
    
    input_category : str, list
        재분류 전 카테고리
        
    output_category : str, list
        재분류 후 카테고리
    '''
    if type(input_category) == str:
        input_category = [input_category]
    
    # category reclassification
    data.loc[data['Section'].isin(input_category), 'Section'] = output_category
    return data


def to_business(data, stock_name_ls):
    '''
    economy, special_edition, health의 일부 기사를 business로 재분류 
    retail, it, financial, electronics, autos, chemistry, heavy_industries의 모든 기사를 기업(business)로 재분류
    
    inputs
    =================================
    data : pandas.DataFrame
        크롤링을 마친 raw data 상태의 DataFrame
    
    stock_name_ls : list,
        기업명이 str형태로 저장된 list
    '''
    
    # economy와 health 기사 제목에 상장종목 명이 포함된 경우 business로 재분류
    section_ls = ['economy','special_edition', 'health']
    temp_df = data.loc[data['Section'].isin(section_ls)]

    index_ls = temp_df.index
    title_ls = temp_df['Title'].tolist()
        
    reclassification_idx_ls = []    
    
    for idx, title in zip(index_ls, title_ls):
        if any(stock_name in title for stock_name in stock_name_ls):
            reclassification_idx_ls.append(idx)
            
    data.loc[reclassification_idx_ls, 'Section'] = 'business'
    
    
    
    # retail, it, financial, electronics, autos, chemistry, heavy_industries는 전부 다 business로 재분류
    input_category_ls = ['retail','it','financial', 'electronics', 'autos', 'chemistry', 'heavy_industries']
    
    data = reclassify_categories(data, input_category_ls, 'business')
    return data

def to_stock(data):
    '''
    경제, 기업, health, special_edition 기사의 일부를
    증권(stock)기사로 재분류 하는 함수입니다.
    
    inputs
    =================================
    data : pandas.DataFrame
        크롤링을 마친 raw data 상태의 DataFrame
    '''
    section_ls = ['economy', 'business','health','special_edition']
    temp_df = data.loc[data['Section'].isin(section_ls)]
    
    index_ls = temp_df.index
    title_ls = temp_df['Title'].tolist()
        
    to_stock_idx_ls = []
    keyword_ls = ['코스피', '코스닥', '증시','주가','주식','목표가','상장','특징주', '증자', 
                  '영업익', '공시', '지분', '매출','이익', 'Hot-Line', '펀드',
                  '키움증권','NH투자','KB증권','미래에셋대우','신한금투','대신증권', 'KTB투자증권',
                  '한투증권', '현대차투자증권', '유안타증권', '유진투자', '메리츠종금']

    for idx, title in zip(index_ls, title_ls):
        if any(keyword in title for keyword in keyword_ls):
            to_stock_idx_ls.append(idx) 
            
    data.loc[to_stock_idx_ls, 'Section'] = 'stock'
    return data

def to_economy(data):
    '''
    기업, 증권 기사에서 경제(economy)기사로 재분류 하는 함수입니다.
    
    inputs
    =================================
    data : pandas.DataFrame
        크롤링을 마친 raw data 상태의 DataFrame
    '''
    section_ls = ['stock', 'business', 'special_edition']
    temp_df = data.loc[data['Section'].isin(section_ls)]
    
    index_ls = temp_df.index
    title_ls = temp_df['Title'].tolist()
        
    reclassification_ls = []
    keyword_ls = ['경제', '업종','환율','핀테크','산업혁명','가상화폐','비트코인', '금리','유가',
                  '임금',]

    for idx, title in zip(index_ls, title_ls):
        if any(keyword in title for keyword in keyword_ls):
            reclassification_ls.append(idx)
    
    data.loc[reclassification_ls, 'Section'] = 'stock'
    return data

def reclassify_culture(data):
    '''
    entertainment와 culture 기사에서 culture & art로 재분류 하는 함수입니다.
    inputs
    =================================
    data : pandas.DataFrame
        크롤링을 마친 raw data 상태의 DataFrame
    '''
    # 1. 일기예보(weather-forecast) 분류
    temp_df = data[data['Section'] == 'culture']
    index_ls = temp_df.index
    title_ls = temp_df['Title'].tolist()
    
    keyword_ls = ['기온', '날씨','온도', '영하', '한파', '눈', '추위', '폭설', '적설량', '대설', 
                  '영상', '낮','폭염', '비','더위','폭우','강수량', '장마',
                  '쌀쌀','맑고','구름','미세먼지','안개','바람','찜통']
    
    reclassification_index_ls = []
    for idx, title in zip(index_ls, title_ls):
        if any(keyword in title for keyword in keyword_ls):
            reclassification_index_ls.append(idx)
    
    data.loc[reclassification_index_ls, 'Section'] = 'weather-forecast'
    
    
    # 2. culture & art 분류
    section_ls = ['entertainment','culture']
    temp_df = data[data['Section'].isin(section_ls)]
    
    keyword_ls = ['박물관','미술','전시','신간','작품','피아니스트','바이올','아트',
              '예술','유물','展','소설','수필','문학','에세이','발간','출간','사진',
              '뮤지컬','영화','개봉','완간']
    
    index_ls = temp_df.index
    title_ls = temp_df['Title'].tolist()
    
    reclassfication_ls = []
    for idx, title in zip(index_ls, title_ls):
        if any(keyword in title for keyword in keyword_ls):
            reclassfication_ls.append(idx)
    
    data.loc[reclassfication_ls, 'Section'] = 'culture & art'
    
    return data.drop(data[data['Section'] == 'culture'].index)

def drop_categories(data, drop_category_ls):
    '''
    ' ', people, opinion, special_edition 카테고리 및, 전체 비중의 0.1% 이하를 차지하는 기사를 전부 제거
    
    inputs
    =================================
    data : pandas.DataFrame
        크롤링을 마친 raw data 상태의 DataFrame
        
    drop_category_ls : str, list
        제거 대상 category 목록
    '''
    
    # 원하지 않는 카테고리는 제거
    if type(drop_category_ls) == str:
        drop_category_ls = [drop_category_ls]
    
    drop_index_ls = data[data['Section'].isin(drop_category_ls)].index
    data.drop(drop_index_ls, inplace = True)
    
    
    # 전체 비중의 0.1% 이하의 카테고리 제거
    ratio_huddle = len(data) // 1000
    
    total_category_ls = list(set(data['Section']))
    counter = Counter(data['Section'])
    
    useful_category_ls = []
    for category in total_category_ls:
        if counter[category] > ratio_huddle:
            useful_category_ls.append(category)
    
    return data.loc[data['Section'].isin(useful_category_ls)]

In [11]:
data = drop_uselsess_data(data)

data = reclassify_categories(data, 
                            ['tv_broadcasting', 'entertainment_topic', 'broadcasting', 'hot_issue', 'music', 'overseas_etn'], 
                            'entertainment')

data = reclassify_categories(data, 'golf', 'sports')
data = reclassify_categories(data, ['movie','performance'], 'culture & art')
data = reclassify_categories(data, 'patent', 'technology')

data = to_business(data, stock_name_ls)

data = to_stock(data)

data = to_economy(data)

data = reclassify_culture(data)

data = drop_categories(data, [' ','opinion','people','special_edition'])

In [16]:
counter = Counter(data['Section']).most_common()

In [17]:
counter

[('stock', 1520),
 ('society', 1202),
 ('entertainment', 994),
 ('business', 846),
 ('politics', 708),
 ('world', 667),
 ('sports', 468),
 ('economy', 404),
 ('estate', 164),
 ('culture & art', 134),
 ('travel', 93),
 ('weather-forecast', 81),
 ('health', 48),
 ('technology', 41)]

In [18]:
data.shape

(7370, 6)

In [19]:
from ko_text import *

In [20]:
nlp = NLP()

# 텍스트 클렌징을 위한 정규표현식 or 구문 추가

In [21]:
regex_ls = ['여기를 누르시면 크게 보실 수 있습니다']

In [22]:
nlp.add_regex(regex_ls)

# 불용어(stopwords) 사전 추가

In [23]:
stopword_ls = ['googletagdisplay',
               'windowjQuery',
               'documentwrite',
               ]

In [24]:
nlp.add_stopwords(stopword_ls)

# Tokenizing

In [25]:
token_doc_ls = nlp.extract_morphs_for_all_document_FAST_VERSION(data['Text'].tolist(),
                                                                n_thread = 4)

data = data

data['Token'] = token_doc_ls

## Token 분포 확인

In [26]:
counter = Counter([token for doc in token_doc_ls for token in doc])

In [None]:
counter.most_common()

In [None]:
'''
for token in np.random.choice(data['Token'],100):
    input('아무 키나 ')
    print(' '.join(token))
    '''

# 중복되는 기사는 제거

In [29]:
# index 초기화
data.index = np.arange(len(data))

In [30]:
text_ls = []
unique_idx_ls = []

for idx, text in enumerate(data['Text']):
    if not text in text_ls:
        text_ls.append(text)
        unique_idx_ls.append(idx)

In [31]:
print(len(data), len(unique_idx_ls))

7370 7247


In [32]:
data = data.loc[unique_idx_ls]
data.index = np.arange(len(data))

# 토큰 길이 비교

In [33]:
len_ls = []

for token in data['Token']:
    len_ls.append(len(token))

data['num_token'] = len_ls

In [34]:
data.groupby('Section')['num_token'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Section,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
business,843.0,161.62159,122.209004,0.0,79.0,132.0,219.5,762.0
culture & art,134.0,143.768657,128.294534,0.0,56.25,113.0,194.0,726.0
economy,403.0,198.307692,158.117658,0.0,105.0,171.0,255.5,2058.0
entertainment,994.0,97.725352,103.180383,0.0,5.0,86.0,139.0,971.0
estate,164.0,189.469512,108.964081,22.0,121.25,170.0,228.0,849.0
health,48.0,241.604167,176.790866,54.0,123.75,197.0,274.25,760.0
politics,652.0,199.694785,132.15329,0.0,104.0,166.0,268.75,884.0
society,1187.0,152.967144,105.868765,0.0,83.0,128.0,200.0,836.0
sports,468.0,159.326923,105.403609,2.0,86.75,136.0,208.0,755.0
stock,1483.0,73.679703,105.173486,0.0,7.0,16.0,97.5,1284.0


In [44]:
# 토큰의 길이가 30개 이상인 표본만 추출
data = data[data['num_token'] > 30]

In [46]:
token_df = data

# Train Test Split

In [47]:
train_size = round(len(token_df) * 0.8)
np.random.seed(0)
train_index_ls = np.random.choice(token_df.index, train_size, replace = False)
test_index_ls = [x for x in token_df.index if not x in train_index_ls]

In [48]:
train_df = token_df.loc[train_index_ls]
train_df.shape

(4648, 8)

In [49]:
test_df = token_df.loc[test_index_ls]
test_df.shape

(1162, 8)

In [71]:
Counter(train_df['Section'])

Counter({'business': 600,
         'culture & art': 89,
         'economy': 311,
         'entertainment': 533,
         'estate': 134,
         'health': 37,
         'politics': 498,
         'society': 885,
         'sports': 348,
         'stock': 535,
         'technology': 31,
         'travel': 72,
         'weather-forecast': 67,
         'world': 508})

In [72]:
Counter(test_df['Section'])

Counter({'business': 155,
         'culture & art': 24,
         'economy': 67,
         'entertainment': 117,
         'estate': 29,
         'health': 11,
         'politics': 132,
         'society': 231,
         'sports': 77,
         'stock': 133,
         'technology': 9,
         'travel': 21,
         'weather-forecast': 9,
         'world': 147})

# sampling for training classfier

In [56]:
# 한 label마다 학습할 단어의 수
train_batch_size = 300
test_batch_size = 30

train_token_ls_split, train_tag_ls_split = nlp.oversample_batch(train_df['Token'], train_df['Section'], train_batch_size)
test_token_ls_split, test_tag_ls_split =  nlp.oversample_batch(test_df['Token'],test_df['Section'], test_batch_size)

In [57]:
print(len(train_token_ls_split))
Counter(train_tag_ls_split)

4200


Counter({'business': 300,
         'culture & art': 300,
         'economy': 300,
         'entertainment': 300,
         'estate': 300,
         'health': 300,
         'politics': 300,
         'society': 300,
         'sports': 300,
         'stock': 300,
         'technology': 300,
         'travel': 300,
         'weather-forecast': 300,
         'world': 300})

In [58]:
Counter(test_tag_ls_split)

Counter({'business': 30,
         'culture & art': 30,
         'economy': 30,
         'entertainment': 30,
         'estate': 30,
         'health': 30,
         'politics': 30,
         'society': 30,
         'sports': 30,
         'stock': 30,
         'technology': 30,
         'travel': 30,
         'weather-forecast': 30,
         'world': 30})

<br>


# 잘 뽑혔는지 확인

In [63]:
' '.join(train_token_ls_split[1])

'이병철 KTB 투자 증권 부회장 성문 회장 전량 매수 최대 주주 올라서게 이로써 불거진 KTB 투자 증권 경영 분쟁 논란 부회장 승리 맺음 금융투자 업계 부회장 측은 회장 최대 주주 변경 논의 회장 보유 전량 매수 합의 부회장 KTB 투자 증권 최대 주주 올라서게 회장 의결권 주식 기준 가운데 사들이기로 계약금 입금 완료 아울러 회장 추가 매입 나머지 역시 회장 요구 대로 주당 이자 더해 분할 매수 했다이밖에 회장 요구 회장 비서실 임원 직원 여명 고용 보장 합의 합의 부회장 주주 에서 주주 올라서게 회장 경영 일선 에서 물러날 전망'

In [64]:
train_tag_ls_split[1]

'stock'

# Doc2Vec Parameter 튜닝

## **set ALPHA as default**

In [65]:
token_ls = train_df['Token'].tolist() + test_df['Token'].tolist()
label_ls = train_df['Section'].tolist() + test_df['Section'].tolist()

In [66]:
len(token_ls), len(label_ls)

(5810, 5810)

In [None]:
%%time

result_dict = {
                'corpus_count' : [],
               'min_count' : [],
               'vector_size' : [],
               'window' : [],
               'n_epochs' : [],
               'accuracy' : [],
               'sample' : [],
               'dm' : [],
              }

testing_section_ls = ['경제','기업','사회','국제','부동산','증권','정치','IT과학','문화']

# 하이퍼 파라미터 튜닝 작업 수행

for dm in [1]:
    for doc2vec_size in ['']:
        if doc2vec_size == '':
            x_split, y_split = token_ls, label_ls

        for sample in [1e-04, 1e-05, 1e-06]:
            for min_count in [1, 5, 15, 50]:
                for vector_size in [100,300]:
                    for window in [5,15]:
                        for n_epochs in [10]:

                            # Doc2Vec 모델 생성
                            nlp.make_Doc2Vec_model(dm = dm,
                                                   min_count = min_count,
                                                   sample = sample,
                                                   vector_size = vector_size,
                                                   window = window,
                                                   dm_mean = 0,
                                                   dm_concat = 0)

                            nlp.build_and_train_Doc2Vec_model(x_split,
                                                              y_split,
                                                              n_epochs = n_epochs)


                            model_name = 'Doc2Vec_dm=%s&cc=%s&vs=%s&win=%s&min=%s&sample=%s&epochs=%s'%(\
                                                                                                                 nlp.Doc2Vec_model.dm,
                                                                                                                   nlp.Doc2Vec_model.corpus_count,
                                                                                                                   nlp.Doc2Vec_model.vector_size,
                                                                                                                   nlp.Doc2Vec_model.window,
                                                                                                                   nlp.Doc2Vec_model.min_count,
                                                                                                                   nlp.Doc2Vec_model.sample,
                                                                                                                   nlp.Doc2Vec_model.epochs)
                            # Doc2Vec 모델 저장
                            # nlp.Doc2Vec_model.save('Doc2Vec_model/'+model_name)
                            X =nlp.infer_vectors_with_Doc2Vec(train_token_ls_split, alpha = 0.1)

                            tsne= TSNE(n_components=2)
                            X_tsne = tsne.fit_transform(X)
                            scatter_df = pd.DataFrame(X_tsne,
                                                      index = train_tag_ls_split,
                                                      columns = ['x','y'])

                            plt.figure(figsize = (10, 10))

                            for i,section in enumerate(set(test_df['Section'])):
                                temp_df = scatter_df[scatter_df.index == section]
                                plt.scatter(temp_df['x'].values, temp_df['y'].values, label = section, c = np.random.rand(3,))

                            plt.legend(loc = 'best')
                            plt.savefig('추정된 벡터 분포 t-sne ver')
                            
                            # clf를 각 레이블별 1000개씩 학습, 
                            X_train = nlp.infer_vectors_with_Doc2Vec(train_token_ls_split)
                            y_train = train_tag_ls_split

                            X_test = nlp.infer_vectors_with_Doc2Vec(test_token_ls_split)
                            y_test = test_tag_ls_split


                            clf = LogisticRegression(solver = 'sag',
                                                     multi_class = 'multinomial')


                            clf.fit(X_train, y_train)
                            y_pred = clf.predict(X_test)
                            
                            result_dict['dm'].append(nlp.Doc2Vec_model.dm)
                            result_dict['corpus_count'].append(nlp.Doc2Vec_model.corpus_count)
                            result_dict['min_count'].append(nlp.Doc2Vec_model.min_count)
                            result_dict['vector_size'].append(nlp.Doc2Vec_model.vector_size)
                            result_dict['window'].append(nlp.Doc2Vec_model.window)
                            result_dict['n_epochs'].append(nlp.Doc2Vec_model.epochs)
                            result_dict['sample'].append(nlp.Doc2Vec_model.sample)
                            result_dict['accuracy'].append(accuracy_score(y_pred, y_test))

                            print(accuracy_score(y_pred, y_test))

                            
            pd.DataFrame(result_dict).to_csv('Parameter_tuning_result.csv', index = False)

# 결과 저장
- ## 최초 1회만 수행한 후, 결과를 저장하여 불러와서 쓴당

### **저장 공간 절약을 위해, ['단어', '단어'] 꼴로 저장된 토큰을 '단어 단어' 꼴로 바꿔준다**

In [40]:
token_df['Token'] = [' '.join(doc) for doc in token_df['Token'].tolist()]

# 저장

In [42]:
token_df.to_csv('Data/meta_morphs.csv', index = False)
#test_df.to_csv('Data/test_morphs_final.csv', index = False)