In [1]:
# unset PYTHONPATH first
from ko_text import *

In [2]:
nlp = NLP()

# 불러오기

In [3]:
token_df = pd.read_csv('Data/meta_morphs_final.csv', encoding = 'utf-8')

# 용량을 줄이기 위해 '단어 단어' 꼴로 묶어둔 token을 ['단어', '단어'] 꼴로 풀기
token_df['Token'] = [token.split() for token in token_df['Token']]

In [4]:
token_df.head()

Unnamed: 0,Section,Text,Token,Num of Tokens
0,financial,\n\n\n텀블벅에서 크라우드 펀딩이 이뤄지고 있는 `아침달 시집`.\n\n ...,"[텀블벅, 크라, 우드, 펀딩, 이뤄지고, 아침, 시집, 많지, 않은, 금액, 으로...",263
1,economy,\n\n\n[사진 제공: 연합뉴스]\n\n 유류...,"[유류, 인하, 국제, 유가, 급락, 입어, 국내, 휘발유, 경유, 하락, 특히, ...",166
2,financial,부득이한 사정으로 매월 내는 보험료가 부담이 될 때 계약은 그대로 유지하면서 보험...,"[부득이, 사정, 매월, 내는, 보험료, 부담, 계약, 그대로, 유지, 보험료, 부...",314
3,estate,한때 `미분양의 늪`으로 통하던 경기도 파주시 부동산 시장이 달라지고 있다. 지난해...,"[한때, 미분, 하던, 경기도, 파주시, 부동산, 시장, 달라지고, 분양, 파주, ...",165
4,economy,\n\n\n인디고뱅크의 `미키인서울` 컬래버 맨투맨 <사진제공=월트디즈니코리아>\...,"[인디고, 뱅크, 미키, 서울, 컬래버, 투맨, 월트디즈니, 사의, 마스코트, 미키...",196


In [5]:
token_df.shape

(41418, 4)

## 학습을 위한 데이터 전처리

# Train Test Split

In [6]:
train_size = round(len(token_df) * 0.8)
np.random.seed(0)
train_index_ls = np.random.choice(token_df.index, train_size, replace = False)
test_index_ls = [x for x in token_df.index if not x in train_index_ls]

In [7]:
train_df = token_df.loc[train_index_ls]
test_df = token_df.loc[test_index_ls]

print(train_df.shape, test_df.shape)

(33134, 4) (8284, 4)


In [8]:
Counter(train_df['Section'])

Counter({'bio & tech': 1739,
         'business': 4863,
         'culture & art': 4102,
         'economy': 2608,
         'estate': 3932,
         'financial': 746,
         'it': 1742,
         'politics': 3775,
         'society': 3356,
         'stock': 2508,
         'world': 3763})

In [9]:
train_token_ls = train_df['Token'].tolist()
train_label_ls = train_df['Section'].tolist()

test_token_ls = test_df['Token'].tolist()
test_label_ls = test_df['Section'].tolist()

In [10]:
train_token_ls, train_label_ls = nlp.oversample_batch(train_token_ls, train_label_ls, 3000)
test_token_ls, test_label_ls = nlp.undersample_batch(test_token_ls, test_label_ls, 100)

In [11]:
Counter(train_label_ls)

Counter({'bio & tech': 3000,
         'business': 3000,
         'culture & art': 3000,
         'economy': 3000,
         'estate': 3000,
         'financial': 3000,
         'it': 3000,
         'politics': 3000,
         'society': 3000,
         'stock': 3000,
         'world': 3000})

In [12]:
Counter(test_label_ls)

Counter({'bio & tech': 100,
         'business': 100,
         'culture & art': 100,
         'economy': 100,
         'estate': 100,
         'financial': 100,
         'it': 100,
         'politics': 100,
         'society': 100,
         'stock': 100,
         'world': 100})

# data setup

In [13]:
from collections import defaultdict

import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import random
import numpy as np


def read_txt(path_to_file):
    txt_ls = []
    label_ls = []

    with open(path_to_file) as f:
        for i, line in enumerate(f.readlines()[1:]):
            id_num, txt, label = line.split('\t')
            txt_ls.append(txt)
            label_ls.append(int(label.replace('\n','')))
    return txt_ls, label_ls


def convert_word_to_idx(sents):
    for sent in sents:
        yield [w2i_dict[word] for word in sent]
    return

def convert_label_to_idx(labels):
    for label in labels:
        yield l2i_dict[label]
    return


def add_padding(sents, max_len):
    for i, sent in enumerate(sents):
        if len(sent)< max_len:
            sents[i] += [pad] * (max_len - len(sent))
    
        elif len(sent) > max_len:
            sents[i] = sent[:max_len]
    
    return sents

def convert_to_variable(w2i_ls):
    
    var = Variable(torch.LongTensor(w2i_ls))
    return var

In [14]:
w2i_dict = defaultdict(lambda : len(w2i_dict))
pad = w2i_dict['<PAD>']

l2i_dict = defaultdict(lambda : len(l2i_dict))

In [15]:
x_train = list(convert_word_to_idx(train_token_ls))
x_test = list(convert_word_to_idx(test_token_ls))

y_train = list(convert_label_to_idx(train_label_ls))
y_test = list(convert_label_to_idx(test_label_ls))

i2w_dict = {val : key for key, val in w2i_dict.items()}

In [16]:
x_train = convert_to_variable(add_padding(x_train, 100))
x_test = convert_to_variable(add_padding(x_test, 100))

y_train = convert_to_variable(y_train).float()
y_test = convert_to_variable(y_test).float()

# CNN 모델 준비

Pre-train된 Word2Vec은 사용하지 않았습니다.

모든 embedding은 랜덤으로 초기화된 상태로 학습을 진행하였습니다.

In [17]:
class CNN_text(nn.Module):
    
    def __init__(self, n_words, embed_size, hid_size, drop_rate, kernel_size_ls, num_filter, n_category):
        super(CNN_text, self).__init__()
        
        self.embed_size = embed_size
        self.hid_size = hid_size
        self.drop_rate = drop_rate
        self.num_filter = num_filter
        self.kernel_size_ls = kernel_size_ls
        self.num_kernel = len(kernel_size_ls)
        self.n_category = n_category
        
        self.embedding = nn.Embedding(n_words, embed_size)
        self.convs = nn.ModuleList([nn.Conv2d(1, num_filter, (kernel_size, embed_size)) for kernel_size in kernel_size_ls])
        
        self.lin = nn.Sequential(
            nn.Linear(self.num_kernel*num_filter, hid_size), nn.ReLU(), 
            nn.Dropout(drop_rate),
            nn.Linear(hid_size, n_category),
        )
        
    def forward(self, x):
        embed = self.embedding(x) # [batch_size, max_length, embed_size]
        embed.unsqueeze_(1)  # [batch_size, 1, max_length, embed_size]
        conved = [conv(embed).squeeze(3) for conv in self.convs] # [batch_size, num_filter, max_length -kernel_size +1]
        pooled = [F.max_pool1d(conv, (conv.size(2))).squeeze(2) for conv in conved] # [batch_size, num_kernel, num_filter]
        
        concated = torch.cat(pooled, dim = 1) # [batch_size, num_kernel * num_filter]
        logit = self.lin(concated)
        
        return logit
        

In [18]:
n_words = len(w2i_dict)
EMBED_SIZE = 32
HID_SIZE = 32
DROP_RATE = 0.5
KERNEL_SIZE_LS = [3,4,5,6]
NUM_FILTER = 8
N_CATEGORY = 11

In [19]:
model = CNN_text(n_words = n_words, embed_size =EMBED_SIZE, drop_rate= DROP_RATE,
                 hid_size=HID_SIZE, kernel_size_ls= KERNEL_SIZE_LS, num_filter=NUM_FILTER,
                 n_category = N_CATEGORY)

In [20]:
model

CNN_text(
  (embedding): Embedding(125094, 32)
  (convs): ModuleList(
    (0): Conv2d(1, 8, kernel_size=(3, 32), stride=(1, 1))
    (1): Conv2d(1, 8, kernel_size=(4, 32), stride=(1, 1))
    (2): Conv2d(1, 8, kernel_size=(5, 32), stride=(1, 1))
    (3): Conv2d(1, 8, kernel_size=(6, 32), stride=(1, 1))
  )
  (lin): Sequential(
    (0): Linear(in_features=32, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5)
    (3): Linear(in_features=32, out_features=11, bias=True)
  )
)

In [21]:
len(list(model.parameters()))

13

In [22]:
epochs = 50
lr = 0.001
batch_size = 3000

train_idx = np.arange(x_train.size(0))
test_idx = np.arange(x_test.size(0))
optimizer = torch.optim.Adam(model.parameters(),lr)
criterion = nn.CrossEntropyLoss(reduction='sum')

loss_ls = []

for epoch in range(epochs):
    model.train()
    
    # input 데이터 순서 섞기
    random.shuffle(train_idx)
    x_train = x_train[train_idx]
    y_train = y_train[train_idx]
    train_loss = 0

    for start_idx, end_idx in zip(range(0, x_train.size(0), batch_size),
                                  range(batch_size, x_train.size(0)+1, batch_size)):
        x_batch = x_train[start_idx : end_idx]
        y_batch = y_train[start_idx : end_idx].long()
        
        scores = model(x_batch)
        predict = F.softmax(scores, dim = 1).argmax(dim = 1)
        
        acc = (predict == y_batch).sum().item() / batch_size
        
        loss = criterion(scores, y_batch)
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print('Train epoch : %s,  loss : %s,  accuracy :%.3f'%(epoch+1, train_loss, acc))
    print('=================================================================================================')
    
    loss_ls.append(train_loss)
    
    if (epoch+1) % 5 == 0:
        model.eval()
        scores = model(x_test)
        predict = F.softmax(scores, dim=1).argmax(dim = 1)
        
        acc = (predict == y_test.long()).sum().item() / len(y_test)
        loss = criterion(scores, y_test.long())
        
        print('Test Epoch : %s, Test Loss : %.03f , Test Accuracy : %.03f'%(epoch+1, loss.item(), acc))


Train epoch : 1,  loss : 80997.953125,  accuracy :0.100
Train epoch : 2,  loss : 79136.38330078125,  accuracy :0.111
Train epoch : 3,  loss : 78471.46728515625,  accuracy :0.142
Train epoch : 4,  loss : 77977.2451171875,  accuracy :0.164
Train epoch : 5,  loss : 77340.4248046875,  accuracy :0.171
Test Epoch : 5, Test Loss : 2567.579 , Test Accuracy : 0.234
Train epoch : 6,  loss : 76403.77685546875,  accuracy :0.198
Train epoch : 7,  loss : 75206.50537109375,  accuracy :0.211
Train epoch : 8,  loss : 73732.93994140625,  accuracy :0.238
Train epoch : 9,  loss : 71792.05029296875,  accuracy :0.257
Train epoch : 10,  loss : 69572.42919921875,  accuracy :0.295
Test Epoch : 10, Test Loss : 2284.812 , Test Accuracy : 0.308
Train epoch : 11,  loss : 67571.33544921875,  accuracy :0.310
Train epoch : 12,  loss : 65121.134765625,  accuracy :0.307
Train epoch : 13,  loss : 63025.75390625,  accuracy :0.347
Train epoch : 14,  loss : 60838.65283203125,  accuracy :0.362
Train epoch : 15,  loss : 5846