In [1]:
from gensim.models import FastText
from konlpy.tag import Hannanum
import sentencepiece as spm

from tqdm import trange
import os



In [2]:
os.getenv("Java_HOME")

'C:\\Program Files\\Zulu\\zulu-8\\jre\\bin\\server'

In [3]:
dataset_train = []
dataset_test = []
dataset_all = []

root = "newsData/"
list = os.listdir(root)
for cat in list:
    files = os.listdir(root + cat)
    for i,f in enumerate(files):
        fname = root + cat + "/" + f
        file = open(fname, "r", encoding="utf8")
        strings = file.read()
        if i<170:
            dataset_train.append([strings, cat])
        else:
            dataset_test.append([strings,cat])
        dataset_all.append(strings)
        file.close()

print(len(dataset_train), len(dataset_test))

1360 240


In [None]:
#형태소 단위
hannanum = Hannanum()
vocab_morphs = set()
tokened_morphs = []
with trange(len(dataset_all)) as tr:
    for i in tr:
        morphs = hannanum.morphs(dataset_all[i])
        for morph in morphs:
            vocab_morphs.add(morph)
        tokened_morphs.append(morphs)

In [None]:
f = open("allsentence.txt","w")
f.write("".join(dataset_all).replace("\xa0", ""))
f.close()

In [None]:
#subword 단위
corpus = "allsentence.txt"
prefix = "news"
vocab_size = 8000
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    " --model_type=bpe" +
    " --max_sentence_length=999999" + # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # 사용자 정의 토큰

In [None]:
vocab_file = "news.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)
line = "안녕하세요 만나서 반갑습니다"
pieces = vocab.encode_as_pieces(line)
ids = vocab.encode_as_ids(line)
print(line)
print(pieces)
print(ids)


안녕하세요 만나서 반갑습니다
['▁안', '녕', '하', '세요', '▁만나', '서', '▁반', '갑', '습니다']
[89, 7577, 6518, 2892, 957, 6521, 126, 7021, 107]



In [None]:
# vocab.encode_as_pieces(dataset_all[0])
tokened_sp = []
with trange(len(dataset_all)) as tr:
    for i in tr:
        tokened_sp.append(vocab.encode_as_pieces(dataset_all[i]))

100%|██████████| 1600/1600 [00:01<00:00, 1228.37it/s]


In [None]:
emb_num = 128

embedding = FastText(tokened_morphs, vector_size=emb_num, window=12, min_count=5, sg=1)
embedding.save("fasttext_morph.model")

In [None]:
emb_num = 128

embedding = FastText(tokened_sp, vector_size=emb_num, window=10, min_count=2, sg=1)
embedding.save("fasttext_sp.model")

In [None]:
emb_num = 128
model_morphs = FastText.load("fasttext_morph.model")
model_sp = FastText.load("fasttext_sp.model")

In [None]:
model_sp.wv.most_similar("국회의원")

[('▁국회의원', 0.9339150786399841),
 ('의원', 0.820768415927887),
 ('▁출마', 0.8091025948524475),
 ('▁현역', 0.7669360041618347),
 ('▁지방선거에', 0.762986421585083),
 ('궐선거', 0.7626964449882507),
 ('▁사직서', 0.7208353281021118),
 ('▁송파을', 0.7189249396324158),
 ('▁사직', 0.7137511372566223),
 ('▁의원', 0.713568389415741)]

In [None]:
model_morphs.wv.most_similar("국회의원")

[('의원직', 0.9015116095542908),
 ('국회의장', 0.8992209434509277),
 ('사직', 0.8932391405105591),
 ('출마', 0.8877411484718323),
 ('사직서', 0.8795743584632874),
 ('현역의원', 0.8716889023780823),
 ('사퇴', 0.8710533976554871),
 ('의원들', 0.8594748973846436),
 ('보궐선거', 0.8586255311965942),
 ('현역의원들', 0.8532490134239197)]

In [None]:
# model_morphs.wv[(hannanum.morphs("안녕하세요 감사합니다"))]

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

In [None]:
device = torch.device("cuda")

In [None]:
#using fasttext
class SentenceDataset(Dataset):
    def __init__(self, dataset, tokenizer, fasttextModel, max_len):
        self.sentences = []
        with trange(len(dataset)) as tr:
            for i in tr:
                sen = dataset[i][0]
                sen = tokenizer(sen)
                if len(sen) < max_len:
                    sen = sen + (max_len-len(sen)) * [""]
                sen = sen[:max_len]
                sen = fasttextModel[sen]
                self.sentences.append(sen)
        self.labels = [np.int32(i[1]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i],self.labels[i])

    def __len__(self):
        return (len(self.labels))

In [None]:
# using nn.Embedding
# class SentenceDataset2(Dataset):
#     def __init__(self, dataset, tokenizer, max_len):
#         self.sentences = []
#         with trange(len(dataset)) as tr:
#             for i in tr:
#                 sen = dataset[i][0]
#                 sen = tokenizer(sen)
#                 if len(sen) < max_len:
#                     sen = sen + (max_len-len(sen)) * [0]
#                 sen = sen[:max_len]
#                 self.sentences.append(sen)
                
#         self.sentences = np.array(self.sentences)
#         self.labels = [np.int32(i[1]) for i in dataset]

#     def __getitem__(self, i):
#         return (self.sentences[i],self.labels[i])

#     def __len__(self):
#         return (len(self.labels))

In [None]:
max_len = 32
# data_train = SentenceDataset(dataset_train, vocab.encode_as_pieces, model_sp.wv, max_len)
# data_test = SentenceDataset(dataset_test, vocab.encode_as_pieces,model_sp.wv, max_len)

hannanum = Hannanum()
data_train = SentenceDataset(dataset_train, hannanum.morphs, model_morphs.wv, max_len)
data_test = SentenceDataset(dataset_test, hannanum.morphs,model_morphs.wv, max_len)

100%|██████████| 1360/1360 [01:30<00:00, 14.99it/s]
100%|██████████| 240/240 [00:13<00:00, 17.28it/s]


In [None]:
# max_len = 64
# data_train = SentenceDataset2(dataset_train, vocab.encode_as_ids, max_len)
# data_test = SentenceDataset2(dataset_test, vocab.encode_as_ids, max_len)


In [None]:
batch_size = 64
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=True)

In [None]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [None]:
class LSTM(nn.Module):
    def __init__(self, input_size, num_classes, hidden_size, num_layers = 1):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_classes = num_classes

        # self.emb = nn.Embedding(num_embeddings = len(vocab), embedding_dim = 128, padding_idx=0)

        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size,
                            num_layers = num_layers, batch_first = True)

        self.linear = nn.Linear(hidden_size, hidden_size//2)
        self.linear2 = nn.Linear(hidden_size//2, num_classes)
        self.relu = nn.ReLU()
        self.dropout = torch.nn.Dropout(p=0.2)
    
        self.fc = nn.Sequential(self.linear, self.dropout, self.relu, self.linear2, self.dropout)
        # self.fc = nn.Sequential(self.linear, self.dropout)

    def forward(self, x_input):
        # x_input = self.emb(x_input)
        lstm_out, (h,c) = self.lstm(x_input)
        output = self.fc(lstm_out[:,-1,])
        
        # hidden = torch.cat((h[-2,:,:], h[-1,:,:]), dim = 1)
        # output=self.fc(hidden)
        # output = self.relu(output)
        # output = self.linear2(output)
        return output

In [None]:
lstm = LSTM(emb_num, 8, 128, 2).to(device)

optimizer = torch.optim.Adam(lstm.parameters(), lr = 0.0003)
criterion = nn.CrossEntropyLoss()
epochs = 100


with trange(epochs) as tr:
    for i in tr:
        itloss = 0
        trainacc = 0
        testacc = 0
        
        lstm.train()
        for batch_id, (input, label) in enumerate(train_dataloader):
            optimizer.zero_grad()
            input = input.to(device)
            label = label.long().to(device)
            out = lstm(input)
            loss = criterion(out, label)
            loss.backward()
            optimizer.step()
            itloss += loss.cpu().item()
            trainacc += calc_accuracy(out,label)


        lstm.eval()
        for batch_idt, (input, label) in enumerate(test_dataloader):
            input = input.to(device)
            label = label.long().to(device)
            out = lstm(input)
            testacc += calc_accuracy(out,label)

        tr.set_postfix(trainacc="{0:.3f}".format(trainacc/(batch_id+1)), loss="{0:.3f}".format(itloss/(batch_id+1)),  testacc="{0:.3f}".format(testacc/(batch_idt+1)))

100%|██████████| 100/100 [01:30<00:00,  1.10it/s, loss=0.432, testacc=0.723, trainacc=0.822]


In [None]:
cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]
def softmax(vals, idx):
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    
    tmp = []
    for i in valscpu:
        tmp.append(((np.exp(i))/a).item() * 100)
    print(["{}:{:.2f}%".format(cate[i],v) for i,v in enumerate(tmp)])

    return ((np.exp(valscpu[idx]))/a).item() * 100

def test_model(seq, model, tokenizer, fasttextmodel):
    sen = tokenizer(seq)
    # sen = vocab.encode_as_ids(seq)
    if len(sen) < max_len:
        # sen = sen + (max_len-len(sen)) * [1]
        sen = sen + (max_len-len(sen)) * [""]
    sen = sen[:max_len]
    sen = fasttextmodel[sen]
    sen = torch.tensor(sen).unsqueeze(0).to(device)
    model.eval()
    result = model(sen)
    idx = result.argmax().cpu().item()
    print("뉴스의 카테고리는:", cate[idx])
    print("신뢰도는:", "{:.2f}%".format(softmax(result,idx)))
    

In [None]:
# test_model('신형 아이패드 m1칩 탑재 예정', lstm, vocab.encode_as_pieces, model_sp.wv)
test_model("신형 아이패드 프로에 m1칩 탑재 예정", lstm, hannanum.morphs, model_morphs.wv)

뉴스의 카테고리는: 기술/IT
['정치:0.00%', '경제:0.16%', '사회:0.15%', '생활/문화:3.64%', '세계:0.12%', '기술/IT:94.37%', '연예:0.13%', '스포츠:1.42%']
신뢰도는: 94.37%


In [None]:
# 차명종 서대현 김라희 등 韓7명 호치민3쿠션월드컵 PPPQ통과
# 한국 선수 간 대결이 치러진 A~D조에서는 서대현
# 대통령, 국회의원 지지율 감소
# 신형 아이패드 m1칩 탑재 예정

In [None]:
torch.save(lstm.state_dict(), "news_lstm.pt")

<All keys matched successfully>

In [None]:
import torch
from konlpy.tag import Hannanum

emb_num=128
max_len=32
model_morphs = FastText.load("fasttext_morph.model")

device = torch.device("cuda", index=1)
modelload = LSTM(emb_num, 8, 128, 2).to(device)
modelload.load_state_dict(torch.load("news_lstm.pt", device))
hannanum = Hannanum()

In [None]:
test_model("신형 아이패드 프로에 m1칩 탑재 예정", modelload, hannanum.morphs, model_morphs.wv)

뉴스의 카테고리는: 기술/IT
['정치:0.00%', '경제:0.16%', '사회:0.15%', '생활/문화:3.64%', '세계:0.12%', '기술/IT:94.37%', '연예:0.13%', '스포츠:1.42%']
신뢰도는: 94.37%
