In [5]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import sentencepiece as spm

from tqdm import trange
import os

In [6]:
dataset_train = []
dataset_test = []
dataset_all = []

root = "newsData/"
list = os.listdir(root)
for cat in list:
    files = os.listdir(root + cat)
    for i,f in enumerate(files):
        fname = root + cat + "/" + f
        file = open(fname, "r", encoding="utf8")
        strings = file.read()
        if i<170:
            dataset_train.append([strings, cat])
        else:
            dataset_test.append([strings,cat])
        dataset_all.append(strings)
        file.close()

print(len(dataset_train), len(dataset_test))

1360 240


In [8]:
f = open("allsentence.txt","w", encoding="utf8")
f.write("".join(dataset_all).replace("\xa0", ""))
f.close()

In [9]:
#subword 단위
corpus = "allsentence.txt"
prefix = "news"
vocab_size = 8000
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    " --model_type=bpe" +
    " --max_sentence_length=999999" + # 문장 최대 길이
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # 사용자 정의 토큰

In [10]:
vocab_file = "news.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)
cab_file = "news.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)
line = "안녕하세요 만나서 반갑습니다"
pieces = vocab.encode_as_pieces(line)
ids = vocab.encode_as_ids(line)
print(line)
print(pieces)
print(ids)

안녕하세요 만나서 반갑습니다
['▁안', '녕', '하', '세요', '▁만나', '서', '▁반', '갑', '습니다']
[89, 7577, 6518, 2892, 957, 6521, 126, 7021, 107]


In [11]:
tokened_sp = []
with trange(len(dataset_all)) as tr:
    for i in tr:
        tokened_sp.append(vocab.encode_as_pieces(dataset_all[i]))

100%|██████████| 1600/1600 [00:00<00:00, 2235.07it/s]


In [12]:
class Doc2VecCorpus:
    def __iter__(self):
        for idx, doc in enumerate(tokened_sp):
            yield TaggedDocument(
                words = doc, 
                tags = [idx])

doc2vec_corpus = Doc2VecCorpus()

In [13]:
embed_num = 128
doc2vec_model = Doc2Vec(documents = doc2vec_corpus,dm=2,  vector_size=embed_num, window = 10, min_count = 5)

In [14]:
doc2vec_model.docvecs[0]

test_sen = tokened_sp[5]
doc2vec_model.docvecs.most_similar([doc2vec_model.infer_vector(test_sen)])

  doc2vec_model.docvecs[0]
  doc2vec_model.docvecs.most_similar([doc2vec_model.infer_vector(test_sen)])


[(5, 0.9153035879135132),
 (560, 0.6753875017166138),
 (574, 0.6555848717689514),
 (986, 0.5947901606559753),
 (989, 0.5832820534706116),
 (929, 0.5810511112213135),
 (159, 0.5718024969100952),
 (169, 0.5712486505508423),
 (147, 0.5702451467514038),
 (9, 0.5601426362991333)]

In [22]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
device = torch.device("cuda")

In [16]:
class SentenceDataset(Dataset):
    def __init__(self, dataset, tokenizer, doc2vecmodel, max_len):
        self.sentences = []
        with trange(len(dataset)) as tr:
            for i in tr:
                sen = dataset[i][0]
                sen = tokenizer(sen)
                l = min(max_len, len(sen))
                sen = sen[:l]
                sen = doc2vecmodel.infer_vector(sen)
                self.sentences.append(sen)
        self.labels = [np.int32(i[1]) for i in dataset]

    def __getitem__(self, i):
        return (self.sentences[i],self.labels[i])

    def __len__(self):
        return (len(self.labels))

In [17]:
max_len = 512
data_train = SentenceDataset(dataset_train, vocab.encode_as_pieces,doc2vec_model, max_len)
data_test = SentenceDataset(dataset_test, vocab.encode_as_pieces,doc2vec_model, max_len)

100%|██████████| 1360/1360 [00:05<00:00, 234.39it/s]
100%|██████████| 240/240 [00:01<00:00, 236.88it/s]


In [18]:
batch_size = 64
train_dataloader = DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=True)

In [19]:
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc

In [20]:
class Classifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.num_classes = num_classes

        self.linear = nn.Linear(input_size, 64)
        self.linear2 = nn.Linear(64, num_classes)
        # self.linear3 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()
        self.dropout = torch.nn.Dropout(p=0.2)
    
        self.fc = nn.Sequential(self.linear, self.dropout, self.relu, self.linear2, self.dropout)
        # self.fc = nn.Sequential(self.linear, self.dropout, self.relu, self.linear2, self.dropout, self.relu, self.linear3, self.dropout)

    def forward(self, x_input):
        output = self.fc(x_input)
        return output

In [23]:
model = Classifier(embed_num, 8).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0005)
criterion = nn.CrossEntropyLoss()
epochs = 100

with trange(epochs) as tr:
    for i in tr:
        itloss = 0
        trainacc = 0
        testacc = 0
        
        model.train()
        for batch_id, (input, label) in enumerate(train_dataloader):
            optimizer.zero_grad()
            input = input.to(device)
            label = label.long().to(device)
            out = model(input)
            loss = criterion(out, label)
            loss.backward()
            optimizer.step()
            itloss += loss.cpu().item()
            trainacc += calc_accuracy(out,label)


        model.eval()
        for batch_idt, (input, label) in enumerate(test_dataloader):
            input = input.to(device)
            label = label.long().to(device)
            out = model(input)
            testacc += calc_accuracy(out,label)

        tr.set_postfix(trainacc="{0:.3f}".format(trainacc/(batch_id+1)), loss="{0:.3f}".format(itloss/(batch_id+1)),  testacc="{0:.3f}".format(testacc/(batch_idt+1)))

AssertionError: Torch not compiled with CUDA enabled

In [16]:
cate = ["정치","경제","사회", "생활/문화","세계","기술/IT", "연예", "스포츠"]
def softmax(vals, idx):
    valscpu = vals.cpu().detach().squeeze(0)
    a = 0
    for i in valscpu:
        a += np.exp(i)
    
    tmp = []
    for i in valscpu:
        tmp.append(((np.exp(i))/a).item() * 100)
    print(["{}:{:.2f}%".format(cate[i],v) for i,v in enumerate(tmp)])
    return ((np.exp(valscpu[idx]))/a).item() * 100

def test_model(seq, model):
    model.eval()
    sen = vocab.encode_as_pieces(seq)
    l = min(max_len, len(sen))
    sen = sen[:l]
    sen = doc2vec_model.infer_vector(sen)
    sen = torch.tensor(sen).unsqueeze(0).to(device)
    result = model(sen)
    idx = result.argmax().cpu().item()
    print("뉴스의 카테고리는:", cate[idx])
    print("신뢰도는:", "{:.2f}%".format(softmax(result,idx)))

In [17]:
test_model("신형 아이패드 프로에 m1칩 탑재 예정", model)

뉴스의 카테고리는: 기술/IT
['정치:1.18%', '경제:10.40%', '사회:14.44%', '생활/문화:16.44%', '세계:5.13%', '기술/IT:30.20%', '연예:6.33%', '스포츠:15.89%']
신뢰도는: 30.20%


In [532]:
#대통령, 국회의원 지지율 감소
#신형 아이패드 m1칩 탑재 예정
#차명종 서대현 김라희 등 韓7명 호치민3쿠션월드컵 PPPQ통과

In [19]:
torch.save(model.state_dict(), "news_doc2vec.pt")
modelload = Classifier(embed_num, 8).to(device)
modelload.load_state_dict(torch.load("news_doc2vec.pt", device))

<All keys matched successfully>

In [20]:
test_model("신형 아이패드 프로에 m1칩 탑재 예정", model)

뉴스의 카테고리는: 기술/IT
['정치:1.16%', '경제:10.74%', '사회:14.78%', '생활/문화:17.69%', '세계:6.43%', '기술/IT:28.67%', '연예:4.87%', '스포츠:15.66%']
신뢰도는: 28.67%
