In [1]:
import pandas as pd

train = pd.read_csv('./data/news_train.csv')

In [2]:
import re
import tqdm
from khaiii import KhaiiiApi
from konlpy.tag import Okt

def text_preprocessing(text_list):
    
    stopwords = ['을', '를', '이', '가', '은', '는', 'null']
    tokenizer = KhaiiiApi()
    token_list = []

    for text in tqdm.tqdm(text_list):
        txt = re.sub('[^가-힣a-z]', ' ', text.lower()).strip()
        if not txt: txt = 'null'
        morphs = []
        for word in tokenizer.analyze(txt):
            for morph in word.morphs:
                morphs.append(morph.lex) #품사는 morph.tag로 표현 가능
        token_list.append([t for t in morphs if t not in stopwords or type(t) != float])
    return token_list, tokenizer

train['token'], tokenizer = text_preprocessing(train['content'])

100%|██████████| 118745/118745 [01:54<00:00, 1041.09it/s]


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def text2sequence(train_text, max_len=1000):
    vectorizer = Tokenizer()
    vectorizer.fit_on_texts(train_text)
    train_X_seq = vectorizer.texts_to_sequences(train_text)
    vocab_size = len(vectorizer.word_index) + 1
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len)
    return X_train, vocab_size, vectorizer

train_y = train['info']
train_X, vocab_size, vectorizer = text2sequence(train['token'], max_len = 100)
print(train_X.shape, train_y.shape)

vocab_size :  48514
(118745, 100) (118745,)


In [4]:
import gensim
import numpy as np

# word2vec = gensim.models.Word2Vec.load('ko.bin')
#word2vec = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True)
word2vec = gensim.models.Word2Vec(sentences=train['token'].values, min_count=1, workers=4)
# word2vec.build_vocab(list(train['token'].values), update=True)
# word2vec.train(list(train['token'].values), total_examples=len(list(train['token'].values)), epochs=10)
print(f'vocab_size : {vocab_size}')
print(f'word2vec_vocab_size : {word2vec.wv.vectors.shape}')

vocab_size : 48514
word2vec_vocab_size : (48513, 100)


In [5]:
def embedding(vocabulary):
    num = 0
    embedding_matrix = np.zeros((vocab_size, 100))
    for index, word in enumerate((tqdm.tqdm(vocabulary))):
        if word in word2vec:
            embedding_vector = word2vec[word] 
            embedding_matrix[index] = embedding_vector 
        else:
            #print(f"'{word}'는 word2vec에 없는 단어입니다.")
            num += 1
    print(f"총 {num}개의 단어가 word2vec에 없습니다.")
    return embedding_matrix

embedding_matrix = embedding(vectorizer.word_index)
print(f'embedding_matrix shape : {embedding_matrix.shape}') #word2vec.wv.vectors

100%|██████████| 48513/48513 [00:00<00:00, 82484.22it/s]총 0개의 단어가 word2vec에 없습니다.
embedding_matrix shape : (48514, 100)



In [10]:
import tensorflow as tf
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.layers import Embedding, LSTM, Dropout,Dense, SpatialDropout1D

def model(vocab_size, max_len=100):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, weights = [embedding_matrix], input_length = max_len, trainable=False)) #임베딩 가중치 적용 코드
    model.add(SpatialDropout1D(0.3))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu', kernel_regularizer = regularizers.l2(0.001)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')
    model.summary()
    return model

model = model(len(embedding_matrix))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          4851400   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 100, 100)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                42240     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 4,897,865
Trainable params: 46,465
Non-trainable params: 4,851,400
_______________________________________

In [11]:
model.fit(train_X, train_y, batch_size=128, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f6e7c367220>

In [32]:
test = pd.read_csv('./data/news_test.csv')
test['token'], tokenizer = text_preprocessing(test['content'])
test_X_seq = vectorizer.texts_to_sequences(test['token'])
test_X = pad_sequences(test_X_seq, maxlen = 100)
result = model.predict(test_X)
result = np.where(result>0.5, 1, 0)
test['info'] = result


100%|██████████| 142565/142565 [02:17<00:00, 1033.96it/s]


In [56]:
submission = pd.read_csv('./data/sample_submission.csv')
# for idx, id in enumerate(tqdm.tqdm(submission['id'].values)):
#     submission['info'].iloc[idx] = test[id == test['id']]['info'].values
submission['info'] = test['info']
submission.to_csv('simple_baseline_if_in_train_one.csv', index = False)