In [3]:
from google.colab import drive
import pandas as pd

drive.mount("/content/gdrive", force_remount=True)
default_path='/content/gdrive/MyDrive/'

Mounted at /content/gdrive


In [6]:
import pandas as pd

train = pd.read_csv(default_path+"train.csv")

In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def text2sequence(train_text, max_len=100):
    
    tokenizer = Tokenizer() #keras의 vectorizing 함수 호출
    tokenizer.fit_on_texts(train_text) #train 문장에 fit
    train_X_seq = tokenizer.texts_to_sequences(train_text) #각 토큰들에 정수 부여
    vocab_size = len(tokenizer.word_index) + 1 #모델에 알려줄 vocabulary의 크기 계산
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len) #설정한 문장의 최대 길이만큼 padding
    
    return X_train, vocab_size, tokenizer

train_X, vocab_size, vectorizer = text2sequence(train['text'], max_len = 100)

vocab_size :  42331


# #4. word embedding
## 1.Keras Embedding Layer

In [9]:
max_len = 100
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length = max_len))

## 2. word2vec

In [11]:
import gensim
word2vec = gensim.models.KeyedVectors.load_word2vec_format(default_path+'GoogleNews-vectors-negative300.bin.gz', binary = True)

In [12]:
import numpy as np

# ocabulary에 있는 토큰들의 벡터를 가져와 embedding matrix에 저장
embedding_matrix = np.zeros((vocab_size, 300)) # 300차원의 임베딩 메트릭스 생성

tokenizer = Tokenizer()
for index, word in enumerate(tokenizer.word_index):  #vocabulary에 있는 토큰들을 하나씩 남겨줍니다.
    if word in word2vec:  # 넘겨받은 토큰이 word2vec에 존재하면(이미 훈련된 토큰이라는 뜻)
        embedding_vector = word2vec[word]  # 해당 토큰에 해당하는 vector을 불러오고
        embedding_matrix[i] = embedding_vector  # 해당 위치의 embedding_matrix에 저장합니다.
    else:
        print("word2vec에 없는 단어입니다.")
        break

In [13]:
# keras embedding layer에 embedding_matrix를 가중치로 주어 이용
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

## 3. glove

In [14]:
# 2. load the whole embedding into memory
glove = dict()
f = open(default_path+'glove.txt')
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype = 'float32')
    glove[word] = vector
f.close()

In [15]:
embedding_matrix = np.zeros((vocab_size, 300)) #300차원의 임베딩 매트릭스 생성

tokenizer = Tokenizer()
for index, word in enumerate(tokenizer.word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in glove: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = glove[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[i] = embedding_vector #해당 위치의 embedding_matrix에 저장합니다.
    else:
        print("glove 없는 단어입니다.")
        break

In [17]:
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

## 4. Fasttext

In [18]:
from gensim.models.keyedvectors import KeyedVectors
FastText = gensim.models.KeyedVectors.load_word2vec_format(default_path+'word-embeddings/fasttext/fasttext.vec', binary=True, unicode_errors='ignore')

In [19]:
embedding_matrix = np.zeros((vocab_size, 300)) #300차원의 임베딩 매트릭스 생성

tokenizer = Tokenizer()
for index, word in enumerate(tokenizer.word_index): #vocabulary에 있는 토큰들을 하나씩 넘겨줍니다.
    if word in word2vec: #넘겨 받은 토큰이 word2vec에 존재하면(이미 훈련이 된 토큰이라는 뜻)
        embedding_vector = word2vec[word] #해당 토큰에 해당하는 vector를 불러오고
        embedding_matrix[i] = embedding_vector #해당 위치의 embedding_matrix에 저장합니다.

In [21]:
model = Sequential()
model.add(Embedding(vocab_size, 300,weights = [embedding_matrix], input_length = max_len))

# #5. Modeling

## 간단한 전처리 + 형태소 분석

In [29]:
pip install konlpy

Collecting konlpy
  Downloading konlpy-0.5.2-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 1.4 MB/s 
[?25hCollecting beautifulsoup4==4.6.0
  Downloading beautifulsoup4-4.6.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.2 MB/s 
[?25hCollecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 42.1 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: JPype1, colorama, beautifulsoup4, konlpy
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.6.3
    Uninstalling beautifulsoup4-4.6.3:
      Successfully uninstalled beautifulsoup4-4.6.3
Successfully installed JPype1-1.3.0 beautifulsoup4-4.6.0 colorama-0.4.4 konlpy-0.5.2


In [36]:
train = pd.read_csv(default_path+"train.csv")

In [41]:
from konlpy.tag import Okt
import re
import tqdm 

def text_preprocessing(text_list):
    
    stopwords = ['을', '를', '이', '가', '은', '는', 'null'] 
    tokenizer = Okt() 
    token_list = []
    
    for text in tqdm.tqdm(text_list):
        txt = re.sub('[^가-힣a-z]', ' ', text) 
        token = tokenizer.morphs(txt) 
        token = [t for t in token if t not in stopwords or type(t) != float] 
        token_list.append(token)
        
    return token_list, tokenizer

train['token'], okt = text_preprocessing(train['text'])

100%|██████████| 54879/54879 [02:04<00:00, 439.84it/s]


## vectorization

In [42]:
def text2sequence(train_text, max_len=1000):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)
    train_X_seq = tokenizer.texts_to_sequences(train_text)
    vocab_size = len(tokenizer.word_index) + 1
    print('vocab_size : ', vocab_size)
    X_train = pad_sequences(train_X_seq, maxlen = max_len)
    return X_train, vocab_size, tokenizer

train_y = train['author']
train_X, vocab_size, vectorizer = text2sequence(train['token'], max_len = 100)
print(train_X.shape, train_y.shape)

vocab_size :  36342
(54879, 100) (54879,)


## Embedding

In [45]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format(default_path+'GoogleNews-vectors-negative300.bin.gz', binary = True)
embedding_matrix = np.zeros((vocab_size, 300))

tokenizer = Tokenizer()
for index, word in enumerate(tokenizer.word_index):
    if word in word2vec:
        embedding_vector = word2vec[word] 
        embedding_mxtrix[i] = embedding_vector 
    else:
        print("word2vec에 없는 단어입니다.")
        break

## Modeling

In [46]:
def LSTM(vocab_size, max_len=1000):
    model = Sequential()
    model.add(Embedding(vocab_size, 300,weights = [embedding_matrx], input_length = max_len)) #임베딩 가중치 적용 코드
    model.add(SpatialDropout1D(0.3))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu', kernel_regularizer = regularizers.l2(0.001)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')
    model.summary()
    return model