In [1]:
import os
# Working directory holding the CSV files that are read by relative name below.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
path = "C:/pytest/"
os.chdir(path)

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [3]:
# Load the train and test news CSVs; both files are decoded as cp949.
train, test = (
    pd.read_csv(csv_name, encoding = 'cp949')
    for csv_name in ('뉴스데이터_train.csv', '뉴스데이터_test.csv')
)

In [4]:
# Keep the same four columns, in the same order, for both splits.
data1_train, data1_test = (
    frame.loc[:, ['id', '날짜', '분류1', '텍스트']] for frame in (train, test)
)

In [5]:
# Raw '분류1' label column of each split.
data1_train_classes, data1_test_classes = data1_train['분류1'], data1_test['분류1']

In [6]:
# Reduce every '분류1' value to the part before its first comma.
data1_train_class = [label.split(',')[0] for label in data1_train_classes]
data1_test_class = [label.split(',')[0] for label in data1_test_classes]

In [7]:
# Attach the comma-split leading category as a new '분류1_상위' column on each frame.
# NOTE(review): wrapping the list in pd.Series means the assignment lines values
# up by index label rather than by position — presumably both frames still carry
# the default 0..n-1 index from read_csv; confirm before filtering or
# re-indexing the frames upstream.
data1_train['분류1_상위'] = pd.Series(data1_train_class)
data1_test['분류1_상위'] = pd.Series(data1_test_class)

In [8]:
# data1
# data1: inputs X come from the '텍스트' column, targets y from '분류1_상위'
data1_train_X, data1_train_y = data1_train['텍스트'], data1_train['분류1_상위']
data1_test_X, data1_test_y = data1_test['텍스트'], data1_test['분류1_상위']

In [9]:
import rhinoMorph
# Start RHINO once and keep the returned handle; it is passed to
# rhinoMorph.onlyMorph_list for every text analyzed in auto_morphed.
rn = rhinoMorph.startRhino()

filepath:  C:\Anaconda3\lib\site-packages
classpath:  C:\Anaconda3\lib\site-packages\rhinoMorph/lib/rhino.jar
RHINO started!


In [10]:
from tqdm import tqdm

In [11]:
def auto_morphed(rn, data_text):
    """Run RHINO morphological analysis on each text and re-join its morphemes.

    Args:
        rn: RHINO handle, as returned by ``rhinoMorph.startRhino()``.
        data_text: Iterable of text strings (for example a pandas Series).

    Returns:
        pandas.Series holding one space-joined morpheme string per input text,
        in input order, on a fresh default index. Empty input yields an empty
        Series instead of raising.
    """
    # Part-of-speech tags handed to RHINO through its `pos` argument.
    kept_pos = ['NNP', 'NNG', 'VV', 'VA', 'XR', 'IC', 'MM', 'MAG', 'MAJ']
    morphed_text = [
        ' '.join(rhinoMorph.onlyMorph_list(rn, text, pos = kept_pos, eomi = True))
        for text in tqdm(data_text)
    ]
    # Build the Series once, after all texts are processed. Constructing it
    # inside the loop re-copied the growing list on every iteration and left
    # the result name unbound when data_text was empty.
    return pd.Series(morphed_text)

In [12]:
# data1: morphological analysis of the training texts
data1_train_X_morphed = auto_morphed(rn = rn, data_text = data1_train_X)

100%|█████████████████████████████████████████████████████████████████████████████| 6792/6792 [00:11<00:00, 588.76it/s]


In [13]:
# Whitespace-token count of every morph-analyzed training text, then its
# min / max / mean / median / selected percentiles.
text_len = [len(morphed.split()) for morphed in data1_train_X_morphed]
print(
    '최소길이 : {}\n최대 길이 : {}\n평균 길이 : {}\n중위수 길이 : {}\n구간별 최대 길이 : {}'.format(
        np.min(text_len),
        np.max(text_len),
        np.round(np.mean(text_len), 1),
        np.median(text_len),
        np.percentile(text_len, [0, 25, 50, 75, 99, 100]),
    )
)

최소길이 : 4
최대 길이 : 54
평균 길이 : 19.5
중위수 길이 : 19.0
구간별 최대 길이 : [ 4. 16. 19. 22. 35. 54.]


In [14]:
# Preprocessing / model hyperparameters
max_words = 10_000     # used as Tokenizer(num_words=...) and Embedding(input_dim=...)
maxlen = 30            # sequence length passed to pad_sequences and Embedding(input_length=...)
embedding_dim = 200    # used as Embedding(output_dim=...)

In [15]:
from keras.preprocessing.text import Tokenizer
# Fit the vocabulary on the morph-analyzed training texts only; the same fitted
# tokenizer is reused for the test texts further down.
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(data1_train_X_morphed)
# Word index exposed by the fitted tokenizer (not referenced again in the
# cells shown here).
word_index = tokenizer.word_index

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
# sequencing: convert each morph-analyzed training text with the fitted tokenizer
data1_train_X_sequencing = tokenizer.texts_to_sequences(data1_train_X_morphed)
# padding: bring every sequence to length maxlen
# NOTE(review): pad_sequences runs with its defaults here, which presumably pad
# and truncate at the front of the sequence — confirm for the installed Keras.
data1_train_X_padding = pad_sequences(data1_train_X_sequencing, maxlen = maxlen)

In [18]:
from sklearn.preprocessing import LabelEncoder
# Learn the label set from the training targets and encode them in one call.
e = LabelEncoder()
data1_train_y_labeling = e.fit_transform(data1_train_y)

In [19]:
# Show the classes learned by the encoder; the one-hot targets built below
# have len(e.classes_) columns.
print(e.classes_)

['IT_과학' '경제' '국제' '문화' '미분류' '사회' '스포츠' '정치' '지역']


In [20]:
def to_one_hot(sequences, dimension):
    """Return a (len(sequences), dimension) float array of 0s with 1s at the given columns.

    Row i gets a 1 at column ``sequences[i]``. Each entry is used directly as
    the column index, so it may be a single integer or a list of integers.
    """
    n_rows = len(sequences)
    encoded = np.zeros(shape = (n_rows, dimension))
    # Pair each row number with its entry by iteration order (not by label lookup).
    for row_idx, hot_columns in zip(range(n_rows), sequences):
        encoded[row_idx, hot_columns] = 1
    return encoded

In [21]:
# One-hot encode the integer training labels, one column per encoder class.
data1_train_y_1hot = to_one_hot(
    sequences = data1_train_y_labeling,
    dimension = len(e.classes_),
)

In [22]:
# Number of target classes; sizes the softmax output layer and the test
# one-hot matrix.
class_number = len(e.classes_)

In [33]:
from keras.models import Sequential
from keras import layers, regularizers
# Text classifier: Embedding -> Conv1D -> Flatten -> Dropout -> Dense(32) -> softmax.
# The commented-out layers below are earlier variants, kept together with the
# score noted next to each (presumably accuracy in % — confirm which split).
model = Sequential()
# Token ids in [0, max_words) -> embedding_dim-sized vectors, maxlen tokens per sample.
model.add(layers.Embedding(input_dim = max_words, output_dim = embedding_dim, input_length = maxlen))

# 100 filters over windows of 3 consecutive tokens.
model.add(layers.Conv1D(100, kernel_size = 3,activation = 'relu'))
# model.add(layers.MaxPooling1D(pool_size = 2)) # maxpooling + lstm : 82.8 / maxpooling+flatten : 84.1
# Flatten the convolution output into one vector per sample for the dense layers.
model.add(layers.Flatten())
# model.add(layers.Dropout(0.4)) # conv + dropout + lstm + dropout : 84.5

# model.add(layers.LSTM(100)) # 83.6

model.add(layers.Dropout(0.4))
model.add(layers.Dense(32, activation = 'relu'))
# One output unit per class.
model.add(layers.Dense(class_number, activation = 'softmax'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 30, 200)           2000000   
                                                                 
 conv1d_3 (Conv1D)           (None, 28, 100)           60100     
                                                                 
 flatten_1 (Flatten)         (None, 2800)              0         
                                                                 
 dropout_6 (Dropout)         (None, 2800)              0         
                                                                 
 dense_6 (Dense)             (None, 32)                89632     
                                                                 
 dense_7 (Dense)             (None, 9)                 297       
                                                                 
Total params: 2,150,029
Trainable params: 2,150,029
Non-trainable params: 0

In [34]:
# One-hot targets with a softmax output layer -> categorical cross-entropy loss.
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'rmsprop',
    metrics = ['acc'],
)
# Training is pinned to the CPU device; the value returned by fit is kept in history1.
with tf.device('/CPU:0'):
    history1 = model.fit(
        data1_train_X_padding,
        data1_train_y_1hot,
        epochs = 10,
        batch_size = 32,
        validation_split = 0.3,
        verbose = 1,
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
# data1 test: morphological analysis of the test texts
data1_test_X_morphed = auto_morphed(rn = rn, data_text = data1_test_X)

100%|█████████████████████████████████████████████████████████████████████████████| 2265/2265 [00:02<00:00, 781.89it/s]


In [26]:
# data1 test: same preprocessing as for the training texts
# sequencing: reuse the tokenizer fitted on the training texts (it is not refit here)
data1_test_X_sequencing = tokenizer.texts_to_sequences(data1_test_X_morphed)
# padding: same maxlen as the training sequences, so the shape matches the model input
data1_test_X_padding = pad_sequences(data1_test_X_sequencing, maxlen = maxlen)

In [27]:
# data1 test_y: encode the test targets with the encoder fitted on the training targets
# NOTE(review): presumably e.transform fails on a label that never occurred in
# the training targets — confirm every test category also appears in train.
data1_test_y_labeling= e.transform(data1_test_y)
# Same number of one-hot columns as the training targets.
data1_test_y_1hot = to_one_hot(data1_test_y_labeling, class_number)

In [35]:
# data1, '분류1' target: evaluate the trained model on the padded test
# sequences and one-hot test labels, pinned to the CPU device; the result is
# kept in eval1.
with tf.device('/CPU:0'):
    eval1 = model.evaluate(data1_test_X_padding, data1_test_y_1hot)

