In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

import os, sys
import tensorflow.keras.layers as lay

In [2]:
import rhinoMorph
rn = rhinoMorph.startRhino()

filepath:  C:\Anaconda3\lib\site-packages
classpath:  C:\Anaconda3\lib\site-packages\rhinoMorph/lib/rhino.jar
RHINO started!


###  데이터

In [14]:
class Rawdata:
    def __init__(self):
        train_df = pd.read_csv('train.csv')
        test_df = pd.read_csv('test.csv')
        
        # 훈련데이터 타겟
        self.train_y_cat1 = train_df['cat1']
        self.train_y = train_df['cat3']
        
        # 문장데이터 / 형태소분석
        train_sentence_temp = train_df['overview']
        test_sentence_temp = test_df['overview']
        
        pos = ['NNP','NNG','XR','IC','VV','VA','MM','MAG']
        train_morph = []
        test_morph = []
        
        for i in tqdm(range(len(train_sentence_temp))):
            train_morph.append(rhinoMorph.onlyMorph_list(rn, train_sentence_temp[i], pos = pos))
        for i in tqdm(range(len(test_sentence_temp))):
            test_morph.append(rhinoMorph.onlyMorph_list(rn, test_sentence_temp[i], pos = pos))
            
        self.train_sentence = pd.Series(train_morph)
        self.test_sentence = pd.Series(test_morph)
        
        # 이미지 데이터 / 리사이징 -> (128,128)
        train_img_path = train_df['img_path']
        test_img_path = test_df['img_path']
        train_img = []
        test_img = []
        
        for i in tqdm(range(len(train_img_path))):
            temp = Image.open(train_img_path[i])
            temp = np.array(temp.resize((128,128)))
            train_img.append(temp)
            
        for i in tqdm(range(len(test_img_path))):
            temp = Image.open(test_img_path[i])
            temp = np.array(temp.resize((128,128)))
            test_img.append(temp)
            
        self.train_img = np.array(train_img)
        self.test_img = np.array(test_img)
        
    def load_sentence(self):
        return (self.train_sentence, self.test_sentence)
    
    def load_img(self):
        return (self.train_img, self.test_img)
    
    def load_target(self, cat1 = False):
        if cat1 == True:
            return (self.train_y_cat1, self.train_y)
        return self.train_y

In [15]:
rawdata = Rawdata()

100%|███████████████████████████████████████████████████████████████████████████| 16986/16986 [01:24<00:00, 201.13it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 7280/7280 [00:37<00:00, 193.69it/s]
100%|████████████████████████████████████████████████████████████████████████████| 16986/16986 [03:31<00:00, 80.15it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 7280/7280 [01:32<00:00, 78.96it/s]


In [25]:
np.save('train_img.npy', rawdata.load_img()[0])
np.save('test_img.npy', rawdata.load_img()[1])

In [28]:
pd.DataFrame(rawdata.load_sentence()[0]).to_csv('train_sentence.csv')
pd.DataFrame(rawdata.load_sentence()[1]).to_csv('test_sentence.csv')

In [30]:
from tensorflow.keras.layers import Dense, Dropout, Input

In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

In [32]:
tk = Tokenizer(num_words=10000)

In [33]:
tk.fit_on_texts(rawdata.load_sentence()[0])

In [35]:
X = tk.texts_to_sequences(rawdata.load_sentence()[0])

In [48]:
result = np.zeros((len(X), 10000))
for i in range(len(result)):
    result[i,X[i]] += 1

In [37]:
y = rawdata.load_target(cat1=True)[0]

In [60]:
from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
enc.fit(y)
yy = enc.transform(y)

In [40]:
from tensorflow.keras.layers import Input, Dense, Dropout

In [70]:
from sklearn.model_selection import train_test_split

In [71]:
train_x, test_x , train_y, test_y = train_test_split(result, yy)

In [77]:
inputs = Input(shape=(10000,))

d1 = Dense(64, activation='relu')(inputs)
dr1 = Dropout(0.5)(d1)
d2 = Dense(256, activation='relu')(dr1)
dr2 = Dropout(0.4)(d2)
d3 = Dense(128, activation='relu')(dr2)
dr3 = Dropout(0.4)(d3)

outputs = Dense(len(enc.classes_), activation='softmax')(dr3)

In [78]:
model = tf.keras.models.Model(inputs, outputs)

In [79]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [80]:
model.fit(train_x, train_y, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b101285148>

In [116]:
model.evaluate(test_x, test_y)



[0.6910805702209473, 0.880621612071991]

In [83]:
np.argmax(model.predict(test_x[:5]), axis=1)

array([1, 0, 5, 3, 0], dtype=int64)

In [84]:
enc.classes_[1]

'쇼핑'

In [89]:
result.shape

(16986, 10000)

In [94]:
class0_label = data['cat3'][data['cat1'] == enc.classes_[0]].unique()

In [99]:
sub0_enc = LabelEncoder()
sub0_enc.fit(class0_label)
class0_target = data['cat3'][data['cat1'] == enc.classes_[0]]
class0_y = sub0_enc.transform(class0_target)

In [95]:
class0_text = result[data['cat1'] == enc.classes_[0]]

In [100]:
train_x0, test_x0, train_y0, test_y0 = train_test_split(class0_text, class0_y)

In [102]:
sub0_inputs = Input(shape=(10000,))

sub0_d1 = Dense(64, activation='relu')(sub0_inputs)
sub0_dr1 = Dropout(0.5)(sub0_d1)
sub0_d2 = Dense(256, activation='relu')(sub0_dr1)
sub0_dr2 = Dropout(0.4)(sub0_d2)
sub0_d3 = Dense(128, activation='relu')(sub0_dr2)
sub0_dr3 = Dropout(0.4)(sub0_d3)

sub0_outputs = Dense(len(sub0_enc.classes_), activation='softmax')(sub0_dr3)


sub_model_0 = tf.keras.models.Model(sub0_inputs, sub0_outputs)

In [103]:
sub_model_0.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [113]:
sub_model_0.fit(train_x0, train_y0, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1b17e5ac3c8>

In [114]:
sub_model_0.evaluate(test_x0, test_y0)



[0.6377416253089905, 0.8928024768829346]

In [117]:
temp_result = model.predict(test_x)

In [119]:
temp_result = np.argmax(temp_result, axis=1)

In [120]:
new_x = test_x[temp_result ==0]
new_y = test_y[temp_result ==0]

In [121]:
sub_model_0.evaluate(new_x, new_y)



[47.4585075378418, 0.00486223679035902]