In [43]:
import pandas as pd
import random

# Read the training table once, then pull out the two columns used downstream:
# the listing title (model input) and the listing price (regression target).
data_csv = pd.read_csv('train_data.csv')
titles_csv, prices_csv = data_csv['title'], data_csv['price']

In [2]:
import pickle
from konlpy.tag import Okt
from IPython.display import clear_output
import random

# Load the cached tokenization artifacts when all three files are readable;
# otherwise rebuild them from the raw titles and rewrite the cache.
#
#   titles_words : list of token lists, one per title
#   dictionary   : list of distinct tokens; a token's id is its list position
#   titles_ids   : titles_words with every token replaced by its id
#
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load cache files that this notebook wrote itself.
try:
    with open("titles_words.bin", "rb") as f:
        titles_words = pickle.load(f)
    with open("dictionary.bin", "rb") as f:
        dictionary = pickle.load(f)
    with open("titles_ids.bin", "rb") as f:
        titles_ids = pickle.load(f)

except Exception as e:
    # Deliberate best-effort: any failure to read the cache triggers a rebuild.
    # The reason is kept so it can be reported once the rebuild has finished
    # (clear_output below would wipe a message printed now).
    rebuild_reason = repr(e)

    okt = Okt()
    words_set = set()
    titles_words = []
    total_titles = len(titles_csv)
    for count, title in enumerate(titles_csv, start=1):
        # okt.pos yields (token, tag) pairs; only the token text is kept.
        words = [token[0] for token in okt.pos(title, norm=True)]
        words_set.update(words)
        titles_words.append(words)
        clear_output(wait=True)
        print(f"{count} / {total_titles}")

    dictionary = list(words_set)
    random.shuffle(dictionary)

    # One-time reverse lookup. dictionary entries are unique (they come from a
    # set), so this yields exactly the ids that dictionary.index() would, but
    # without a linear scan of the vocabulary for every single token.
    # NOTE(review): ids start at 0, so one real token shares the value 0 with
    # any zero padding applied to these sequences later — confirm whether that
    # is acceptable.
    word_to_id = {word: idx for idx, word in enumerate(dictionary)}

    titles_ids = []
    total_titles = len(titles_words)
    for count, title in enumerate(titles_words, start=1):
        titles_ids.append([word_to_id[word] for word in title])
        clear_output(wait=True)
        print(f"{count} / {total_titles}")

    with open("titles_words.bin", "wb") as f:
        pickle.dump(titles_words, f)
    with open("dictionary.bin", "wb") as f:
        pickle.dump(dictionary, f)
    with open("titles_ids.bin", "wb") as f:
        pickle.dump(titles_ids, f)

    print(f"Token cache rebuilt (cache could not be loaded: {rebuild_reason})")

In [3]:
# Sanity check on the first title: its tokens, their ids, and the dictionary
# entry that the first id maps back to (should equal the first token).
print(
    titles_words[0],
    titles_ids[0],
    dictionary[titles_ids[0][0]],
    sep="\n",
)

['s', '8', '부품', '용', '무선', '충전', '패드']
[2766, 1235, 23, 5052, 702, 3126, 2617]
s


In [4]:
# Length of the longest id sequence; used as the padding width below.
max_len = max(map(len, titles_ids))
print(max_len)

29


In [5]:
from keras.preprocessing import sequence
import numpy as np

# Right-pad every id sequence to max_len so the titles form one 2-D array.
# NOTE(review): pad_sequences fills with 0 by default and token ids also start
# at 0, so padding is indistinguishable from dictionary[0] — confirm intent.
titles_ids_np = sequence.pad_sequences(
    titles_ids,
    maxlen=max_len,
    padding='post',
)
print(titles_ids_np)

# Prices as a column vector: one row per title, a single column.
prices_np = np.asarray(prices_csv).reshape(-1, 1)
print(prices_np)

print(type(titles_ids_np), type(prices_np))

[[2766 1235   23 ...    0    0    0]
 [5763 1175 2898 ...    0    0    0]
 [1606 1820 5297 ...    0    0    0]
 ...
 [5763 1175 6402 ...    0    0    0]
 [1606 5667 3915 ...    0    0    0]
 [4686 1922 1477 ...    0    0    0]]
[[ 10000]
 [ 10000]
 [ 10000]
 ...
 [ 50000]
 [200000]
 [270000]]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [67]:
import random

# Shuffle the row positions and split them 80 / 20 into train and test.
# A dedicated, seeded generator keeps the split identical across kernel
# restarts and does not disturb the global `random` state used elsewhere.
SPLIT_SEED = 42

index = list(range(len(titles_ids_np)))
random.Random(SPLIT_SEED).shuffle(index)

train_len = int(len(titles_ids_np) * 0.8)
train_index = index[:train_len]
test_index = index[train_len:]

print(len(titles_ids_np))
print(len(train_index))
print(len(test_index))

52948
42358
10590


In [90]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Select the train / test rows using the shuffled index lists.
X_train = titles_ids_np[train_index]
X_test = titles_ids_np[test_index]

# Raw (unscaled) test prices, kept for comparing against predictions.
y_test = prices_np[test_index]

# Fit the scaler on the TRAINING prices only. Fitting on all prices would let
# the test set's min / max leak into the training targets and make the test
# metrics optimistic. The fitted scaler is then applied to every row, so test
# prices outside the training range may fall slightly outside [0, 1].
mms = MinMaxScaler()
mms.fit(prices_np[train_index])
mmt_y = mms.transform(prices_np)

mmt_y_train = mmt_y[train_index]
mmt_y_test = mmt_y[test_index]

# Disabled alternative: standardized targets instead of min-max scaling.
# ss = StandardScaler()
# ss.fit(prices_np[train_index])
# st_y = ss.transform(prices_np)

# st_y_train = st_y[train_index]
# st_y_test = st_y[test_index]

print(X_test)
print(y_test)

[[1606 1235    0 ...    0    0    0]
 [5763 1175 6402 ...    0    0    0]
 [3294 7895 4885 ...    0    0    0]
 ...
 [5763 1175 4284 ...    0    0    0]
 [5763 2766 3481 ...    0    0    0]
 [5763 2766 2766 ...    0    0    0]]
[[240000]
 [610000]
 [ 60000]
 ...
 [320000]
 [ 15000]
 [220000]]


# Model Parameter Tuning
##### 1. Epoch 변경
##### 2. Activation 변경
##### 3. Optimizer 변경
##### 4. Dropout layer 추가 및 변경
##### 5. GRU, LSTM layer 추가 및 변경 (return_sequences=True 변경)
##### 6. Embed size 수정

In [99]:
import tensorflow as tf

# One embedding row per dictionary entry; token ids are positions in `dictionary`.
vocab_size = len(dictionary)
embed_size = 128

# Embedding -> two stacked GRUs -> single linear output (scaled price).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embed_size, input_shape=[None]))
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(tf.keras.layers.GRU(128, return_sequences=True))
model.add(tf.keras.layers.GRU(128))
# Variants explored during tuning (disabled):
# model.add(tf.keras.layers.GRU(64))
# model.add(tf.keras.layers.GRU(32))
# model.add(tf.keras.layers.GRU(16))
# model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1))
# model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# Regression setup: optimize mean squared error, also report mean absolute error.
mse_loss = tf.keras.losses.MeanSquaredError()
model.compile(
    loss=mse_loss,
    optimizer="adam",
    metrics=["mae"],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1076480   
_________________________________________________________________
gru_6 (GRU)                  (None, None, 128)         99072     
_________________________________________________________________
gru_7 (GRU)                  (None, 128)               99072     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 1,274,753
Trainable params: 1,274,753
Non-trainable params: 0
_________________________________________________________________


In [100]:
# Train against the min-max-scaled prices for five epochs with progress output.
model.fit(
    x=X_train,
    y=mmt_y_train,
    epochs=5,
    verbose=1,
)
# model.save("PredictPrice_model.h5")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [114]:
# Predict scaled prices for the test titles, map them back to the original
# price units with the fitted scaler, and show them next to the true prices
# (left column: actual, right column: predicted).
price_predict = model.predict(X_test)
predicted_prices = mms.inverse_transform(price_predict)
print(np.c_[y_test, predicted_prices])

[[240000.       160806.734375]
 [610000.       598937.      ]
 [ 60000.       170440.3125  ]
 ...
 [320000.       265466.21875 ]
 [ 15000.        85582.78125 ]
 [220000.       301973.      ]]


In [93]:
# Evaluate on the held-out set. Targets are the min-max-scaled prices, so both
# numbers are in scaled units, not in the original price units.
test_scores = model.evaluate(x=X_test, y=mmt_y_test)
mmt_test_mse, mmt_test_mae = test_scores
print(f"mmt_test_mse = {mmt_test_mse}, mmt_test_mae = {mmt_test_mae}")

mmt_test_mse = 7.637197995791212e-05, mmt_test_mae = 0.004520006477832794


In [94]:
# history = model.fit(X_train, st_y_train, epochs=5, verbose=1)

# st_test_mse, st_test_mae = model.evaluate(X_test, st_y_test)
# print(f"st_test_mse = {st_test_mse}, st_test_mae = {st_test_mae}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
