In [3]:
import pandas as pd

csv = pd.read_csv('train_data.csv')
csv.info()

print(csv.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52948 entries, 0 to 52947
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   52948 non-null  object
 1   price   52948 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 827.4+ KB
                      title  price
0            s8 부품용 무선충전 패드  10000
1                     갤럭시S1  10000
2             아이폰xs max 케이스  10000
3  아이폰 7/7+ 8/8+ 홈버튼 블랙 화이트  10000
4          IRIVER 이어폰 (새상품)  10000


### 설정

In [4]:
import tensorflow_datasets as tfds
import tensorflow as tf

In [5]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])
    plt.show()

### 입력 파이프라인 설정

In [6]:
from sklearn.model_selection import train_test_split

titles = csv['title']
prices = csv['price']

In [7]:
from konlpy.tag import Okt
from IPython.display import clear_output

okt = Okt()

splited_titles = []
count = 0
for title in titles:
    morphemes = []
    title_pos = okt.pos(title, norm=True)
    for morpheme in title_pos:
        morphemes.append(morpheme[0])
    splited_titles.append(morphemes)
    count += 1
    clear_output(wait=True)
    print(f"{count} / {len(titles)}")

52948 / 52948


In [35]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np


t = Tokenizer()
t.fit_on_texts(splited_titles)
vocab_size = len(t.word_index) + 1

encoded_docs = t.texts_to_sequences(splited_titles)

max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [37]:
# load embedding as a dict
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename,'r')
    lines = file.readlines()[1:]
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        weight_matrix[i] = embedding.get(word)
    return weight_matrix

# load embedding from file
raw_embedding = load_embedding('embedding_word2vec.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, t.word_index)


In [39]:
from keras.layers import Embedding
e = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=4, trainable=False)

In [63]:
from numpy import array

model = Sequential()
model.add(e)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the modelx
print(model.summary())
# fit the model
model.fit(padded_docs, splited_titles, epochs=50, verbose=0)

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 4, 100)            811400    
_________________________________________________________________
flatten_7 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 401       
Total params: 811,801
Trainable params: 401
Non-trainable params: 811,400
_________________________________________________________________
None


ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'str\'>"})'})

In [16]:
from gensim.models.word2vec import Word2Vec

# model_wv = Word2Vec(sentences = splited_titles, min_count = 1)
# model_wv.save("word2vec.model")

print('Vocabulary size: {}'.format(len(splited_titles)))

train_titles = []
count = 0
for title in splited_titles:
    vectors = []
    for morpheme in title:
        vectors.append(morpheme)
    train_titles.append(vectors)
    count += 1
    clear_output(wait=True)
    print(f"{count} / {len(splited_titles)}")

50935 / 52948


In [17]:
trainData, testData, trainResult, testResult = train_test_split(train_titles, prices)

print(len(trainData))
print(len(testData))
print(len(trainResult))
print(len(testResult))

39711
13237
39711
13237


In [18]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(1000, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])

model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          64000     
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 175,489
Trainable params: 175,489
Non-trainable params: 0
__________________________________________________