In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as tf_keras

In [60]:
# 데이터 준비
df = pd.read_csv('data_files/SMSSpamCollection.tsv', sep = "\t", header = None, 
                 names=['target', 'message'])
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.info()                       # 전체 5572
print(df.shape[0] * 0.8)        # 4457.60 > 4500
print(df.shape[0] * 0.8 * 0.8)  # 3566.08 > 3500

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   target   5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
4457.6
3566.0800000000004


In [None]:
# target 컬럼 숫자로 변환 (ham:1, spam:0)
df['target'] = df['target'].map(lambda x: 1 if x == 'ham' else 0)

In [None]:
# train, validation, test 데이터프레임 나누기
train_data = df.iloc[:3500]
validation_data = df.iloc[3500:4500]
test_data = df.iloc[4500:]

In [35]:
# 각각의 데이터프레임을 데이터셋으로 만들기
def dataframe_to_dataset(df, text_column, label_column):
    text = df[text_column].values
    label = df[label_column].values
    dataset = tf.data.Dataset.from_tensor_slices((text, label))
    return dataset

# train 데이터셋
train_dataset = dataframe_to_dataset(train_data, 'message', 'target')
# validation 데이터셋
validation_dataset = dataframe_to_dataset(validation_data, 'message', 'target')
# test 데이터셋
test_dataset = dataframe_to_dataset(test_data, 'message', 'target')

# 배치 크기 32로 설정
train_dataset = train_dataset.batch(32)
validation_dataset = validation_dataset.batch(32)
test_dataset = test_dataset.batch(32)

In [36]:
# 메세지만 있는 데이터셋 만들기
message_only_dataset = train_dataset.map(lambda message, target: message)

In [37]:
# 메세지만 있는 데이터셋으로 단어사전 만들기
text_vectorizer = tf_keras.layers.TextVectorization(max_tokens=20000, 
                                                    output_mode='int', 
                                                    output_sequence_length=100)

text_vectorizer.adapt(message_only_dataset)

In [38]:
# 단어 사전 확인
dictionary = text_vectorizer.get_vocabulary()
print(len(dictionary))
# 단어 사전에 없는 단어 표시 : UNK(unknown) (혹은 oov (out of vocabulary))
dictionary[10:20]

7437


['in', 'me', 'my', 'for', 'your', 'of', 'have', 'it', 'on', 'call']

In [39]:
# (문장 -> 숫자 리스트) 변환기 테스트
for X, y in train_dataset:
    d = text_vectorizer(X)  # 변환 실행 [X: (32, 1) -> X: (32, 300)]
    # batch_size=32라서 X는 32문장이 나오는데, 위에서 한 문장당 300 단어 (정확히는 토큰) 쓰기로 했음
    print(d.shape)
    print(d)
    break

(32, 100)
tf.Tensor(
[[  51  369 5339 ...    0    0    0]
 [  46  293 2673 ...    0    0    0]
 [  54  392   10 ...    0    0    0]
 ...
 [6952  131    3 ...    0    0    0]
 [ 200  140   83 ...    0    0    0]
 [ 170   76   60 ...    0    0    0]], shape=(32, 100), dtype=int64)


In [40]:
# 숫자로 인코딩된 문장을 원래 문장으로 복원
print(d[1][:10].numpy())    # .numpy()를 붙이면 숫자만 볼 수 있다. 배치처리해야 [1]붙이고, 아니라면 없애
for t in d[1]:
    if t != 0:
        print(dictionary[t], end=" ")

[  46  293 2673  524    7 2522    0    0    0    0]
ok lar joking wif u oni 

In [41]:
# Embedding 모델 만들기
input = tf_keras.layers.Input(shape=(None, ))
output = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(input)

embedding_model = tf_keras.models.Model(input, output)

In [42]:
for review in message_only_dataset:
    # print(review)
    vectorized_message = text_vectorizer(message)                 # 단어 1개 -> 숫자 1개
    embedded_message = embedding_model(vectorized_message)        # 숫자 1개 -> 숫자 100개 
    break

In [43]:
vectorized_message.shape, embedded_message.shape

(TensorShape([32, 100]), TensorShape([32, 100, 100]))

In [53]:
vectorized_train_dataset = train_dataset.map(lambda message, label: (text_vectorizer(message), label))
vectorized_validation_dataset = validation_dataset.map(lambda message, label: (text_vectorizer(message), label))
vectorized_test_dataset = test_dataset.map(lambda message, label: (text_vectorizer(message), label))

In [54]:
# 변경 확인
for X, y in vectorized_train_dataset:
    print(X)
    break

tf.Tensor(
[[  51  369 5339 ...    0    0    0]
 [  46  293 2673 ...    0    0    0]
 [  54  392   10 ...    0    0    0]
 ...
 [6952  131    3 ...    0    0    0]
 [ 200  140   83 ...    0    0    0]
 [ 170   76   60 ...    0    0    0]], shape=(32, 100), dtype=int64)


In [55]:
# 모델 구조 설계 : 텍스트 데이터 처리를 위한 순환신경망 모델

input = tf_keras.layers.Input(shape=(None,))
x = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(input)   # None, 300, 100
x = tf_keras.layers.LSTM(units=16)(x)
output = tf_keras.layers.Dense(units=1, activation='sigmoid')(x)

model = tf_keras.models.Model(input, output)

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_2 (Embedding)     (None, None, 100)         2000000   
                                                                 
 lstm_1 (LSTM)               (None, 16)                7488      
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2007505 (7.66 MB)
Trainable params: 2007505 (7.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [56]:
# 모델 학습 설계
model.compile(loss='binary_crossentropy', 
              optimizer = 'adam',
              metrics = ['accuracy'])

In [57]:
# 모델 학습
fit_history = model.fit(vectorized_train_dataset, 
                        epochs=10, 
                        validation_data=vectorized_validation_dataset)

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.evaluate(vectorized_train_dataset), model.evaluate(vectorized_test_dataset)
# train: ([0.394575834274292, 0.8659999966621399],
# test:  [0.3897154927253723, 0.8684701323509216])



([0.394575834274292, 0.8659999966621399],
 [0.3897154927253723, 0.8684701323509216])

# 선생님과 함께

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as tf_keras

In [63]:
spam_df = pd.read_csv('data_files/SMSSpamCollection.tsv', 
                      sep='\t', header=None, names=['label', 'msg'])
spam_df.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
# 특성과 타겟 나누기
from sklearn.preprocessing import LabelEncoder  # ham, spam 2개 뿐이라서 labelEncoder 괜찮
from sklearn.model_selection import train_test_split

X, y = spam_df['msg'], spam_df['label']
y = LabelEncoder().fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [None]:
# 데이터프레임을 데이터셋으로 변환하기
# tf.data.Dataset.from_tensor_slices: dataframe 이나 numpy array 을 tensorflow's dataset 으로 바꾸기
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

In [70]:
# dataset 구성 확인 : 배치 처리 하지 않아 문제 발생
for X2, y2 in train_dataset:
    print(X2.shape, y2.shape)   # () ()
    print(X2)
    break

() ()
tf.Tensor(b'Orh i tot u say she now still dun believe.', shape=(), dtype=string)


In [None]:
batched_train_dataset = train_dataset.shuffle(buffer_size=3).batch(32)
batched_test_dataset = test_dataset.shuffle(buffer_size=3).batch(32)
# shuffle(buffer_size=3) 데이터 3개 공간 만들고 섞은 데이터에서 3개 추출 한개 쓰면 다시 랜덤에서 추출

In [73]:
# dataset 구성 확인 : 배치 처리 후
for X2, y2 in batched_train_dataset:
    print(X2.shape, y2.shape)   # (32,) (32,)
    print(X2[0])
    break

(32,) (32,)
tf.Tensor(b'Orh i tot u say she now still dun believe.', shape=(), dtype=string)


In [92]:
# 단어 사전 만들기
msg_only_dataset = batched_train_dataset.map(lambda label, msg: msg)

text_vector = tf_keras.layers.TextVectorization(max_tokens=20000, 
                                                output_mode='int',
                                                output_sequence_length=300)

In [93]:
# Embedding 모델 만들기
input = tf_keras.layers.Input(shape=(None,))
output = tf_keras.layers.Embedding(input_dim=20000, output_dim=300)(input)

embedding_model = tf_keras.models.Model(input, output)

In [94]:
for msg in msg_only_dataset:
    vectorized_msg = text_vector(msg)                 # 단어 1개 -> 숫자 1개
    embedded_msg = embedding_model(vectorized_msg)        # 숫자 1개 -> 숫자 100개 
    break

UnimplementedError: Exception encountered when calling layer 'text_vectorization_6' (type TextVectorization).

{{function_node __wrapped__Cast_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cast int32 to string is not supported [Op:Cast] name: 

Call arguments received by layer 'text_vectorization_6' (type TextVectorization):
  • inputs=tf.Tensor(shape=(32,), dtype=int32)