# Tokenizer

In [5]:
import wget

wget.download("https://github.com/euphoris/datasets/raw/master/imdb.zip")

'imdb (1).zip'

In [4]:
import pandas as pd
df = pd.read_csv('imdb.zip')
df.head()

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [None]:
# 다른 방법
"""
import requests

res = requests.get("https://github.com/euphoris/datasets/raw/master/imdb.zip")
"""

In [6]:
import tensorflow as tf

tk = tf.keras.preprocessing.text.Tokenizer(num_words = 2000, oov_token = "<unk>")

In [8]:
tk.fit_on_texts(df["review"])

In [9]:
tk.word_index

{'<unk>': 1,
 'the': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'is': 6,
 'this': 7,
 'i': 8,
 'it': 9,
 'to': 10,
 'in': 11,
 'was': 12,
 'movie': 13,
 'film': 14,
 'that': 15,
 'for': 16,
 'as': 17,
 'but': 18,
 'with': 19,
 'one': 20,
 'on': 21,
 'you': 22,
 'are': 23,
 'not': 24,
 'bad': 25,
 "it's": 26,
 'very': 27,
 'all': 28,
 'just': 29,
 'so': 30,
 'good': 31,
 'at': 32,
 'an': 33,
 'be': 34,
 'there': 35,
 'about': 36,
 'have': 37,
 'by': 38,
 'like': 39,
 'from': 40,
 'if': 41,
 'acting': 42,
 'time': 43,
 'out': 44,
 'his': 45,
 'or': 46,
 'really': 47,
 'great': 48,
 'even': 49,
 'he': 50,
 'who': 51,
 'were': 52,
 'has': 53,
 'see': 54,
 'my': 55,
 'characters': 56,
 'well': 57,
 'most': 58,
 'how': 59,
 'more': 60,
 'no': 61,
 'only': 62,
 'when': 63,
 'ever': 64,
 '10': 65,
 'movies': 66,
 'plot': 67,
 'story': 68,
 'made': 69,
 'some': 70,
 'they': 71,
 'best': 72,
 'because': 73,
 'your': 74,
 'can': 75,
 'also': 76,
 "don't": 77,
 'films': 78,
 'than': 79,
 'its': 80,
 'scrip

In [10]:
import joblib

joblib.dump(tk,"tokenizer.pkl")

['tokenizer.pkl']

# 전처리

In [11]:
import pandas as pd
import joblib

df = pd.read_csv("imdb.zip")
tk = joblib.load("tokenizer.pkl")

In [12]:
seqs = tk.texts_to_sequences(df["review"])

In [13]:
seqs

[[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142],
 [24,
  522,
  51,
  12,
  60,
  409,
  2,
  736,
  56,
  46,
  2,
  337,
  1220,
  288,
  5,
  737,
  738,
  44],
 [1221,
  1222,
  19,
  209,
  233,
  3,
  338,
  184,
  739,
  2,
  13,
  289,
  740,
  49,
  60,
  339,
  17,
  2,
  42,
  12,
  290,
  3,
  2,
  67,
  3,
  261,
  210,
  340,
  1223],
 [27, 116, 128, 46, 234, 10, 523, 5],
 [2,
  72,
  129,
  11,
  2,
  13,
  12,
  63,
  1224,
  6,
  410,
  10,
  185,
  4,
  411,
  15,
  741,
  742,
  186,
  45,
  524],
 [2,
  412,
  5,
  2,
  13,
  291,
  143,
  743,
  525,
  41,
  26,
  36,
  1225,
  9,
  341,
  8,
  342,
  73,
  26,
  744],
 [262, 144, 413],
 [155, 2, 13, 526, 3, 156, 9, 12, 4, 31, 1226, 31, 1227, 16, 235],
 [4, 343, 187],
 [188, 2, 292, 5, 745, 1228, 17, 2, 1229, 1230],
 [3, 157, 1231, 1232, 52, 414],
 [2, 13, 527, 4, 211, 5, 1233, 32, 26, 72, 69, 9, 107, 27, 746],
 [2, 415, 52, 2, 72, 3, 2, 1234, 52, 30, 293],
 [9, 12, 30, 263],
 [7, 6, 4, 27, 145, 2

In [14]:
tk.index_word[4], tk.index_word[27],tk.index_word[287],tk.index_word[407]

('a', 'very', 'slow', 'moving')

In [16]:
seq = seqs[0]
list(range(0,len(seq)-4))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [19]:
data = []
for seq in seqs:
    for i in range(0,len(seq)-4):
        data.append((seq[i:i+4],seq[i+4]))

In [20]:
import random

random.shuffle(data)
data

[([157, 1, 12, 69], 727),
 ([667, 1787, 668, 992], 3),
 ([87, 509, 3, 1142], 3),
 ([6, 553, 27, 1], 11),
 ([4, 1, 1, 2], 1),
 ([565, 9, 12, 57], 566),
 ([1, 30, 710, 11], 2),
 ([2, 1477, 878, 757], 8),
 ([2, 1, 2, 1], 121),
 ([12, 395, 17, 1], 3),
 ([6, 2, 1, 5], 1095),
 ([17, 71, 133, 152], 4),
 ([425, 1, 1156, 1157], 3),
 ([2, 229, 68, 291], 4),
 ([428, 333, 2, 13], 18),
 ([59, 10, 1, 45], 78),
 ([95, 9, 6, 36], 7),
 ([1584, 5, 2, 27], 1585),
 ([70, 1042, 683, 684], 118),
 ([221, 56, 73, 439], 28),
 ([645, 15, 440, 23], 4),
 ([7, 1465, 3, 8], 875),
 ([49, 33, 1167, 17], 4),
 ([1, 1, 3, 2], 372),
 ([5, 55, 72, 839], 1401),
 ([8, 100, 24, 225], 95),
 ([896, 16, 2, 14], 94),
 ([46, 429, 73, 5], 33),
 ([36, 1006, 1007, 6], 2),
 ([2, 1555, 5, 1556], 612),
 ([98, 71, 172, 33], 122),
 ([3, 20, 5, 2], 72),
 ([715, 39, 83, 68], 617),
 ([1982, 98, 1983, 1984], 1041),
 ([993, 1789, 270, 4], 1790),
 ([1, 59, 48, 50], 192),
 ([1, 2, 1, 16], 24),
 ([7, 6, 20, 5], 2),
 ([153, 31, 11, 7], 14),
 ([8,

In [25]:
import numpy as np

xs = np.array([x for x,y in data])
ys = np.array([y for x,y in data])

In [22]:
xs

array([[ 157,    1,   12,   69],
       [ 667, 1787,  668,  992],
       [  87,  509,    3, 1142],
       ...,
       [  35,    6,   61,   67],
       [  54, 1432,  861,  270],
       [   1,  148,    1,  271]])

In [26]:
ys

array([727,   3,   3, ..., 127, 862,  37])

In [27]:
joblib.dump((xs,ys),"im-data.pkl")

['im-data.pkl']

# 학습

In [1]:
import joblib

tk = joblib.load("tokenizer.pkl")
xs,ys = joblib.load("im-data.pkl")

In [2]:
xs

array([[ 157,    1,   12,   69],
       [ 667, 1787,  668,  992],
       [  87,  509,    3, 1142],
       ...,
       [  35,    6,   61,   67],
       [  54, 1432,  861,  270],
       [   1,  148,    1,  271]])

In [3]:
import tensorflow as tf

In [8]:
# 단어 개수에 패딩 숫자를 추가한다.
NUM_WORD = tk.num_words + 1

In [5]:
# tk의 단어 리스트를 확인한다.
tk.index_word

{1: '<unk>',
 2: 'the',
 3: 'and',
 4: 'a',
 5: 'of',
 6: 'is',
 7: 'this',
 8: 'i',
 9: 'it',
 10: 'to',
 11: 'in',
 12: 'was',
 13: 'movie',
 14: 'film',
 15: 'that',
 16: 'for',
 17: 'as',
 18: 'but',
 19: 'with',
 20: 'one',
 21: 'on',
 22: 'you',
 23: 'are',
 24: 'not',
 25: 'bad',
 26: "it's",
 27: 'very',
 28: 'all',
 29: 'just',
 30: 'so',
 31: 'good',
 32: 'at',
 33: 'an',
 34: 'be',
 35: 'there',
 36: 'about',
 37: 'have',
 38: 'by',
 39: 'like',
 40: 'from',
 41: 'if',
 42: 'acting',
 43: 'time',
 44: 'out',
 45: 'his',
 46: 'or',
 47: 'really',
 48: 'great',
 49: 'even',
 50: 'he',
 51: 'who',
 52: 'were',
 53: 'has',
 54: 'see',
 55: 'my',
 56: 'characters',
 57: 'well',
 58: 'most',
 59: 'how',
 60: 'more',
 61: 'no',
 62: 'only',
 63: 'when',
 64: 'ever',
 65: '10',
 66: 'movies',
 67: 'plot',
 68: 'story',
 69: 'made',
 70: 'some',
 71: 'they',
 72: 'best',
 73: 'because',
 74: 'your',
 75: 'can',
 76: 'also',
 77: "don't",
 78: 'films',
 79: 'than',
 80: 'its',
 81: 's

In [6]:
# 신경망에 데이터를 추가하려면 원-핫-인코딩으로 바꿔야한다.
# 임베딩 레이어에 넣으면 원핫인코딩을 해준다.
xs[0]

array([157,   1,  12,  69])

In [9]:
emb1 = tf.keras.layers.Embedding(input_dim=NUM_WORD, output_dim=8)

In [11]:
lm = tf.keras.Sequential([emb1,
                          tf.keras.layers.GlobalAveragePooling1D(),  
                          tf.keras.layers.Dense(8,activation="relu"),
                          tf.keras.layers.Dense(NUM_WORD)])

In [12]:
lm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d (Gl (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 2001)              18009     
Total params: 34,089
Trainable params: 34,089
Non-trainable params: 0
_________________________________________________________________


In [14]:
lm.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
           optimizer = "adam",
           metrics = ["accuracy"])

In [15]:
lm.fit(xs,ys,epochs=1)



<tensorflow.python.keras.callbacks.History at 0x1769f6e6bb0>

In [16]:
lm.save("lm.krs")

INFO:tensorflow:Assets written to: lm.krs\assets


# 단어임베딩

In [19]:
e = emb1.embeddings.numpy()
e

array([[-0.03655555,  0.04855568,  0.00845027, ...,  0.02097121,
        -0.02370548,  0.04710528],
       [-0.29190552,  0.34135807,  0.38596955, ...,  0.35832945,
        -0.33485508,  0.32515934],
       [-0.29935396,  0.34875542,  0.25040045, ...,  0.2539849 ,
        -0.30680662,  0.3311435 ],
       ...,
       [-0.01446306,  0.06047182, -0.00207875, ...,  0.05196074,
        -0.06755789,  0.0010679 ],
       [-0.05285018,  0.03412558,  0.07062058, ..., -0.00049395,
         0.01018862,  0.00154309],
       [ 0.01769087, -0.04614148, -0.04967233, ...,  0.04305117,
         0.00496145, -0.04164815]], dtype=float32)

In [21]:
w = emb1.get_weights()[0]
w

array([[-0.03655555,  0.04855568,  0.00845027, ...,  0.02097121,
        -0.02370548,  0.04710528],
       [-0.29190552,  0.34135807,  0.38596955, ...,  0.35832945,
        -0.33485508,  0.32515934],
       [-0.29935396,  0.34875542,  0.25040045, ...,  0.2539849 ,
        -0.30680662,  0.3311435 ],
       ...,
       [-0.01446306,  0.06047182, -0.00207875, ...,  0.05196074,
        -0.06755789,  0.0010679 ],
       [-0.05285018,  0.03412558,  0.07062058, ..., -0.00049395,
         0.01018862,  0.00154309],
       [ 0.01769087, -0.04614148, -0.04967233, ...,  0.04305117,
         0.00496145, -0.04164815]], dtype=float32)

In [23]:
import numpy as np
np.array_equal(e,w)

True

In [24]:
np.savez("word-emb.npz",emb = e)

# GlobalAveragePooling1D

In [25]:
import tensorflow as tf
import numpy as np

In [27]:
# 임베딩된 단어라 볼 수 있다.

x = np.array([[[1,2,3],[3,6,9]]], dtype="float32")
x

array([[[1., 2., 3.],
        [3., 6., 9.]]], dtype=float32)

In [28]:
# 1은 데이터가 1개가 들어갔다.
# 3은 벡터의 길이
# 2는 이러한 벡터가 2개있다.
# 즉 (1,2,3)은 1은 총 1개의 단어가 들어간다.
x.shape

(1, 2, 3)

In [29]:
avg = tf.keras.layers.GlobalAveragePooling1D()

<tensorflow.python.keras.layers.pooling.GlobalAveragePooling1D at 0x1769d55a490>

In [31]:
y = avg(x).numpy()
y

array([[2., 4., 6.]], dtype=float32)

In [32]:
y.shape

(1, 3)

# 다음 토큰의 예측 확률

In [34]:
import joblib

tk = joblib.load("tokenizer.pkl")
xs,ys = joblib.load("im-data.pkl")

In [35]:
import tensorflow as tf

lm = tf.keras.models.load_model("lm.krs")

In [36]:
x = xs[0:1]
y = ys[0]

In [37]:
x

array([[157,   1,  12,  69]])

In [38]:
y

727

In [39]:
[tk.index_word[i] for i in x[0]]

['those', '<unk>', 'was', 'made']

In [41]:
tk.index_word[y]

'wrong'

In [42]:
import numpy as np

logit = lm.predict(x.astype("float32"))
logit.shape

(1, 2001)

In [43]:
logit

array([[-4.064345 ,  3.9775379,  3.4067967, ..., -4.0593133, -3.9768836,
        -3.9826217]], dtype=float32)

In [44]:
p = tf.nn.softmax(logit).numpy()

In [45]:
p

array([[2.9885439e-05, 9.2897676e-02, 5.2497078e-02, ..., 3.0036192e-05,
        3.2616961e-05, 3.2430347e-05]], dtype=float32)

In [46]:
p[0,y]

0.0003761839

In [47]:
p.argmax()

1

# 전이학습

In [48]:
import pandas as pd

df = pd.read_csv("imdb.zip")

In [50]:
import joblib

tk = joblib.load("tokenizer.pkl")

In [51]:
# 텍스트를 숫자로 변형
seqs = tk.texts_to_sequences(df["review"])

In [52]:
seqs[0]

[4, 27, 27, 27, 287, 407, 1217, 13, 36, 4, 1218, 1219, 408, 142]

In [53]:
import tensorflow as tf

pads = tf.keras.preprocessing.sequence.pad_sequences(seqs)

In [55]:
pads[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    4,   27,   27,   27,  287,  407, 1217,
         13,   36,    4, 1218, 1219,  408,  142])

In [56]:
import numpy as np

z = np.load("word-emb.npz")
e = z["emb"]

## 감성분석

In [57]:
emb2 = tf.keras.layers.Embedding(input_dim= tk.num_words+1,
                                 output_dim = 8,
                                 embeddings_initializer = tf.keras.initializers.Constant(e))

In [58]:
model = tf.keras.Sequential([emb2,
                             tf.keras.layers.GlobalAveragePooling1D(),
                             tf.keras.layers.Dense(8, activation="relu"),
                             tf.keras.layers.Dense(1,activation="sigmoid")])

In [59]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 8)           16008     
_________________________________________________________________
global_average_pooling1d_2 ( (None, 8)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 9         
Total params: 16,089
Trainable params: 16,089
Non-trainable params: 0
_________________________________________________________________


In [60]:
model.compile(loss = "binary_crossentropy",
              optimizer = "adam",
              metrics = ["accuracy"])

In [61]:
y = df["sentiment"].values

In [62]:
model.fit(pads, y)



<tensorflow.python.keras.callbacks.History at 0x176a1bb1820>