In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [0]:
import os
import glob
import pandas as pd

def read(fp):
    with open(fp, "r", encoding="utf-8") as f:
        content = f.read()
    return content

def read_data(base):
    pos = glob.glob(os.path.join(base, "pos", "*"))
    neg = glob.glob(os.path.join(base, "neg", "*"))
    df = pd.DataFrame({
        "path":(neg + pos),
        "target":([0]*len(neg) + [1] * len(pos))
    })
    df["content"] = df["path"].apply(read)
    return df

In [17]:
dirname = os.path.dirname(dataset)
base = os.path.join(dirname, "aclImdb", "train")
train_df = read_data(base)
base = os.path.join(dirname, "aclImdb", "test")
test_df = read_data(base)
test_df

Unnamed: 0,path,target,content
0,/root/.keras/datasets/aclImdb/test/neg/9801_2.txt,0,"In the 1940s, Veronica Lake made a meteoric ri..."
1,/root/.keras/datasets/aclImdb/test/neg/6813_2.txt,0,After reading the reviews I decided to rent th...
2,/root/.keras/datasets/aclImdb/test/neg/3265_1.txt,0,This movie is so God-awful that it was literal...
3,/root/.keras/datasets/aclImdb/test/neg/7386_4.txt,0,Even though this is the first film by the brok...
4,/root/.keras/datasets/aclImdb/test/neg/3918_1.txt,0,I must admit that this is one of the worst mov...
...,...,...,...
24995,/root/.keras/datasets/aclImdb/test/pos/6384_10...,1,I saw this movie at a screener and its the bes...
24996,/root/.keras/datasets/aclImdb/test/pos/9051_8.txt,1,This 60min film shows just how much fun filmma...
24997,/root/.keras/datasets/aclImdb/test/pos/3995_9.txt,1,It is not surprising that this film was made b...
24998,/root/.keras/datasets/aclImdb/test/pos/12320_1...,1,"The last film of John Huston, the great Americ..."


In [0]:
# 預處理1. 先把文字化成數字
from tensorflow.keras.preprocessing.text import Tokenizer
# 出現太少的詞, 你可以選擇不看, 只留出現次數最高的2000(num_words)
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(train_df["content"])

In [0]:
# 想要看每個單詞被給的編號: 
# tok.word_index

In [0]:
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=256)
x_test_pad = pad_sequences(x_test_seq, maxlen=256)
pd.DataFrame(x_train_pad)

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dropout, Dense

layers = [
    # 2001 * 64 = 128064
    Embedding(2001, 64, mask_zero=True, input_length=256),
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.25),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 256, 64)           128064    
_________________________________________________________________
flatten_3 (Flatten)          (None, 16384)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 256)               4194560   
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 514       
Total params: 4,323,138
Trainable params: 4,323,138
Non-trainable params: 0
_________________________________________________________________


In [0]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer="adam",
              metrics=["accuracy"])

In [39]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    EarlyStopping(patience=3, restore_best_weights=True),
]
model.fit(x_train_pad,
          np.array(train_df["target"]),
          batch_size=200,
          epochs=100,
          validation_split=0.1,
          verbose=2,
          callbacks=callbacks)

Epoch 1/100
113/113 - 15s - loss: 0.5042 - accuracy: 0.7278 - val_loss: 0.3884 - val_accuracy: 0.8468
Epoch 2/100
113/113 - 17s - loss: 0.1969 - accuracy: 0.9240 - val_loss: 0.5474 - val_accuracy: 0.7856
Epoch 3/100
113/113 - 17s - loss: 0.0636 - accuracy: 0.9811 - val_loss: 0.5231 - val_accuracy: 0.8328
Epoch 4/100
113/113 - 17s - loss: 0.0146 - accuracy: 0.9976 - val_loss: 0.6971 - val_accuracy: 0.8248


<tensorflow.python.keras.callbacks.History at 0x7fc9f4084198>

In [40]:
model.evaluate(x_test_pad, np.array(test_df["target"]))



[0.32610467076301575, 0.8577600121498108]

In [44]:
layers = [
    # 2001 * 64 = 128064
    Embedding(2001, 64, mask_zero=True),
]
embedding = Sequential(layers)
w = model.layers[0].get_weights()
embedding.set_weights(w)
embedding.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 64)          128064    
Total params: 128,064
Trainable params: 128,064
Non-trainable params: 0
_________________________________________________________________


In [45]:
embedding.predict([[1]])

array([[[ 0.0261382 , -0.01586764, -0.00016509, -0.02717868,
          0.02160234,  0.03405885, -0.01373189,  0.0137777 ,
          0.00644511,  0.01058135,  0.01371132,  0.0589944 ,
          0.0184916 ,  0.02367891,  0.0174635 , -0.04439824,
          0.00085253,  0.0327115 ,  0.02847491,  0.01086224,
          0.01775193,  0.02886376, -0.0126312 ,  0.02982564,
          0.00899552,  0.02932937, -0.0275043 , -0.00365982,
         -0.04048689,  0.00455712, -0.01618486, -0.00615302,
         -0.03273933,  0.01812304,  0.01681015,  0.02123044,
         -0.00847221, -0.02978302,  0.03369495,  0.05089099,
         -0.02063723, -0.00056442,  0.02306974, -0.00363007,
         -0.00683866, -0.02056108, -0.02128604, -0.03502803,
          0.00841818,  0.01750068,  0.01047075,  0.0297055 ,
         -0.0043635 , -0.00716791,  0.03463337,  0.020104  ,
          0.01837786,  0.03378546,  0.03589766,  0.03752119,
         -0.02386323,  0.03919845, -0.02902404, -0.00570403]]],
      dtype=float32)