<a href="https://colab.research.google.com/github/AcerPing/AceTibaMe/blob/master/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

In [None]:
import os 
import glob
import pandas as pd

def getdata(mid):
  dn = os.path.dirname(dataset)
  posfn = glob.glob(os.path.join(dn,"aclImdb",mid,"pos","*"))
  negfn = glob.glob(os.path.join(dn,"aclImdb",mid,"neg","*"))
  contents = []
  for fn in posfn + negfn:
    with open(fn,encoding="utf-8") as f:
      contents.append(f.read())
  df = pd.DataFrame({
      "content":contents,
      "sentiment":[1]*len(posfn) + [0]*len(negfn)})
  return df




In [None]:
train_df = getdata("train")
test_df = getdata("test")
train_df
# test_df

In [None]:
dataset

In [None]:
#Tokenizer:把詞化成數字
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])

In [None]:
#0不要用掉，0是拿來padding的
tok.word_index

In [None]:
#sequences:數字序列
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

In [None]:
#截長補短到每一篇文章都是同樣詞數
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=512)
x_test_pad = pad_sequences(x_test_seq, maxlen=512)
pd.DataFrame(x_train_pad)

In [9]:
import numpy as np
y_train = np.array(train_df["sentiment"])
y_test = np.array(test_df["sentiment"])

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Dropout
INPUT_DIM = 3001
OUTPUT_DIM = 128
INPUT_LENGTH = 512
layers = [Embedding(INPUT_DIM,OUTPUT_DIM,mask_zero=True,input_length=INPUT_LENGTH),
      Flatten(),
      Dense(256,activation="relu"),
      Dropout(0.25),
      Dense(2,activation="softmax")]
model = Sequential(layers)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 512, 128)          384128    
_________________________________________________________________
flatten (Flatten)            (None, 65536)             0         
_________________________________________________________________
dense (Dense)                (None, 256)               16777472  
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 17,162,114
Trainable params: 17,162,114
Non-trainable params: 0
_________________________________________________________________


In [11]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer="adam",
       metrics=["accuracy"])

In [12]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    ModelCheckpoint("embedding.h5", save_best_only=True),
    EarlyStopping(patience=5, restore_best_weights=True)       
]
model.fit(x_train_pad, 
     y_train, 
     batch_size=200,
     validation_split=0.1,
     epochs=50,
     verbose=2,
     callbacks=callbacks)

Epoch 1/50
113/113 - 7s - loss: 0.6176 - accuracy: 0.6883 - val_loss: 0.4965 - val_accuracy: 0.7620
Epoch 2/50
113/113 - 7s - loss: 0.2285 - accuracy: 0.9098 - val_loss: 0.2327 - val_accuracy: 0.9112
Epoch 3/50
113/113 - 6s - loss: 0.0985 - accuracy: 0.9692 - val_loss: 0.4127 - val_accuracy: 0.8396
Epoch 4/50
113/113 - 6s - loss: 0.0255 - accuracy: 0.9959 - val_loss: 0.4809 - val_accuracy: 0.8476
Epoch 5/50
113/113 - 6s - loss: 0.0061 - accuracy: 0.9996 - val_loss: 0.7461 - val_accuracy: 0.8008
Epoch 6/50
113/113 - 6s - loss: 0.0025 - accuracy: 0.9999 - val_loss: 0.6476 - val_accuracy: 0.8368
Epoch 7/50
113/113 - 6s - loss: 0.0012 - accuracy: 0.9999 - val_loss: 0.6720 - val_accuracy: 0.8396


<tensorflow.python.keras.callbacks.History at 0x7ff58dc47f28>

In [13]:
model.evaluate(x_test_pad,y_test)



[0.3102048635482788, 0.8682399988174438]

In [15]:
model.layers

[<tensorflow.python.keras.layers.embeddings.Embedding at 0x7ff58e192f60>,
 <tensorflow.python.keras.layers.core.Flatten at 0x7ff58e19b908>,
 <tensorflow.python.keras.layers.core.Dense at 0x7ff58e19bd68>,
 <tensorflow.python.keras.layers.core.Dropout at 0x7ff58e19b3c8>,
 <tensorflow.python.keras.layers.core.Dense at 0x7ff58e1906d8>]

In [20]:
#get_weights / set_weights
embedding = [Embedding(INPUT_DIM,OUTPUT_DIM,mask_zero=True)]
w = model.layers[0].get_weights()
partial = Sequential(embedding)
partial.layers[0].set_weights(w)
partial.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 128)         384128    
Total params: 384,128
Trainable params: 384,128
Non-trainable params: 0
_________________________________________________________________


In [23]:
result = partial.predict([tok.word_index["bill"]])
# result.shape
print("bill",result[0][0])

bill [ 0.01130736  0.06393312  0.0458255  -0.03482448  0.04902972 -0.07412259
  0.00630185 -0.02319884 -0.01342133  0.01104096  0.03637387 -0.01766019
  0.04585786  0.00265338 -0.03859228  0.02406016  0.00619754  0.01956462
  0.01176043  0.01289857 -0.01995924  0.00321578 -0.03706566 -0.01600274
 -0.01385859  0.02033044  0.03973928 -0.01430763 -0.04980838  0.01309306
  0.02556567 -0.06022691  0.0227276   0.02144845  0.03711756  0.03645701
  0.0325487  -0.00391685  0.03861398 -0.00175148  0.0461261   0.01741857
  0.04588004  0.05636195  0.02605402 -0.02722621  0.04103832  0.0087849
  0.03956901 -0.01034862  0.06194917  0.01646869  0.02881394 -0.00501989
 -0.05506978 -0.02977407  0.0016124   0.0024848   0.00466052  0.03440768
 -0.00372429  0.01440254  0.00340051  0.01062447  0.00324282  0.02043873
  0.01522046 -0.04907376  0.0040732   0.02806294  0.03177455  0.03005659
  0.03660202  0.03306181  0.04891566  0.05320244 -0.01877322  0.00253434
  0.00543627  0.08187457  0.01734823  0.006911 