<a href="https://colab.research.google.com/github/Elwing-Chou/tibaml1017/blob/main/sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [2]:
import os
import glob
import pandas as pd
def getdata(mid):
    dn = os.path.dirname(dataset)
    posfn = glob.glob(os.path.join(dn, "aclImdb", mid, "pos", "*"))
    negfn = glob.glob(os.path.join(dn, "aclImdb", mid, "neg", "*"))
    contents = []
    for fn in posfn + negfn:
        with open(fn, encoding="utf-8") as f:
            contents.append(f.read())
    df = pd.DataFrame({
        "content":contents,
        "sentiment":[1] * len(posfn) + [0] * len(negfn)
    })
    return df
train_df = getdata("train")
test_df = getdata("test")

In [3]:
test_df

Unnamed: 0,content,sentiment
0,Four stories about the drug trade in Europe be...,1
1,Lloyd Hamilton was one of the most imaginative...,1
2,SPOILER WARNING<br /><br />We've all heard the...,1
3,One of the most pleasurable aspects of movie v...,1
4,I would hope so and how can I get involved?<br...,1
...,...,...
24995,"La Ragazza del Vagone Letto, or Terror Express...",0
24996,"When I saw this in the cinema, I remember winc...",0
24997,"Why Lori Petty was cast as tank girl, I'll nev...",0
24998,No emotion. Bad music (and I am a reformed eig...,0


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

layers = [
    # 3001: 情緒化對象(3000種最常用詞匯+padding(0))
    # 100: 情緒個數(100-500)
    # input_length: 一篇文章你要看幾個詞彙(128-512)
    # 一篇文章我會蒐集512詞彙, 這512詞彙是在3000最常用單字裡, 每一個詞彙會被我化作100維度的向量
    # 係數個數: 3001(種詞彙) * 100(個情緒) = 300100
    Embedding(3001, 100, mask_zero=True, input_length=512),
    GlobalAveragePooling1D(),
    Dense(2, activation="softmax")
]
model = Sequential(layers)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 512, 100)          300100    
                                                                 
 global_average_pooling1d_1   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 300,302
Trainable params: 300,302
Non-trainable params: 0
_________________________________________________________________


In [7]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy
model.compile(loss=SparseCategoricalCrossentropy(),
       optimizer="adam",
       metrics=["accuracy"])

In [10]:
# tokenize: 
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
tok.fit_on_texts(train_df["content"])
# 你可以看一下fit的結果
# tok.word_index
# tok.index_word

In [13]:
# 文章token化: sequence
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,8,1,19,523,31,104,115,365,2665.0,2888.0,...,,,,,,,,,,
1,312,1807,755,53,3,369,36,1,308.0,30.0,...,,,,,,,,,,
2,6,3,348,20,459,4,48,386,99.0,382.0,...,,,,,,,,,,
3,1230,215,96,562,3,678,15,1,11.0,19.0,...,,,,,,,,,,
4,10,432,2,216,2,171,635,42,1.0,330.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,440,1947,13,239,1,88,4,29,55.0,35.0,...,,,,,,,,,,
24996,11,6,1085,784,120,41,28,974,1.0,974.0,...,,,,,,,,,,
24997,11,17,66,3,897,672,4,706,297.0,35.0,...,,,,,,,,,,
24998,171,36,1,2592,48,123,572,5,1.0,84.0,...,,,,,,,,,,
