In [0]:
%tensorflow_version 2.x

In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [0]:
import os
import glob
import pandas as pd

def get_data(n):
    train_dn = os.path.join(dn, n)
    contents = []
    sentiment = []
    pos_fn = os.path.join(train_dn, "pos", "*.txt")
    for fn in glob.glob(pos_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(1)

    neg_fn = os.path.join(train_dn, "neg", "*.txt")
    for fn in glob.glob(neg_fn):
        with open(fn, "r", encoding="utf-8") as f:
            contents.append(f.read())
            sentiment.append(0)

    df = pd.DataFrame({
        "content":contents,
        "sentiment":sentiment
    }, columns=["content", "sentiment"])
    return df

In [16]:
dn = os.path.dirname(dataset)
dn = os.path.join(dn, "aclImdb")

train_df = get_data("train")
test_df = get_data("test")
test_df

Unnamed: 0,content,sentiment
0,Fair and nifty little science fiction/horror f...,1
1,I think Gerard's comments on the doc hit the n...,1
2,This is one of the best Czech movies I have ev...,1
3,"This film, for what it was set out to be, succ...",1
4,"Excellent performance by Mary KAy Place, Steve...",1
...,...,...
24995,First I have to say that I have read everythin...,0
24996,"""The Duke"" is a film based in the heart of the...",0
24997,So we're supposed to find it funny that this w...,0
24998,This show had a promising start as sort of the...,0


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer(num_words=3000)
# 這步像是以前的CountVectorizer的fit(找出多少種)
tok.fit_on_texts(train_df["content"])

In [0]:
# 這是我好習慣, 先把正向轉換和反向轉換準備好
index_2_word = tok.index_word
word_2_index = {v:k for k, v in tok.index_word.items()}

In [26]:
# 真的轉換成數字, transform
# 這些數字都會在(3000)的精選單詞內
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1776,1777,1778,1779,1780,1781,1782,1783,1784,1785,1786,1787,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804,1805,1806,1807,1808,1809,1810,1811,1812,1813,1814,1815
0,1,6,3,84,17,10,216,9,337.0,8.0,28.0,311.0,41.0,1.0,2419.0,8.0,1.0,519.0,258.0,1.0,422.0,59.0,1461.0,894.0,1.0,83.0,454.0,150.0,4.0,1.0,23.0,21.0,63.0,671.0,8.0,11.0,19.0,7.0,7.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,32,155,1,646,4,484,2,113,8.0,11.0,2425.0,17.0,6.0,2767.0,4.0,1.0,52.0,115.0,4.0,151.0,359.0,1280.0,2.0,23.0,52.0,49.0,292.0,344.0,15.0,344.0,2.0,190.0,15.0,190.0,9.0,6.0,28.0,4.0,1.0,115.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,11,6,3,726,209,118,2052,253,1664.0,31.0,2338.0,2.0,24.0,294.0,1011.0,5.0,24.0,2373.0,2.0,3.0,1016.0,1751.0,1.0,790.0,131.0,675.0,552.0,6.0,31.0,3.0,1301.0,693.0,174.0,4.0,70.0,106.0,153.0,11.0,17.0,6.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,764,13,128,157,675,320,34,90,1.0,36.0,99.0,5.0,695.0,16.0,7.0,7.0,24.0,284.0,431.0,835.0,3.0,2125.0,5.0,1.0,295.0,4.0,150.0,142.0,7.0,7.0,26.0,59.0,891.0,1.0,120.0,16.0,32.0,633.0,72.0,59.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,90,8,2706,6,32,318,17,12,86.0,104.0,25.0,73.0,8.0,1138.0,18.0,23.0,21.0,57.0,1883.0,4.0,1.0,2387.0,254.0,921.0,44.0,20.0,28.0,157.0,624.0,906.0,126.0,520.0,2.0,2070.0,22.0,5.0,64.0,86.0,1.0,2997.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,803,516,9,3,297,60,6,1,672.0,10.0,199.0,5.0,99.0,20.0,60.0,10.0,1132.0,43.0,39.0,806.0,2353.0,8.0,11.0,418.0,10.0,1578.0,2353.0,155.0,232.0,36.0,1.0,127.0,63.0,63.0,1095.0,2.0,21.0,2914.0,30.0,29.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,58,365,2,10,1608,12,17,233,311.0,2.0,72.0,66.0,28.0,4.0,1.0,830.0,916.0,8.0,1.0,17.0,6.0,21.0,420.0,5.0,27.0,160.0,30.0,29.0,18.0,9.0,6.0,40.0,35.0,643.0,2.0,9.0,1503.0,98.0,1878.0,1688.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,10,553,35,73,41,11,17,86,9.0,13.0,3.0,84.0,1176.0,2.0,28.0,4.0,145.0,399.0,1353.0,99.0,12.0,781.0,251.0,448.0,4.0,88.0,1176.0,99.0,40.0,66.0,5.0,64.0,70.0,10.0,1608.0,9.0,2.0,10.0,25.0,5.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,1165,908,55,651,841,22,680,9,328.0,149.0,566.0,1.0,948.0,276.0,91.0,40.0,69.0,276.0,10.0,40.0,89.0,37.0,2113.0,1287.0,14.0,10.0,194.0,731.0,2.0,79.0,2065.0,42.0,239.0,50.0,1911.0,2450.0,71.0,29.0,43.0,209.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
