In [1]:
import tensorflow as tf

dataset = tf.keras.utils.get_file(
    fname="aclImdb.tar.gz", 
    origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
    extract=True,
)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [0]:
import os
import glob
import pandas as pd

def read(fp):
    with open(fp, "r", encoding="utf-8") as f:
        content = f.read()
    return content

def read_data(base):
    pos = glob.glob(os.path.join(base, "pos", "*"))
    neg = glob.glob(os.path.join(base, "neg", "*"))
    df = pd.DataFrame({
        "path":(neg + pos),
        "target":([0]*len(neg) + [1] * len(pos))
    })
    df["content"] = df["path"].apply(read)
    return df

In [17]:
dirname = os.path.dirname(dataset)
base = os.path.join(dirname, "aclImdb", "train")
train_df = read_data(base)
base = os.path.join(dirname, "aclImdb", "test")
test_df = read_data(base)
test_df

Unnamed: 0,path,target,content
0,/root/.keras/datasets/aclImdb/test/neg/9801_2.txt,0,"In the 1940s, Veronica Lake made a meteoric ri..."
1,/root/.keras/datasets/aclImdb/test/neg/6813_2.txt,0,After reading the reviews I decided to rent th...
2,/root/.keras/datasets/aclImdb/test/neg/3265_1.txt,0,This movie is so God-awful that it was literal...
3,/root/.keras/datasets/aclImdb/test/neg/7386_4.txt,0,Even though this is the first film by the brok...
4,/root/.keras/datasets/aclImdb/test/neg/3918_1.txt,0,I must admit that this is one of the worst mov...
...,...,...,...
24995,/root/.keras/datasets/aclImdb/test/pos/6384_10...,1,I saw this movie at a screener and its the bes...
24996,/root/.keras/datasets/aclImdb/test/pos/9051_8.txt,1,This 60min film shows just how much fun filmma...
24997,/root/.keras/datasets/aclImdb/test/pos/3995_9.txt,1,It is not surprising that this film was made b...
24998,/root/.keras/datasets/aclImdb/test/pos/12320_1...,1,"The last film of John Huston, the great Americ..."


In [0]:
# 預處理1. 先把文字化成數字
from tensorflow.keras.preprocessing.text import Tokenizer
# 出現太少的詞, 你可以選擇不看, 只留出現次數最高的2000(num_words)
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(train_df["content"])

In [0]:
# 想要看每個單詞被給的編號: tok.word_index

In [23]:
x_train_seq = tok.texts_to_sequences(train_df["content"])
x_test_seq = tok.texts_to_sequences(test_df["content"])
pd.DataFrame(x_train_seq)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1671,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682,1683,1684,1685,1686,1687,1688,1689,1690,1691,1692,1693,1694,1695,1696,1697,1698,1699,1700,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,10,329,1,664,15,1,83,55,142.0,8.0,9.0,13.0,12.0,55.0,12.0,10.0,216.0,1.0,307.0,16.0,2.0,9.0,13.0,32.0,318.0,307.0,2.0,52.0,73.0,37.0,1.0,271.0,150.0,300.0,10.0,520.0,20.0,11.0,307.0,2.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1260,66,3,49,323,1983,11,17,26.0,40.0,158.0,9.0,47.0,23.0,659.0,4.0,1.0,944.0,264.0,12.0,137.0,20.0,1433.0,2.0,239.0,5.0,1.0,593.0,617.0,55.0,79.0,30.0,11.0,392.0,47.0,23.0,61.0,238.0,1534.0,6.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,39,325,8,628,13,3,1383,10,13.0,576.0,12.0,9.0,13.0,3.0,429.0,4.0,857.0,307.0,4.0,1.0,835.0,58.0,1383.0,1.0,19.0,13.0,342.0,906.0,39.0,1.0,812.0,158.0,55.0,70.0,16.0,1.0,19.0,157.0,151.0,6.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,8,48,465,37,1,204,66,5,8.0,3.0,193.0,55.0,1537.0,1542.0,32.0,1853.0,5.0,27.0,249.0,21.0,24.0,641.0,60.0,13.0,324.0,18.0,24.0,106.0,13.0,40.0,1230.0,291.0,5.0,14.0,26.0,184.0,14.0,72.0,12.0,1.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,35,9,514,16,3,304,152,310,8.0,1.0,701.0,22.0,25.0,3.0,601.0,4.0,81.0,34.0,76.0,1800.0,5.0,213.0,5.0,11.0,310.0,2.0,21.0,1235.0,1.0,207.0,1.0,30.0,311.0,15.0,54.0,1730.0,279.0,1.0,81.0,459.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,51,11,17,35,70,1529,12,33,846.0,23.0,1741.0,7.0,7.0,259.0,168.0,408.0,9.0,13.0,240.0,4.0,913.0,5.0,64.0,11.0,17.0,54.0,8.0,3.0,1098.0,1860.0,3.0,461.0,4.0,1923.0,1410.0,3.0,1854.0,1921.0,3.0,168.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24996,721,1,318,455,362,113,1,724,450.0,1835.0,32.0,746.0,8.0,1.0,226.0,7.0,7.0,829.0,29.0,1.0,93.0,8.0,657.0,15.0,1.0,541.0,5.0,1628.0,5.0,16.0,82.0,8.0,541.0,1628.0,8.0,657.0,5.0,1479.0,620.0,1628.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24997,3,193,55,594,8,3,227,227,242.0,47.0,13.0,3.0,427.0,34.0,13.0,61.0,104.0,150.0,152.0,51.0,1.0,201.0,320.0,1648.0,19.0,13.0,623.0,26.0,149.0,374.0,83.0,316.0,1.0,17.0,18.0,26.0,79.0,149.0,374.0,110.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
24998,40,156,24,84,807,259,9,6,515.0,523.0,2.0,31.0,1452.0,6.0,365.0,4.0,1.0,1379.0,2.0,931.0,798.0,4.0,782.0,105.0,4.0,1.0,519.0,14.0,70.0,14.0,3.0,4.0,1226.0,1729.0,16.0,11.0,164.0,29.0,4.0,24.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train_pad = pad_sequences(x_train_seq, maxlen=256)
x_test_pad = pad_sequences(x_test_seq, maxlen=256)
pd.DataFrame(x_train_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,87,14,3,4,1898,1,169,268,15,9,13,37,146,104,1141,292,11,6,62,41,116,2,1794,18,10,423,64,9,8,11,307,54,142,5,1,307,3,386,55,6
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,28,151,471,37,5,132,6,12,11,17,761,37,291,868,21,49,30,1,344,30,1,127,4,3,193,972,15,1,274,761,12,52,169,93,92,171,40,103,11,15
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,19,157,151,6,12,1,411,13,181,73,75,47,13,52,114,194,273,80,1,628,307,4,11,19,2,9,61,5,1,202,509,21,3,19,10,59,178,5,64,171
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,982,546,15,65,5,1699,722,1,62,7,7,1,210,4,1,19,2,59,25,22,261,277,359,45,22,22,188,76,43,1240,205,2,195,135,35,108,81,76,8,359
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,9,18,44,126,264,15,3,92,404,199,11,19,3,714,10,13,573,682,51,10,293,11,3,52,191,384,177,58,796,34,211,52,711,185,1095,8,11,19,9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,22,76,1,577,64,9,841,22,78,89,714,9,42,3,277,8,3,581,446,113,6,84,1,812,6,526,1,62,6,603,2,576,3,208,156,18,1707,112,37,11
24996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,4,11,17,14,9,6,8,1822,1,4,110,21,61,91,1806,18,1113,1,437,4,110,29,11,31,1,970,1728,83,5,1,7,7,40,1628,9,829,29,1,93
24997,744,4,1,474,26,884,5,597,24,1589,42,147,8,11,220,86,1,233,202,819,26,884,109,1052,2,8,2,86,11,427,432,15,41,744,150,363,26,185,80,1167,...,8,1,285,1797,10,231,37,143,150,152,171,51,110,13,603,2,1,1123,436,10,66,13,43,86,10,13,167,5,1403,177,3,819,4,7,7,1239,15,1,433,3
24998,7,8,11,93,1,12,28,6,326,3,412,71,3,1589,4,2,12,338,6,32,632,1,88,974,212,5,6,37,3,728,4,16,28,1396,26,406,80,437,3,60,...,36,60,196,23,5,992,102,67,61,1086,140,338,65,6,31,1,33,1,2,12,1068,95,1,12,165,43,20,32,1086,33,67,112,98,437,30,1,127,1617,6,44
