In [1]:
import tarfile
import os
# Unpack the IMDB archive into ./imdb once; the step is skipped whenever the
# "imdb" directory already exists.
if not os.path.exists("imdb"):
    # The context manager closes the archive even if extraction raises,
    # so the file handle is never leaked.
    with tarfile.open("aclImdb_v1.tar.gz") as tar:
        tar.extractall("imdb")

In [2]:
import glob
import pandas as pd

# Build the training frame: every *.txt file under pos/ and neg/ becomes one
# row holding the file's text and a 1 (positive) / 0 (negative) label.
base = "imdb/aclImdb/train/"

# glob returns paths in a filesystem-dependent order; sorting makes the row
# order of the resulting frame reproducible across machines and runs.
pos = sorted(glob.glob(base + "pos/*.txt"))
postxt = []
for p in pos:
    with open(p, "r", encoding="utf-8") as f:
        postxt.append(f.read())

neg = sorted(glob.glob(base + "neg/*.txt"))
negtxt = []
for n in neg:
    with open(n, "r", encoding="utf-8") as f:
        negtxt.append(f.read())

# Positive reviews come first (label 1), followed by negative ones (label 0).
train = pd.DataFrame({
    "content": postxt + negtxt,
    "sentiment": [1] * len(postxt) + [0] * len(negtxt),
})
train

Unnamed: 0,content,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


In [3]:

# Build the test frame: every *.txt file under pos/ and neg/ becomes one row
# holding the file's text and a 1 (positive) / 0 (negative) label.
base = "imdb/aclImdb/test/"

# glob returns paths in a filesystem-dependent order; sorting makes the row
# order of the resulting frame reproducible across machines and runs.
pos = sorted(glob.glob(base + "pos/*.txt"))
postxt = []
for p in pos:
    with open(p, "r", encoding="utf-8") as f:
        postxt.append(f.read())

neg = sorted(glob.glob(base + "neg/*.txt"))
negtxt = []
for n in neg:
    with open(n, "r", encoding="utf-8") as f:
        negtxt.append(f.read())

# Positive reviews come first (label 1), followed by negative ones (label 0).
test = pd.DataFrame({
    "content": postxt + negtxt,
    "sentiment": [1] * len(postxt) + [0] * len(negtxt),
})
test

Unnamed: 0,content,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
24995,I occasionally let my kids watch this garbage ...,0
24996,When all we have anymore is pretty much realit...,0
24997,The basic genre is a thriller intercut with an...,0
24998,Four things intrigued me as to this film - fir...,0


In [6]:
# Step 1. Scan the training reviews and collect every word that occurs in them.
from keras.preprocessing.text import Tokenizer

# Passed as num_words: limits the vocabulary the tokenizer uses when texts
# are later converted to integer sequences.
vocab_limit = 2000
tok = Tokenizer(num_words=vocab_limit)
tok.fit_on_texts(train["content"])

In [7]:
# To inspect the number each word was mapped to:
# tok.word_index
# To list the words sorted by how often they occur:
# sorted(tok.word_counts.items(), key=lambda x:x[1], reverse=True)

In [8]:
# Step 2. Using the dictionary the tokenizer just built, replace every word
# with its integer id.
# !!! Words outside the 2000-word limit are dropped.
x_train_num, x_test_num = (
    tok.texts_to_sequences(frame["content"]) for frame in (train, test)
)
pd.DataFrame(x_train_num)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1701,1702,1703,1704,1705,1706,1707,1708,1709,1710
0,309,6,3,1069,209,9,30,1,169.0,55.0,...,,,,,,,,,,
1,39,14,739,44,74,32,1829,15,150.0,18.0,...,,,,,,,,,,
2,526,117,113,31,1957,115,902,758,10.0,25.0,...,,,,,,,,,,
3,11,6,711,1,88,19,1,249,91.0,9.0,...,,,,,,,,,,
4,11,6,21,1,797,19,9,13,73.0,326.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,946,1,127,4,1,17,10,417,9.0,13.0,...,,,,,,,,,,
24996,11,6,1,240,4,17,12,58,1496.0,10.0,...,,,,,,,,,,
24997,10,216,233,311,30,1,19,1411,2.0,9.0,...,,,,,,,,,,
24998,46,105,12,22,1259,53,15,3,468.0,43.0,...,,,,,,,,,,


In [12]:
# (optional) Look up which word a given integer id stands for.
# word_index maps word -> id; pairing its values with its keys inverts it.
reverse_index = dict(zip(tok.word_index.values(), tok.word_index.keys()))
reverse_index[6]

'is'

In [14]:
# Step 3. Trim or pad every review so that all of them have the same length.
# Too long: tokens are cut from the front, keeping the last `maxlen` ones.
# Too short: zeros are added at the front.
from keras.preprocessing.sequence import pad_sequences

# padding/truncating are spelled out here; "pre" is also the library default.
x_train_num_pad, x_test_num_pad = (
    pad_sequences(seqs, maxlen=200, padding="pre", truncating="pre")
    for seqs in (x_train_num, x_test_num)
)
pd.DataFrame(x_train_num_pad)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0,0,0,0,0,0,0,0,0,0,...,101,12,309,6,227,48,3,12,9,215
1,1,1983,15,501,206,1,45,26,67,78,...,7,39,276,11,19,77,22,5,335,405
2,0,0,0,0,0,0,0,0,0,0,...,125,254,55,10,64,9,60,6,176,396
3,0,0,0,0,0,0,0,0,0,0,...,3,164,2,561,21,35,73,14,3,482
4,0,0,0,0,0,0,0,0,0,0,...,8,1,17,18,2,196,253,65,528,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,156,18,10,112,194,41,9,769,3,5,...,845,18,9,13,250,5,103,3,411,76
24996,0,0,0,0,0,0,0,0,0,0,...,302,103,3,539,507,41,1568,37,4,1
24997,806,11,17,43,58,1172,900,1227,5,1,...,54,10,40,423,76,80,11,17,96,75
24998,0,46,105,12,22,1259,53,15,3,468,...,18,1138,278,2,131,105,8,260,1195,794
