### 作業目的: 熟練Pytorch Dataset與DataLoader進行資料讀取

本次作業主要會使用[IMDB](http://ai.stanford.edu/~amaas/data/sentiment/)資料集，利用PyTorch的Dataset與DataLoader進行
客製化資料讀取。
下載後的資料有分成train與test，因為這份作業目的在讀取資料，所以我們取用train部分來進行練習。
(請同學先行至IMDB下載資料)

### 載入套件

In [1]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') # download the NLTK stopword corpus
nltk.download('punkt') # download the corpus that word_tokenize needs

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aband\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aband\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# English stopword list from NLTK; the bare name on the last line makes the
# notebook display the list as the cell output.
nltk_stopwords = nltk.corpus.stopwords.words('english')
nltk_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

### 探索資料與資料前處理
在train資料中，有分成pos(positive)與neg(negative)，分別為正評價與負評價，此評價即為label。

In [33]:
# Load the vocabulary file: it lists every word that appears in the reviews.
###<your code>###
with open('./aclImdb/imdb.vocab', 'r', encoding='utf-8') as f:
    vocab = f.read().split()   # whitespace-separated list of words

# Remove NLTK stopwords: they carry little useful signal and may hurt training.
print(f"vocab length before removing stopwords: {len(vocab)}")
###<your code>###
# A set gives O(1) membership tests, so the vocabulary is filtered in a single
# pass. Word order is preserved and every occurrence of a stopword is dropped.
stopword_set = set(nltk_stopwords)
vocab = [word for word in vocab if word not in stopword_set]
print(f"vocab length after removing stopwords: {len(vocab)}")
# Convert the vocabulary into a dictionary mapping word -> position in `vocab`.
### <your code>###
vocab_dic = {word: index for index, word in enumerate(vocab)}

vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89356


In [27]:
import os
# List the entries of the positive-review training directory; the result is
# shown as the cell output.
os.listdir('./aclImdb/train/pos')

['0_9.txt',
 '10000_8.txt',
 '10001_10.txt',
 '10002_7.txt',
 '10003_8.txt',
 '10004_8.txt',
 '10005_7.txt',
 '10006_7.txt',
 '10007_7.txt',
 '10008_7.txt',
 '10009_9.txt',
 '1000_8.txt',
 '10010_7.txt',
 '10011_9.txt',
 '10012_8.txt',
 '10013_7.txt',
 '10014_8.txt',
 '10015_8.txt',
 '10016_8.txt',
 '10017_9.txt',
 '10018_8.txt',
 '10019_8.txt',
 '1001_8.txt',
 '10020_8.txt',
 '10021_8.txt',
 '10022_7.txt',
 '10023_9.txt',
 '10024_9.txt',
 '10025_9.txt',
 '10026_7.txt',
 '10027_7.txt',
 '10028_10.txt',
 '10029_10.txt',
 '1002_7.txt',
 '10030_10.txt',
 '10031_10.txt',
 '10032_10.txt',
 '10033_10.txt',
 '10034_8.txt',
 '10035_9.txt',
 '10036_8.txt',
 '10037_9.txt',
 '10038_10.txt',
 '10039_10.txt',
 '1003_10.txt',
 '10040_10.txt',
 '10041_10.txt',
 '10042_10.txt',
 '10043_10.txt',
 '10044_9.txt',
 '10045_10.txt',
 '10046_9.txt',
 '10047_10.txt',
 '10048_10.txt',
 '10049_8.txt',
 '1004_7.txt',
 '10050_10.txt',
 '10051_10.txt',
 '10052_10.txt',
 '10053_8.txt',
 '10054_10.txt',
 '10055_7.tx

In [38]:
# Pack the data into (x, y) pairs, where x is the path of a review file and
# y is the label: 1 for a positive review, 0 for a negative one.
# x is kept as a path so that the data is not all read into memory at once.
# If memory allows (the files are not very large), reading everything up front
# would cut I/O time during training and speed it up.

###<your code>###
import os

path_url_pos = './aclImdb/train/pos'
path_url_neg = './aclImdb/train/neg'

# Positive directory first, then negative, each in os.listdir order.
review_pairs = [
    (directory + '/' + file_name, label)
    for directory, label in ((path_url_pos, 1), (path_url_neg, 0))
    for file_name in os.listdir(directory)
]


print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('./aclImdb/train/pos/0_9.txt', 1), ('./aclImdb/train/pos/10000_8.txt', 1)]
Total reviews: 25000


### 建立Dataset與DataLoader讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量BoW的函式
(generate_bow)

In [39]:
# Lazily filled cache so the NLTK stopword corpus is read once rather than
# once per review.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def load_review(review_path):
    """Read one review file and return its cleaned tokens.

    Parameters
    ----------
    review_path: str
        path of the review text file

    Returns
    -------
    list of str
        lowercase alphabetic tokens in their original order, with English
        stopwords removed; repeated words are kept so that a bag-of-words
        built from the result reflects term frequency
    """
    ###<your code>###
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between operating systems.
    with open(review_path, encoding='utf-8') as f:
        review = f.read()
    # Remove non-alphabet symbols, tokenize, and drop stopwords.
    ###<your code>###
    # Lowercase first: the NLTK stopword list is all lowercase, so "The" would
    # otherwise slip through the filter.
    review = re.sub('[^a-zA-Z]', ' ', review).lower()
    tokens = nltk.word_tokenize(review)
    stop_words = _english_stopwords()

    return [token for token in tokens if token not in stop_words]

In [40]:
def generate_bow(review, vocab_dic):
    """Build a bag-of-words count vector for one tokenized review.

    Parameters
    ----------
    review: iterable of str
        tokens of the review
    vocab_dic: dict
        mapping from word to its index in the output vector

    Returns
    -------
    numpy.ndarray
        float vector of length ``len(vocab_dic)``; position ``vocab_dic[w]``
        holds how many times ``w`` occurs in ``review``. Tokens that are not
        in ``vocab_dic`` are ignored.
    """
    bag_vector = np.zeros(len(vocab_dic))
    for word in review:
        index = vocab_dic.get(word)
        # Compare against None rather than testing truthiness: index 0 is a
        # valid position but is falsy, so a truthiness test would never count
        # the first vocabulary word.
        if index is not None:
            bag_vector[index] += 1

    return bag_vector

In [41]:
class dataset(Dataset):
    '''Custom dataset that loads a review and its label on demand.

    Parameters
    ----------
    data_dirs: list
        (review file path, label) pairs, one per sample
    vocab: dict
        mapping from word to bag-of-words index, passed to generate_bow
    '''
    def __init__(self, data_dirs, vocab):
        self.vocab = vocab
        self.data_dirs = data_dirs

    def __len__(self):
        '''Number of (path, label) pairs held by the dataset.'''
        return len(self.data_dirs)

    def __getitem__(self, idx):
        '''Return (bag-of-words vector, label) for the sample at ``idx``.'''
        entry = self.data_dirs[idx]
        # The file is read and vectorized only when the sample is requested.
        tokens = load_review(entry[0])
        bow_vector = generate_bow(tokens, self.vocab)

        return bow_vector, entry[1]

In [42]:
# Build the custom dataset from the (path, label) pairs and the vocabulary dict
###<your code>###
custom_dst = dataset(review_pairs, vocab_dic)


# Index one sample to check __getitem__; the notebook displays the result
custom_dst[10]

(array([0., 1., 1., ..., 0., 0., 0.]), 1)

In [43]:
# Build the DataLoader: shuffled batches of 4 samples
###<your code>###
custom_dataloader = DataLoader(dataset=custom_dst, batch_size=4, shuffle=True)
# Pull a single batch to inspect it; the notebook displays the result
next(iter(custom_dataloader))

[tensor([[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),
 tensor([0, 0, 0, 1])]