# Assignment 3 - Part I

In this part, we are to perform preprocessing on text data.

This requires you to:

1. Complete the implementation of `helper.py`.
2. Use the unit tests below to verify the functional correctness of your implementation.

In [19]:
"🔒"
import torch
import pandas as pd
import helper
from importlib import reload
import sys, os
import numpy as np

# Load the dataset into DataFrame

In [20]:
"🔒"
import gzip
# Read the gzip-compressed label file. The file is opened in binary mode,
# so every line is decoded as UTF-8 and stripped of surrounding whitespace.
with gzip.open(f"./IMDB_Review/labels.txt.gz", 'r') as f:
    lines = [x.decode('utf8').strip() for x in f.readlines()]
    labels = pd.Series(lines)
    
# Same procedure for the review texts (one entry per line of the file).
with gzip.open(f'./IMDB_Review/reviews.txt.gz', 'r') as f:
    lines = [x.decode('utf8').strip() for x in f.readlines()]
    reviews = pd.Series(lines)
    
# Combine both series into a single frame with "review" and "label" columns,
# then preview the first rows.
data_df = pd.DataFrame({"review": reviews, "label": labels})
data_df.head()

Unnamed: 0,review,label
0,bromwell high is a cartoon comedy . it ran at ...,positive
1,story of a man who has unnatural feelings for ...,negative
2,homelessness or houselessness as george carli...,positive
3,airport starts as a brand new luxury pla...,negative
4,brilliant over acting by lesley ann warren . ...,positive


## Tokenizer

In [21]:
"🔒"
# @check
# @title: check tokenizer

# Re-import helper so the latest edits to helper.py are picked up.
reload(helper)
# Expected result (shown below): lowercased tokens, with the trailing
# period emitted as its own token.
helper.tokenize("CSCI 4050U is an introduction to machine learning.")

['csci', '4050u', 'is', 'an', 'introduction', 'to', 'machine', 'learning', '.']

## Iterate over reviews

In [22]:
"🔒"
# @check
# @title: check token iterator

# Re-import helper so the latest edits to helper.py are picked up.
reload(helper)
# iter_review_tokens is consumed with next(), one token list per call.
iterator = helper.iter_review_tokens(data_df)
# Print the first ten tokens of the first two reviews.
print(next(iterator)[:10])
print(next(iterator)[:10])

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at']
['story', 'of', 'a', 'man', 'who', 'has', 'unnatural', 'feelings', 'for', 'a']


## Build a vocabulary

In [23]:
"🔒"
# @check
# @title: load the vocabulary

# Re-import helper so the latest edits to helper.py are picked up.
reload(helper)
# Build the vocabulary from the reviews, requesting at most 2000 entries.
vocab = helper.get_vocabulary(data_df, max_tokens=2000)
# Use the '<unk>' index as the default for lookups
# (presumably applied to tokens missing from the vocabulary — confirm
# against the Vocab implementation returned by helper.py).
vocab.set_default_index(vocab['<unk>'])
vocab

Vocab()

In [24]:
"🔒"
# @check
# @title: check vocabulary length

# Expected to equal the max_tokens value passed to get_vocabulary (2000).
len(vocab)

2000

In [25]:
"🔒"
# @check
# @title: verify the first ten tokens in the vocabulary

# Tokens stored at indices 0-9; the expected output starts with the special
# tokens '<pad>', '<unk>' and '<start>'.
vocab.lookup_tokens(range(10))

['<pad>', '<unk>', '<start>', 'the', '.', 'and', 'a', 'of', 'to', 'is']

In [26]:
"🔒"
# @check
# @title: verify the encodings by the vocabulary

# 'hello' and 'blah' are expected to map to index 1, the position of '<unk>'
# in the vocabulary; the other words map to their own indices.
vocab.lookup_indices(['hello', 'world', 'good', 'movie', 'blah'])

[1, 179, 53, 21, 1]

## Build dataset

In [27]:
"🔒"
# convert data frame to a PyTorch dataset

# Re-import helper so the latest edits to helper.py are picked up.
reload(helper)
# Build the dataset from the frame and vocabulary; max_length=200 is passed
# through to the helper (presumably the per-review token cap — confirm in
# helper.py).
dataset = helper.get_review_dataset(data_df, vocab, max_length=200)
# The displayed type is expected to be a TensorDataset.
type(dataset)

torch.utils.data.dataset.TensorDataset

In [28]:
"🔒"
# @check
# @title: check dataset

# Display the first example; the expected output is a tuple whose first
# element is a tensor of token indices.
dataset[0]

(tensor([   1,  311,    9,    6, 1053,  210,    4,   11,    1,   35,    3,  174,
           60,   18,   52,   84,    1,   47,  385,  113,  143,   18,    1,    4,
           63,  157,   12,    3,    1,    1,  478,   74,    8,  263,   15,    1,
          311,   16, 1985,    9,   77,    1,    8,  616,   76,    9,    1,    4,
            3,    1,    8, 1994,    1,    3,    1, 1507,   39,   54,   69,  207,
          148,   70, 1205,    1,    1,    3,    1,    7,    3,  224,  886,   34,
            1,   74,    7,    3,    1,   13,  690,    5,   70, 1507,    4,   57,
           13,  219,    3,  386,   12,   65,    6, 1411,    1,  787,    8,    1,
          183,    3,  385,   13, 1215,    1,    4,    4,    4,    4,    4,    4,
            4,    4,    4,   35,    4,    4,    4,    4,    4,    4,    4,    4,
            4,    4,  311,    4,    6,  352,  344,    1,   13,  146,  130,    8,
            1,   33,    7,  132,    1,    4, 1411,    1,    8,    1,  311,    4,
           13,  531,   15,  

## Save dataset to file

In [29]:
"🔒"
#
# saving the data file
#
# NOTE: torch.save writes PyTorch's own serialization format; the '.npz'
# extension is only a file name here and the result is not a NumPy archive.
#
torch.save(dataset, './dataset.npz')

In [30]:
"🔒"
# @check
# @title: check npz file size.

# Size in bytes divided by 2**20 (i.e. mebibytes), rounded to the nearest integer.
# os.lstat does not follow symlinks, so a symlinked file would report the
# size of the link itself.
size = round(os.lstat('./dataset.npz').st_size / (2 ** 20))
print(f"File size = {size} MB")

File size = 38 MB


In [31]:
# Reload the dataset from the file written by the earlier torch.save call.
# NOTE(review): torch.load unpickles Python objects, so only load files you trust.
# NOTE(review): on PyTorch releases where weights_only defaults to True, this
# call may need weights_only=False to restore a TensorDataset — confirm
# against the installed version.
dataset = torch.load('./dataset.npz')

In [32]:
# NOTE: despite the "Training dataset" label, this prints the size of the
# full dataset, before any train/test split.
print("Training dataset: %d" % len(dataset))

Training dataset: 25000


In [33]:
# Create a train/test split; the test set holds roughly 20% of the data.
# The test size is derived as the remainder so that the two lengths always
# sum to len(dataset). Truncating 80% and 20% independently can drop an
# example for sizes not divisible by 5, which makes random_split raise
# ValueError.
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [34]:
# Report the number of examples in each split, one per line.
print(
    "Training dataset: %d" % len(train_dataset),
    "Test dataset: %d" % len(test_dataset),
    sep="\n",
)

Training dataset: 20000
Test dataset: 5000


In [35]:
# Persist both splits with torch.save (PyTorch serialization, despite the
# '.npz' extension).
# NOTE(review): random_split returns Subset views per the PyTorch docs, so
# each file likely serializes the full underlying dataset plus an index
# list — confirm the file sizes if disk usage matters.
torch.save(train_dataset, './train_dataset_extended.npz')
torch.save(test_dataset, './test_dataset_extended.npz')