In [1]:
# install nltk (natural language toolkit) if necessary
# !pip install nltk

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torchtext
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import unicodedata
import re
import string
import nltk

from torch.utils.data import Dataset, DataLoader

from tqdm.autonotebook import tqdm

from idlmam import set_seed

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

import pandas as pd

from sklearn.metrics import accuracy_score

import time

from idlmam import LastTimeStep, train_network, Flatten, weight_reset, View, LambdaLayer
from idlmam import AttentionAvg, GeneralScore, DotScore, AdditiveAttentionScore #For attention mechanism use

  from tqdm.autonotebook import tqdm


In [3]:
# Ask cuDNN to pick deterministic algorithms so that GPU runs are repeatable.
torch.backends.cudnn.deterministic=True
# Fix the random seed through the idlmam helper
# (presumably it seeds the RNGs used during training — confirm in idlmam).
set_seed(42)

In [4]:
def get_pytorch_device():
    """Pick the best available PyTorch device.

    The preference order is CUDA, then Apple's MPS backend, then the CPU.

    Returns:
        torch.device: a ``cuda``, ``mps`` or ``cpu`` device.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    # torch.backends.mps is not present in every PyTorch release; look it up
    # defensively so such installs fall through to the CPU instead of
    # raising AttributeError.
    mps_backend = getattr(torch.backends, 'mps', None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device('mps')  # MPS is available on Apple Silicon Macs
    return torch.device('cpu')

device = get_pytorch_device()

# I found the mps gpu to be slower than the CPU, so the CPU is forced here.
# Comment out the next line to use the device detected above instead.
device = torch.device('cpu')

In [5]:
# Request the IMDB train and test splits from torchtext.
train_iter, test_iter = IMDB(split=('train', 'test')) # Note change from book
# Materialize both splits as lists so they can be indexed and iterated more than once;
# the rest of the notebook treats each entry as a (label, text) pair.
train_dataset = list(train_iter)
test_dataset = list(test_iter)

## Preprocess Text and Build Vocabulary

Some everyday things to do with text data:
* remove stop words, such as 'a', 'the', etc., that don't usually change the meaning of the text.
* convert Unicode characters to ASCII
* remove HTML tags
* convert all letters to lowercase
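* maybe remove punctuation (I intended not to, but as written `unicode_to_ascii` below keeps only ASCII letters and spaces, so punctuation and digits are in fact removed)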

In the following few cells we'll show how this can be done.

First we'll download a commonly used list of English stopwords from NLTK:

In [6]:
# Download the NLTK stop-word corpus (the data must be present before the word list is read below)
nltk.download('stopwords')
from nltk.corpus import stopwords
# Keep the English stop words in a set; preprocess_text reads this module-level name
# when filtering tokens.
stop_words = set(stopwords.words('english')) # we're going to be lazy and use this globally

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jbaggett/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Now we'll define two functions: one to convert Unicode to ASCII and one to preprocess the text:

In [7]:
def unicode_to_ascii(s):
    """Strip accents and keep only ASCII letters and the space character.

    The string is NFD-normalized so that accented letters split into a base
    letter plus combining marks; the combining marks are discarded, and so is
    every remaining character that is neither an ASCII letter nor ' '
    (digits, punctuation, tabs, newlines, ...). Dropped characters are
    removed outright, not replaced, so "a-b" becomes "ab".

    Args:
        s: the input string.

    Returns:
        A new string made only of ASCII letters and spaces.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        # 'Mn' marks non-spacing combining characters such as accents.
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch == ' ' or ch in string.ascii_letters:
            kept.append(ch)
    return ''.join(kept)

def preprocess_text(text):
    """Turn a raw review string into a list of cleaned, lower-case tokens.

    Steps, in order:
      1. replace HTML tags with a space,
      2. lower-case the text and pass it through unicode_to_ascii,
      3. replace any run of characters outside ``a-z``, space and ``.,?!``
         with a single space,
      4. split on whitespace,
      5. drop tokens found in the module-level ``stop_words`` set.

    Note: unicode_to_ascii already keeps only ASCII letters and spaces, so in
    practice step 3 acts as a safety net and no punctuation survives to the
    returned tokens.

    Args:
        text: the raw review text.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    without_html = re.sub(r'<[^>]+>', ' ', text)
    ascii_lower = unicode_to_ascii(without_html.lower())
    cleaned = re.sub(r'[^a-z .,?!]+', ' ', ascii_lower)
    return [word for word in cleaned.split() if word not in stop_words]

Here is the first review in the dataset:

In [8]:
# Element [1] of a dataset entry is the review text (element [0] is used as the label elsewhere in this notebook).
first_review = train_dataset[0][1]
# Show the raw length and content so they can be compared with the preprocessed version later.
print(f'The number of characters in the first review is {len(first_review)} \n')
print(f'Here is the first review: \n\n {first_review}')

The number of characters in the first review is 1640 

Here is the first review: 

 I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was con

In [9]:
# Run the first review through preprocess_text and join the surviving tokens
# with single spaces so the result can be compared with the raw review.
first_review_preproc = ' '.join(preprocess_text(first_review))
# NOTE: the label printed below still says "first review", but the count is
# the length of the preprocessed text.
print(f'The number of characters in the first review is {len(first_review_preproc)} \n')
print(f'Here is the first preprocessed review: \n\n {first_review_preproc}')

The number of characters in the first review is 1039 

Here is the first preprocessed review: 

 rented curiousyellow video store controversy surrounded first released also heard first seized us customs ever tried enter country therefore fan films considered controversial really see plot centered around young swedish drama student named lena wants learn everything life particular wants focus attentions making sort documentary average swede thought certain political issues vietnam war race issues united states asking politicians ordinary denizens stockholm opinions politics sex drama teacher classmates married men kills curiousyellow years ago considered pornographic really sex nudity scenes far even shot like cheaply made porno countrymen mind find shocking reality sex nudity major staple swedish cinema even ingmar bergman arguably answer good old boy john ford sex scenes films commend filmmakers fact sex shown film shown artistic purposes rather shock people make money shown pornograp

In [10]:
# build the vocabulary with or without pre-processing
pre_proc = True

from collections import Counter # tallies how often each token occurs
# Imported under an alias so the factory function is not shadowed by the
# `vocab` object created below.
from torchtext.vocab import vocab as make_vocab

# Tokenizers break strings like "this is a string" into lists of tokens like
# ['this', 'is', 'a', 'string']. With pre_proc enabled we use our own
# preprocess_text, which adds additional processing; otherwise we fall back to
# torchtext's default English-style tokenizer.
tokenizer = preprocess_text if pre_proc else get_tokenizer('basic_english')

counter = Counter()
for (label, line) in train_dataset: # loop through the training data
    counter.update(tokenizer(line)) # count every token of this review (we will see some words a lot and others only rarely)

# Create the vocab object, removing any word that didn't occur at least 10
# times, and add special vocab items for unknown, beginning of sentence,
# end of sentence, and padding.
vocab = make_vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))

# Look up any token that is not in the vocabulary as '<unk>'.
vocab.set_default_index(vocab['<unk>'])

In [11]:
def text_transform(x): #string -> list of integers
    """Convert a text string into a list of vocabulary indices.

    The text is split with the module-level ``tokenizer``, each token is
    looked up in the module-level ``vocab`` (which acts like a dictionary and
    handles unknown tokens for us), and the result is wrapped with the
    '<BOS>' and '<EOS>' marker indices.

    Args:
        x: the text to encode.

    Returns:
        list[int]: ``[<BOS>, token ids..., <EOS>]``.
    """
    token_ids = [vocab['<BOS>']]
    token_ids.extend(vocab[token] for token in tokenizer(x))
    token_ids.append(vocab['<EOS>'])
    return token_ids

def label_transform(x):
    """Shift a class label down by one, e.g. labels 1 and 2 become 0 and 1.

    NOTE(review): assumes the dataset's labels are consecutive integers
    starting at 1 — confirm for the dataset in use.

    Args:
        x: the original label.

    Returns:
        The label minus one.
    """
    zero_based = x - 1
    return zero_based

#Transform the first data point's text into a list of token indices (wrapped in the <BOS>/<EOS> markers) and show it
print(text_transform(train_dataset[0][1])) 

[1, 4, 0, 5, 6, 7, 8, 9, 10, 11, 12, 9, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 36, 41, 42, 43, 44, 45, 46, 0, 47, 48, 49, 50, 51, 52, 53, 50, 54, 55, 56, 57, 58, 59, 0, 60, 61, 62, 32, 63, 64, 65, 66, 67, 0, 68, 69, 23, 70, 25, 62, 71, 72, 73, 74, 75, 76, 77, 78, 79, 0, 80, 81, 82, 83, 62, 71, 84, 85, 31, 86, 74, 87, 88, 89, 90, 91, 92, 93, 94, 95, 62, 72, 22, 96, 97, 98, 62, 99, 100, 99, 101, 102, 103, 104, 105, 106, 107, 99, 70, 108, 109, 0, 91, 100, 110, 111, 112, 113, 114, 115, 116, 31, 86, 25, 100, 117, 118, 27, 2]


In [12]:
VOCAB_SIZE = len(vocab) # number of entries in the vocabulary, special tokens included
NUM_CLASS = len(np.unique([z[0] for z in train_dataset])) # number of distinct labels (first element of each training entry)
print("Vocab: ", VOCAB_SIZE)
print("Num Classes: ", NUM_CLASS)

padding_idx = vocab["<PAD>"] # integer index of the padding token

embed_dim = 128 # presumably the embedding dimension for the models defined later — TODO confirm
B = 64 # presumably the batch size — TODO confirm
epochs = 5 # presumably the number of training epochs — TODO confirm

Vocab:  20175
Num Classes:  2
