In [None]:
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import  Dataset, DataLoader
from torchvision.transforms import ToTensor
from torchmetrics import ConfusionMatrix
from mlxtend.plotting import plot_confusion_matrix

## Reading data

In [18]:
def load_imdb_data(data_dir):
    """Load the review text files found under ``data_dir`` into a DataFrame.

    The files are read from ``<data_dir>/<split>/<sentiment>/*.txt`` with
    ``split`` in ``('train', 'test')`` and ``sentiment`` in ``('pos', 'neg')``.
    Files whose name does not end in ``.txt`` are ignored.

    Args:
        data_dir: Root folder of the dataset.

    Returns:
        pd.DataFrame with the columns ``review`` (file content), ``sentiment``
        (1 for 'pos', 0 for 'neg') and ``split`` ('train' or 'test'). Rows are
        ordered train/pos, train/neg, test/pos, test/neg and, inside each
        folder, by file name.

    Raises:
        FileNotFoundError: If one of the four folders does not exist
            (propagated from ``os.listdir``).
    """
    data = {'review': [], 'sentiment': [], 'split': []}
    for split in ['train', 'test']:
        for sentiment in ['pos', 'neg']:
            path = os.path.join(data_dir, split, sentiment)
            label = 1 if sentiment == 'pos' else 0
            # os.listdir returns entries in arbitrary order; sorting makes the
            # row order (and everything derived from it) reproducible.
            for fname in sorted(os.listdir(path)):
                if not fname.endswith(".txt"):
                    continue
                with open(os.path.join(path, fname), encoding='utf-8') as f:
                    data['review'].append(f.read())
                data['sentiment'].append(label)
                data['split'].append(split)
    return pd.DataFrame(data)

# Read all data into dataframe
imdb_df = load_imdb_data('aclImdb')

# Save to csv for future usage. A later cell reloads 'imdb_raws.csv', so the
# file must exist after a fresh Restart & Run All; an existing file is kept
# as-is instead of being rewritten on every run.
if not os.path.exists("imdb_raws.csv"):
    imdb_df.to_csv("imdb_raws.csv", index=False)

imdb_df

Unnamed: 0,review,sentiment,split
0,Bromwell High is a cartoon comedy. It ran at t...,1,train
1,Homelessness (or Houselessness as George Carli...,1,train
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,train
3,This is easily the most underrated film inn th...,1,train
4,This is not the typical Mel Brooks film. It wa...,1,train
...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,test
49996,When all we have anymore is pretty much realit...,0,test
49997,The basic genre is a thriller intercut with an...,0,test
49998,Four things intrigued me as to this film - fir...,0,test


In [32]:
# Reload the reviews from the CSV file 'imdb_raws.csv'; this rebinds
# imdb_df, replacing the frame built by load_imdb_data above.
imdb_df = pd.read_csv('imdb_raws.csv')
imdb_df

Unnamed: 0,review,sentiment,split
0,Bromwell High is a cartoon comedy. It ran at t...,1,train
1,Homelessness (or Houselessness as George Carli...,1,train
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,train
3,This is easily the most underrated film inn th...,1,train
4,This is not the typical Mel Brooks film. It wa...,1,train
...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,test
49996,When all we have anymore is pretty much realit...,0,test
49997,The basic genre is a thriller intercut with an...,0,test
49998,Four things intrigued me as to this film - fir...,0,test


In [33]:
# Independent copies of each split, so columns added later do not touch imdb_df
imdb_df_train = imdb_df.loc[imdb_df['split'].eq('train')].copy()
imdb_df_test = imdb_df.loc[imdb_df['split'].eq('test')].copy()
(len(imdb_df_train), len(imdb_df_test))

(25000, 25000)

## Data Preprocessing

In [44]:
stop_words = set(stopwords.words('english'))

def clean_and_tokenize(text, remove_stopwords=False):
  """Turn a raw review string into a list of lowercase alphabetic tokens.

  Steps, in order: replace HTML line-break tags with a space, lowercase,
  replace every non-alphabetic character with a space, tokenize with NLTK's
  ``word_tokenize`` and, optionally, drop the entries of ``stop_words``.

  Args:
    text: Raw review text.
    remove_stopwords: When True, tokens present in the module-level
      ``stop_words`` set are removed from the result.

  Returns:
    List of token strings.
  """
  # Replace HTML line breaks. A pattern is used instead of literal matches so
  # that variants such as <br>, <br/>, <BR /> or </br> are removed too;
  # otherwise the character filter below would leave a stray "br" token.
  text = re.sub(r"</?br\s*/?>", ' ', text, flags=re.IGNORECASE)

  # Lowercase
  text = text.lower()

  # Remove non-alphabetic characters
  text = re.sub(r"[^a-zA-Z]", ' ', text)

  # Tokenize
  tokens = word_tokenize(text)

  # Remove stopwords
  if remove_stopwords:
    tokens = [t for t in tokens if t not in stop_words]

  return tokens

In [45]:
# Tokenize every training review (stopwords removed) into a new 'tokens' column
imdb_df_train['tokens'] = imdb_df_train['review'].apply(clean_and_tokenize, remove_stopwords=True)
imdb_df_train

Unnamed: 0,review,sentiment,split,tokens
0,Bromwell High is a cartoon comedy. It ran at t...,1,train,"[bromwell, high, cartoon, comedy, ran, time, p..."
1,Homelessness (or Houselessness as George Carli...,1,train,"[homelessness, houselessness, george, carlin, ..."
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,train,"[brilliant, acting, lesley, ann, warren, best,..."
3,This is easily the most underrated film inn th...,1,train,"[easily, underrated, film, inn, brooks, cannon..."
4,This is not the typical Mel Brooks film. It wa...,1,train,"[typical, mel, brooks, film, much, less, slaps..."
...,...,...,...,...
24995,"Towards the end of the movie, I felt it was to...",0,train,"[towards, end, movie, felt, technical, felt, l..."
24996,This is the kind of movie that my enemies cont...,0,train,"[kind, movie, enemies, content, watch, time, b..."
24997,I saw 'Descent' last night at the Stockholm Fi...,0,train,"[saw, descent, last, night, stockholm, film, f..."
24998,Some films that you pick up for a pound turn o...,0,train,"[films, pick, pound, turn, rather, good, rd, c..."


In [46]:
# Tokenize every test review (stopwords removed) into a new 'tokens' column
imdb_df_test['tokens'] = imdb_df_test['review'].apply(clean_and_tokenize, remove_stopwords=True)
imdb_df_test

Unnamed: 0,review,sentiment,split,tokens
25000,I went and saw this movie last night after bei...,1,test,"[went, saw, movie, last, night, coaxed, friend..."
25001,Actor turned director Bill Paxton follows up h...,1,test,"[actor, turned, director, bill, paxton, follow..."
25002,As a recreational golfer with some knowledge o...,1,test,"[recreational, golfer, knowledge, sport, histo..."
25003,"I saw this film in a sneak preview, and it is ...",1,test,"[saw, film, sneak, preview, delightful, cinema..."
25004,Bill Paxton has taken the true story of the 19...,1,test,"[bill, paxton, taken, true, story, us, golf, o..."
...,...,...,...,...
49995,I occasionally let my kids watch this garbage ...,0,test,"[occasionally, let, kids, watch, garbage, unde..."
49996,When all we have anymore is pretty much realit...,0,test,"[anymore, pretty, much, reality, tv, shows, pe..."
49997,The basic genre is a thriller intercut with an...,0,test,"[basic, genre, thriller, intercut, uncomfortab..."
49998,Four things intrigued me as to this film - fir...,0,test,"[four, things, intrigued, film, firstly, stars..."


## Vocabulary

In [42]:
class Vocabulary:
  """Two-way mapping between tokens and integer indices.

  Attributes:
    token_to_index: dict mapping each token to its index.
    index_to_token: dict mapping each index back to its token.
    pad_token, pad_index: padding token and its index (-1 while the padding
      token is not part of the vocabulary).
    unk_token, unk_index: unknown-word token and its index (-2 while the
      unknown token is not part of the vocabulary). ``lookup_token`` returns
      ``unk_index`` for any token that is not in the vocabulary.
  """

  def __init__(self, token_to_index=None, pad_token='<pad>', unk_token='<unk>'):
    """Create a vocabulary, optionally from an existing token -> index dict.

    Args:
      token_to_index: Existing mapping to start from; None or an empty dict
        creates an empty vocabulary.
      pad_token: Token used for padding.
      unk_token: Token used for out-of-vocabulary words.

    Raises:
      KeyError: If a non-empty mapping does not contain ``pad_token`` or
        ``unk_token``.
    """
    # A literal ``{}`` default is created once and shared by every instance
    # built without arguments. Copying also keeps the caller's dict untouched
    # when add_token later mutates the mapping.
    self.token_to_index = dict(token_to_index) if token_to_index is not None else {}
    self.index_to_token = {index: token for token, index in self.token_to_index.items()}

    self.pad_token = pad_token
    self.pad_index = -1 if len(self.token_to_index) == 0 else self.token_to_index[self.pad_token]

    self.unk_token = unk_token
    self.unk_index = -2 if len(self.token_to_index) == 0 else self.token_to_index[self.unk_token]

  def __len__(self):
    """Return the number of tokens in the vocabulary."""
    return len(self.token_to_index)

  def add_token(self, token):
    """Add ``token`` if it is new and return its index."""
    if token in self.token_to_index:
      return self.token_to_index[token]

    index = len(self.token_to_index)
    self.token_to_index[token] = index
    self.index_to_token[index] = token

    # Keep the special indices in sync when the special tokens are added to a
    # vocabulary that started empty (they hold the -1 / -2 placeholders then).
    if token == self.pad_token:
      self.pad_index = index
    if token == self.unk_token:
      self.unk_index = index
    return index

  def lookup_token(self, token):
    """Return the index of ``token``, or ``unk_index`` if it is unknown."""
    return self.token_to_index.get(token, self.unk_index)

  def lookup_index(self, index):
    """Return the token stored at ``index``.

    Raises:
      KeyError: If ``index`` is not in the vocabulary.
    """
    if index not in self.index_to_token:
      raise KeyError(f"the index {index} is not in the Vocabulary")
    return self.index_to_token[index]

  @staticmethod
  def _count_tokens(token_lists):
    """Count how often each token occurs across all token lists."""
    counter = Counter()
    for tokens in token_lists:
      counter.update(tokens)
    return counter

  # Build vocab from tokenized data filtered by minimum frequency of each word
  @classmethod
  def build_vocab_with_min_freq(cls, token_lists, min_freq=25):
    """Build a vocabulary of the words occurring at least ``min_freq`` times.

    Args:
      token_lists: Iterable of token lists.
      min_freq: Minimum number of occurrences for a word to be kept.

    Returns:
      A Vocabulary whose indices 0 and 1 are '<pad>' and '<unk>'.
    """
    counter = cls._count_tokens(token_lists)

    # '<pad>' is for padding review to have the same length
    # '<unk>' is for unknown word that are removed or not in train data
    vocab = {'<pad>': 0, '<unk>': 1}

    for word, freq in counter.items():
      # A corpus token equal to a reserved token must not overwrite its
      # reserved index; that would leave a gap and produce duplicate indices.
      if freq >= min_freq and word not in vocab:
        vocab[word] = len(vocab)

    return cls(vocab)

  # Build vocab from tokenized data filtered by maximum words in vocab
  @classmethod
  def build_vocab_with_max_words(cls, token_lists, max_words=8000):
    """Build a vocabulary of at most ``max_words`` entries.

    Args:
      token_lists: Iterable of token lists.
      max_words: Upper bound on the vocabulary size, including the two
        reserved tokens '<pad>' and '<unk>'.

    Returns:
      A Vocabulary whose indices 0 and 1 are '<pad>' and '<unk>', followed by
      the most frequent words.
    """
    counter = cls._count_tokens(token_lists)

    # '<pad>' is for padding review to have the same length
    # '<unk>' is for unknown word that are removed or not in train data
    vocab = {'<pad>': 0, '<unk>': 1}

    # Two slots are already taken by the reserved tokens.
    for word, _ in counter.most_common(max_words - 2):
      # Same guard as above: never overwrite a reserved token's index.
      if word not in vocab:
        vocab[word] = len(vocab)

    return cls(vocab)

In [None]:
# Fit the vocabulary on the training tokens, then report its size
review_vocab = Vocabulary.build_vocab_with_max_words(token_lists=imdb_df_train['tokens'])
len(review_vocab)

8000