# NLP Project, Classification of Amazon Reviews and Key Phrases
#### CSCI 3832 Natural Language Processing
Members: Adam Wuth, Benjamin Kohav, Noah Vilas, Aiden Devine, Evan Zachary

### Requirements

In [1]:
import os, random, sys, copy
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from nltk.tokenize import word_tokenize
from datasets import load_dataset, concatenate_datasets, load_from_disk
from datetime import datetime

### Load in the data set
The dataset is split into categories, but we wanted all categories from 2020 onwards. This code block will take forever to run, only run it the first time to get the dataset. 

In [19]:
#The dataset is split into categories

categories = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
    "Unknown"
]

#to get reviews from 2023 onwards 2020 onwards was millions of reviews and was taking
#over an hour just to load the data
start_timestamp = int(datetime(2023, 1, 1).timestamp() * 1000)

#to store all datasets
allcats = []

for cat in categories:
    print(f"Loading category: {cat}")
    dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_review_{cat}", split="full[:150000]",  trust_remote_code=True)
    #dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", f"raw_review_{cat}", split="full[:1%]",  trust_remote_code=True)
   #dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", cat, split="full",  trust_remote_code=True) formatting issues
    #get the 2023 onwards and add to data
    filtered_dataset = dataset.filter(lambda x: x['timestamp'] >= start_timestamp)
    #allcats.append(dataset)
    allcats.append(filtered_dataset)
#make one final dataset    
reviews = concatenate_datasets(allcats)

print(f"Total reviews loaded: {len(reviews)}")

reviews.save_to_disk("filetred_amazon_reviews")


Loading category: All_Beauty


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Amazon_Fashion


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Appliances


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Arts_Crafts_and_Sewing


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Automotive


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Baby_Products


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Beauty_and_Personal_Care


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Books


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: CDs_and_Vinyl


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Cell_Phones_and_Accessories


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Clothing_Shoes_and_Jewelry


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Digital_Music


Filter:   0%|          | 0/130434 [00:00<?, ? examples/s]

Loading category: Electronics


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Gift_Cards


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Grocery_and_Gourmet_Food


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Handmade_Products


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Health_and_Household


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Health_and_Personal_Care


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Home_and_Kitchen


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Industrial_and_Scientific


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Kindle_Store


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Magazine_Subscriptions


Filter:   0%|          | 0/71497 [00:00<?, ? examples/s]

Loading category: Movies_and_TV


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Musical_Instruments


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Office_Products


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Patio_Lawn_and_Garden


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Pet_Supplies


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Software


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Sports_and_Outdoors


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Subscription_Boxes


Filter:   0%|          | 0/16216 [00:00<?, ? examples/s]

Loading category: Tools_and_Home_Improvement


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Toys_and_Games


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Video_Games


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Loading category: Unknown


Filter:   0%|          | 0/150000 [00:00<?, ? examples/s]

Total reviews loaded: 120088


Saving the dataset (0/1 shards):   0%|          | 0/120088 [00:00<?, ? examples/s]

If you have run that already, reviews was saved(should be in the working directory)so you can just do the next code block instead

In [21]:
reviews = load_from_disk("filetred_amazon_reviews")
print(len(reviews))
print(reviews[0])
print(reviews[1])
print(reviews.column_names)

120088
{'rating': 1.0, 'title': 'halo hair extensions', 'text': "This halo hair extension is simply put, garbage.  Now, you get what you pay for.  And this is a very cheap version.  The faux hair is very shiny and looks literally like bad barbie hair.  It looks WAY better in the photos than in real life.  The color is horrific, in my opinion of course.  The streaks are like paint strips.  And all of that would be one thing - but the worst is that the hair completely fell out!  I had hand fulls of hair strands just trying to put the halo on!  And you might think - well, maybe a little loss is to be expected?  Except this was handfuls and handfuls.  I literally dropped the whole thing right into the trash.  I would say this one is a pass for hair loss alone.  Having said all of this, I never hesitate to update my reviews should new info seem useful. All of my reviews reflect my honest, personal experience with the reviewed item - your experience may be different. I am not influenced by a

### Load in the Glove Embeddings

In [6]:
glove_file = '../glove.6B.50d.txt' # modify to appropriate path for your file system

embeddings_dict = {}

with open(glove_file, 'r', encoding='utf8') as f:
    for i, line in enumerate(f):
        line = line.strip().split(' ')
        word = line[0]
        embed = np.asarray(line[1:], "float")

        embeddings_dict[word] = embed


print('Loaded {} words from glove'.format(len(embeddings_dict)))

low = -1.0 / 3
high = 1.0 / 3
embedding_matrix = np.random.uniform(low=low, high=high, size=(len(embeddings_dict)+1, 50))

word2id = {}
for i, word in enumerate(embeddings_dict.keys(), 1):

    word2id[word] = i                                
    embedding_matrix[i] = embeddings_dict[word]      

word2id['<pad>'] = 0

Loaded 400000 words from glove


### Set up train and validation datasets

In [30]:
#modified from the HW_3 
class RNNMovieReviewDataset(torch.utils.data.Dataset):
    def __init__(self, hf_dataset=None, word2id=None, finalized_data=None, data_limit=250, max_length=128):
        """
        :param hf_dataset: A Hugging Face Dataset object (preloaded and filtered)
        :param word2id: The GloVe word2id dictionary
        :param finalized_data: Used to create validation set
        :param data_limit: Max number of examples to use
        :param max_length: Max sequence length
        """
        self.data_limit = data_limit
        self.max_length = max_length
        self.word2id = word2id

        if finalized_data:
            self.data = finalized_data
        else:
            examples = []
            labels = []

            for i, example in enumerate(hf_dataset):
                if i >= self.data_limit:
                    break
                examples.append(example["text"])
                labels.append(int(example["rating"]) - 1)  # 1–5 stars → 0–4

            tokenized = self.tokenize(examples)
            self.data = [(ids, length, label) for (ids, length), label in zip(tokenized, labels)]
            random.seed(42)
            random.shuffle(self.data)

    def tokenize(self, examples):
        example_ids = []
        misses = 0
        total = 0
        for example in tqdm(examples):
            tokens = word_tokenize(example)
            ids = []
            for tok in tokens:
                if tok in self.word2id:
                    ids.append(self.word2id[tok])
                else:
                    misses += 1
                    ids.append(self.word2id.get('unk', 0))
                total += 1

            if len(ids) >= self.max_length:
                ids = ids[:self.max_length]
                length = self.max_length
            else:
                length = len(ids)
                ids += [self.word2id['<pad>']] * (self.max_length - len(ids))

            example_ids.append((torch.tensor(ids), length))

        print(f'Missed {misses} out of {total} words -- {misses/total:.2%}')
        return example_ids

    def generate_validation_split(self, ratio=0.8):
        split_idx = int(ratio * len(self.data))
        val_split = self.data[split_idx:]
        self.data = self.data[:split_idx]
        return val_split

    def __getitem__(self, index):
        return self.data[index]  # returns (input_ids, length, label)

    def __len__(self):
        return len(self.data)


In [31]:
#also modified from hw3
train_dataset = RNNMovieReviewDataset(hf_dataset=reviews, word2id=word2id, data_limit=100000)
validation_examples = train_dataset.generate_validation_split()
print('Loaded {} train examples'.format(len(train_dataset)))

valid_dataset = RNNMovieReviewDataset(finalized_data=validation_examples, word2id=word2id)
print('Loaded {} validation examples'.format(len(valid_dataset)))

print(valid_dataset[0])  # (input_ids, length, label)

  0%|          | 0/100000 [00:00<?, ?it/s]

Missed 656201 out of 5940281 words -- 11.05%
Loaded 80000 train examples
Loaded 20000 validation examples
(tensor([201535,     33,  14034,      6,     82,    170,   4232,      4,  41522,
             5,  24505,      8,   7093,     47,   2685,     82,    304,     35,
          6175,      7,   4563,  12167,    406,  42965,    109,     40,    264,
             7,      8,  28287,   4770,     37,      8,   1931,      2,     43,
        201535,    473,     37,   1086,    198,     54,    770,  12608,      3,
        201535,   1251,     40,    268,   3083,      3,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             

In [29]:
input_ids, length, label = valid_dataset[1]
print("True length:", length)
print("Non-padded input:", input_ids[:length])
print("Label:", label)


True length: 21
Non-padded input: tensor([201535,   5179,      2,   4346,   7047,      3, 201535,   3877,   4147,
            38,    101,    182,      3, 201535,   1930,    315,   3877,     18,
            49,   5159,      3])
Label: 4
