In [39]:
import pandas as pd
import torch
from torch import nn
from collections import defaultdict

from transformers import BertTokenizer

In [4]:
# First download the dataset from https://www.kaggle.com/datasets/asaniczka/product-titles-text-classification
TITLES_TO_CATEGORIES_CSV = './titles_to_categories.csv'

In [5]:
df = pd.read_csv(TITLES_TO_CATEGORIES_CSV)

In [6]:
df.head()

Unnamed: 0,title,category_name
0,"Green Leaf WW3D Wonder Extension Cord Winder, ...",Industrial Scientific
1,8pcs Toilet Seat Bumpers Universal Toilet Repl...,Industrial Scientific
2,YaeCCC 19 Pcs Hole Saw Kit 3/4''(19mm)- 6''(15...,Industrial Scientific
3,LLPT Butyl Putty Tape White 1 Inch x 33Ft Tigh...,Industrial Scientific
4,"Lightbeam 16"" Long Stem Deep Fry Thermometer w...",Industrial Scientific


In [None]:
# Debugging
df = df.sample(frac=0.001).reset_index(drop=True)

In [22]:
len(df)

5390

In [23]:
df['category_name'].value_counts()[:100]

Sports & Outdoors                   735
Baby                                 39
Men's Shoes                          34
Handmade Artwork                     32
Beauty                               31
                                   ... 
Home Décor Products                  12
Automotive Interior Accessories      12
Hydraulics, Pneumatics  Plumbing     12
Science Education Supplies           12
Garden Furniture & Accessories       12
Name: category_name, Length: 100, dtype: int64

In [24]:
categories = df['category_name'].unique()
categories

array(['Game Hardware', 'Sports & Outdoors', 'Surveillance Cameras',
       'Beauty', 'Luggage and travel gear', 'Televisions  Video',
       'Collectible Toys', 'Skin Care Products',
       'PlayStation 4 Games, Consoles & Accessories',
       'Computer Components', 'Projectors',
       'Xbox 360 Games, Consoles & Accessories',
       'Nursery Furniture, Bedding  Décor', 'Indoor Lighting', 'Men',
       'Arts & Crafts', 'Bath  Body', 'Beanbag  Foot Bags',
       "Children's Outdoor Inflatable Bouncers  Bouncy Castles",
       'Lights, Bulbs & Indicators', 'Baby Travel Gear',
       'Small Animal Supplies', "Kids' Electronics",
       'Play Sets  Playground Equipment', 'Computer Monitors',
       'Plants, Seeds & Bulbs', 'Material Handling Products',
       'Coffee, Tea & Espresso', 'Home  Portable Audio',
       'Retail Store Fixtures & Equipment',
       'Garden Furniture & Accessories', 'Puzzles',
       'Science Education Supplies',
       'Uninterruptible Power Supply Units & Acce

In [18]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


Downloading tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 16.2kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.30MB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.95MB/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 1.41MB/s]


In [19]:
tokenizer.tokenize("I have a new GPU!")

['i', 'have', 'a', 'new', 'gp', '##u', '!']

In [25]:
df["tokenized_title"] = df["title"].apply(lambda title: tokenizer.tokenize(title))

In [26]:
df["category_label"] = df["category_name"].apply(lambda category: categories.tolist().index(category))

In [29]:
df[df["category_name"] == "Game Hardware"].head()

Unnamed: 0,title,category_name,tokenized_title,category_label
0,"Gaming Mouse, DPI 6 Speed Adjustable 13 RGB Li...",Game Hardware,"[gaming, mouse, ,, d, ##pi, 6, speed, adjustab...",0
248,CORSAIR K65 RGB Mini 60% Mechanical Gaming Key...,Game Hardware,"[co, ##rsa, ##ir, k, ##65, r, ##gb, mini, 60, ...",0
1403,Mouse Anti-Slip Grips Tape Paste for Logitech ...,Game Hardware,"[mouse, anti, -, slip, grips, tape, paste, for...",0
1633,Wallfire Gaming Headset Stereo Surround Sound ...,Game Hardware,"[wall, ##fire, gaming, heads, ##et, stereo, su...",0
3201,"Rechargeable Backlit Keyboard,2.4G Wireless Il...",Game Hardware,"[rec, ##har, ##ge, ##able, back, ##lit, keyboa...",0


In [31]:
# split the dataset into train and test
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

In [41]:
vocab_counts = defaultdict(lambda: 0)
for tokenized_title in df["tokenized_title"]:
    for token in tokenized_title:
        vocab_counts[token] += 1

In [44]:
tokens_by_count = sorted([(v, i) for v, i in vocab_counts.items()], key=lambda x: x[1], reverse=True)
vocab = {token: i for i, (token, _) in enumerate(tokens_by_count)}

In [45]:
vocab

{',': 0,
 '-': 1,
 'for': 2,
 '##s': 3,
 'with': 4,
 '(': 5,
 ')': 6,
 '/': 7,
 '.': 8,
 'and': 9,
 '2': 10,
 '##er': 11,
 's': 12,
 'women': 13,
 "'": 14,
 '3': 15,
 '1': 16,
 '##x': 17,
 '##r': 18,
 '4': 19,
 'men': 20,
 '5': 21,
 '##0': 22,
 'black': 23,
 '&': 24,
 'pack': 25,
 'x': 26,
 '"': 27,
 '##d': 28,
 '6': 29,
 '8': 30,
 'kids': 31,
 '##e': 32,
 'set': 33,
 'inch': 34,
 'to': 35,
 'water': 36,
 '##pc': 37,
 '##t': 38,
 'in': 39,
 '##mm': 40,
 'bag': 41,
 '##able': 42,
 '|': 43,
 '##v': 44,
 '10': 45,
 'of': 46,
 '##y': 47,
 '##g': 48,
 ':': 49,
 'outdoor': 50,
 't': 51,
 'white': 52,
 '##2': 53,
 '##k': 54,
 '12': 55,
 'blue': 56,
 '##l': 57,
 '##5': 58,
 'head': 59,
 'usb': 60,
 '##proof': 61,
 '##w': 62,
 '##m': 63,
 '##ing': 64,
 'size': 65,
 'home': 66,
 'light': 67,
 'a': 68,
 '##p': 69,
 '##a': 70,
 'girls': 71,
 '##7': 72,
 '##3': 73,
 '+': 74,
 '0': 75,
 'gift': 76,
 'cover': 77,
 'car': 78,
 '##gb': 79,
 '##6': 80,
 'the': 81,
 '##c': 82,
 '##n': 83,
 'case': 84,
 '