#How To Build a Model: Step By Step


In [34]:
#Pytorch Imports
%load_ext autoreload
%autoreload 2

import torch
from torch import nn

from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.transforms import ToTensor
from torchtext.vocab import build_vocab_from_iterator
#Pandas
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
import time

from NNDL.transformer.architecture import TransformerModel
from NNDL.RNN.architecture import RNN,MyNetwork
import NNDL.Utils.solver as solver
from NNDL.Utils.weight_tracker import ActivationMonitor


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from transformers import AutoTokenizer

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

<contextlib.ExitStack at 0x2349cabb410>

Pick Your Universal Hyperparamaters:

In [36]:
batch_size = 32
epochs = 5
input_dim = 100
hidden_dim = 50
output_dim = 10
n_filters=30
filter_size=5
drop_frac=0.5
embed_dim = 50
train_size = 0.7
test_size = 0.1
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
torch.cuda.amp.autocast()

print(f"==>> torch.cuda.is_available(): {torch.cuda.is_available()}")
print(device)
#device = "cpu"

==>> torch.cuda.is_available(): True
cuda


# Setup Your Model Workspace:

Your model architecture, and all of it's relevent code, will go in a folder at the location ./NNDL/(Your Model Name Here)


# Setup Your Dataset/Data Loaders Here

Pytorch usually takes a csv file. You need to write a function( For naming convention let's call it *create_torch_datasets()* ) that takes the universal dataset kyle got from parsing and seperates/loads it into data that you want your model to train on




In [37]:
#Create Custom Dataset Class:
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe
from transformers import BertTokenizer
# Load the pre-trained BERT tokenizer
tokenizer = get_tokenizer("basic_english")

# Input text
text = "Hello, how are you doing?"

vec = GloVe(name='6B', dim=embed_dim)
class TextDataset(Dataset):
    def __init__(self, data, root_dir):
        self.root_dir = root_dir
        self.data = data
        self.mapping = {'arts, crafts & sewing': 0,'books': 1, 'clothing, shoes & jewelry': 2,'electronics': 3, 'grocery & gourmet food': 4,'health & personal care': 5, 'musical instruments': 6, 'patio, lawn & garden': 7,'sports & outdoors': 8, 'toys & games': 9}
        self.cur_map = 0
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        if  type(self.data.iloc[index, 0]) == float:
            t="N/A"
            print('N/A Example  In dataset')
        else:
            t=self.data.iloc[index, 0]
        #print(t)
        tokens = tokenizer.tokenize(t)
        #print(tokens)
        token_length=input_dim
        tokens=tokens+[""] * (token_length-len(tokens))  if len(tokens)<token_length else tokens[:token_length]
        #print(tokens)
        tokens_emb = [vec.stoi.get(token, 0) for token in tokens]
        #print(tokens_emb)
        #print(tokens_emb.shape)
        # print(f"==>> tokens: {tokens}")
        # token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # print(f"==>> token_ids: {token_ids}")
        
        #input_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)
        category_text = os.path.join(self.data.iloc[index, 1])
        if category_text in self.mapping:
            category = self.mapping[category_text]
        else:
            self.mapping[category_text] = self.cur_map
            print("HOUSTON WE GOT A PROBLEM")
            print(self.mapping)
            category = self.cur_map
            self.cur_map+=1
        # print(f"self.mapping:{self.mapping}")
        return torch.tensor(tokens_emb),category

In [38]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input text
text = "Hello, how are you doing?"

# Tokenize the text
tokens = tokenizer.tokenize(text)
tokens_emb = [vec.stoi[token] for token in tokens]
print(torch.tensor(tokens_emb))
# Convert tokens to numerical IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the results
print("Original Text:", text)
print("Tokens:", tokens)
print("Token IDs:", torch.tensor(token_ids))

tensor([13075,     1,   197,    32,    81,   914,   188])
Original Text: Hello, how are you doing?
Tokens: ['hello', ',', 'how', 'are', 'you', 'doing', '?']
Token IDs: tensor([7592, 1010, 2129, 2024, 2017, 2725, 1029])


In [39]:
#For Transformer,CNN,RNN
d = pd.read_csv(r'data/products_noimg_uniform_LDKEbb.csv')

text_dataset = TextDataset(data=d,root_dir='/')

train,test,trash = torch.utils.data.random_split(text_dataset,[train_size,test_size,1-(train_size+test_size)])
train_dataloader =  DataLoader(train, batch_size=batch_size,
                        shuffle=True, num_workers=0, drop_last=True)
test_dataloader =  DataLoader(test, batch_size=batch_size,
                        shuffle=True, num_workers=0, drop_last=True)
#vocab= build_vocab_from_iterator(train_dataloader,specials=["<unk>"]).to(device)
#vocab.set_default_index(vocab["<unk>"])
# train_iter= IMDB(split="train")
# test_iter = IMDB(split="test")
# tokenizer = get_tokenizer("basic_english")
# def yield_tokens(data_iter):
#     for _, text in data_iter:
#         yield tokenizer(text)
# train_tokens = []
# vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# vocab.set_default_index(vocab["<unk>"])
# test_tokens = []
# test_tokens = build_vocab_from_iterator(yield_tokens(test_iter), specials=["<unk>"])



In [40]:
#This is an example of what I'm talking about. The Fashion dataset is pretty easy bc it's function alr exists
# but you need to create your own function to make the training data and test_ data datasets

#For Embedding
text_dataset

<__main__.TextDataset at 0x234bf956e50>

Put the dataset into data loader, and check the shape, make sure it's how you want it.


In [41]:
# Create data loaders.
#train_dataloader = DataLoader(text_dataset, batch_size=batch_size,shuffle=True, num_workers=0)
#print(train_dataloader[0])
# for i_batch, sample_batched in enumerate(train_dataloader):
#   print(f"==>> i_batch: {i_batch}")
#   print(f"==>> sample_batched: {len(sample_batched)}")
      #print(i_batch, sample_batched['text'][0][0].item(),
        #sample_batched['category'])

# Display image and label.
#train_features, train_labels = next(iter(train_dataloader))
# print(train_features)
# print(train_labels)
# print(f"Feature batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")
# img = train_features[0].squeeze()
# label = train_labels[0]
# plt.imshow(img, cmap="gray")
# plt.show()
# print(f"Label: {label}")

# Setup Model Architecture

1. Create your model architecture in your folder
2. Pick your loss function and optimizer

In [42]:

model = TransformerModel(input_dim,embed_dim,output_dim).to(device)
#device="cpu"
#model = RNN(input_dim,hidden_dim,output_dim,embed_dim).to(device)
#model = MyNetwork(embed_dim,n_filters,filter_size,drop_frac,output_dim,embed_dim).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)
activation_monitor = ActivationMonitor(model)



# Train the model
This should work to train your model. We may have to make some edits for different optimizations, but we can figure it out.


In [43]:
#model = torch.load("models\TransformerModel/0.8-0.2_epochs-0.pt",map_location="cpu")

#test(test_dataloader, model, loss_fn)

In [47]:
for t in range(epochs):
    #with torch.autograd.detect_anomaly():
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
        torch.save(model.state_dict(), f"models\{type(model).__name__}\{train_size}-{test_size}_epochs-{t}.pt")
        torch.cuda.empty_cache()
        test(test_dataloader, model, loss_fn)
# model.to("cpu")
# model.to(device)
# print("Done!")

SyntaxError: incomplete input (1657699005.py, line 5)

In [46]:
state_dict = torch.load(f"models\{type(model).__name__}\{train_size}-{test_size}_epochs-{t}_v2.pt")
model.load_state_dict(state_dict)

test(test_dataloader, model, loss_fn)

FileNotFoundError: [Errno 2] No such file or directory: 'models\\TransformerModel\\0.7-0.1_epochs-0_v2.pt'

In [None]:
#print("Number of layers:", len(activation_monitor.activations))
#for i, activation in enumerate(activation_monitor.activations):
#    print(f"Layer {i + 1}: {activation.shape}")


Number of layers: 21856
Layer 1: torch.Size([128, 100, 50])
Layer 2: torch.Size([128, 30, 100])
Layer 3: torch.Size([128, 30, 100])
Layer 4: torch.Size([128, 30, 20])
Layer 5: torch.Size([128, 30, 20])
Layer 6: torch.Size([128, 30, 20])
Layer 7: torch.Size([128, 30, 20])
Layer 8: torch.Size([128, 30, 4])
Layer 9: torch.Size([128, 30, 4])
Layer 10: torch.Size([128, 30, 4])
Layer 11: torch.Size([128, 30, 128])
Layer 12: torch.Size([128, 3840])
Layer 13: torch.Size([128, 3840])
Layer 14: torch.Size([128, 3840])
Layer 15: torch.Size([128, 10])
Layer 16: torch.Size([128, 10])
Layer 17: torch.Size([128, 100, 50])
Layer 18: torch.Size([128, 30, 100])
Layer 19: torch.Size([128, 30, 100])
Layer 20: torch.Size([128, 30, 20])
Layer 21: torch.Size([128, 30, 20])
Layer 22: torch.Size([128, 30, 20])
Layer 23: torch.Size([128, 30, 20])
Layer 24: torch.Size([128, 30, 4])
Layer 25: torch.Size([128, 30, 4])
Layer 26: torch.Size([128, 30, 4])
Layer 27: torch.Size([128, 30, 128])
Layer 28: torch.Size([128