#How To Build a Model: Step By Step


In [21]:
#Pytorch Imports
%load_ext autoreload
%autoreload 2
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.data import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.transforms import ToTensor
from torchtext.vocab import build_vocab_from_iterator
#Pandas
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
import time




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from transformers import AutoTokenizer

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode

<contextlib.ExitStack at 0x225de5e3d10>

Pick Your Universal Hyperparamaters:

In [23]:
batch_size = 50
epochs = 10
input_dim = 512
hidden_dim = 100
output_dim = 10
n_filters=128
filter_size=5
drop_frac=0.5
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"==>> torch.cuda.is_available(): {torch.cuda.is_available()}")
print(device)

==>> torch.cuda.is_available(): False
cpu


# Setup Your Model Workspace:

Your model architecture, and all of it's relevent code, will go in a folder at the location ./NNDL/(Your Model Name Here)


# Setup Your Dataset/Data Loaders Here

Pytorch usually takes a csv file. You need to write a function( For naming convention let's call it *create_torch_datasets()* ) that takes the universal dataset kyle got from parsing and seperates/loads it into data that you want your model to train on




In [24]:
#Create Custom Dataset Class:
class TextDataset(Dataset):
    def __init__(self, csv_file, root_dir):
        self.root_dir = root_dir
        self.data = pd.read_csv(csv_file).dropna()
        self.mapping = {}
        self.cur_map = 0
        self.tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
    def __len__(self):
        return len(self.data)
    def __getitem__(self, index):
        if  type(self.data.iloc[index, 0]) == float:
            t="N/A"
            print('N/A Example  In dataset')
        else:
            t=self.data.iloc[index, 0]
        text_o = os.path.join(self.root_dir,
            t)
        tokens = self.tokenizer(text_o, padding='max_length',truncation=True)['input_ids']
        # print(f"==>> tokens: {tokens}")
        # token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # print(f"==>> token_ids: {token_ids}")
        
        #input_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)
        category_text = os.path.join(self.root_dir,
            self.data.iloc[index, 1])
        if category_text in self.mapping:
            category = self.mapping[category_text]
        else:
            self.mapping[category_text] = self.cur_map
            print(self.mapping)
            category = self.cur_map
            self.cur_map+=1
        # print(f"self.mapping:{self.mapping}")
        return torch.tensor(tokens,dtype=torch.float32),category

In [25]:
from transformers import BertTokenizer

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input text
text = "Hello, how are you doing?"

# Tokenize the text
tokens = tokenizer.tokenize(text)

# Convert tokens to numerical IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Print the results
print("Original Text:", text)
print("Tokens:", tokens)
print("Token IDs:", torch.tensor(token_ids))

Original Text: Hello, how are you doing?
Tokens: ['hello', ',', 'how', 'are', 'you', 'doing', '?']
Token IDs: tensor([7592, 1010, 2129, 2024, 2017, 2725, 1029])


In [26]:
#This is an example of what I'm talking about. The Fashion dataset is pretty easy bc it's function alr exists
# but you need to create your own function to make the training data and test_ data datasets
text_dataset = TextDataset(csv_file=r'data\products_noimg_uniform_LDKEbb.csv',root_dir='/')

# for i, sample in enumerate(text_dataset):
#     print(i, len(sample[0]), sample[1])
#generator1 = torch.Generator().manual_seed(42)
train,test,trash = torch.utils.data.random_split(text_dataset,[0.08,0.02,0.9])
train_dataloader =  DataLoader(train, batch_size=batch_size,
                        shuffle=True, num_workers=0, drop_last=True)
test_dataloader =  DataLoader(test, batch_size=batch_size,
                        shuffle=True, num_workers=0, drop_last=True)

# train_iter= IMDB(split="train")
# test_iter = IMDB(split="test")
# tokenizer = get_tokenizer("basic_english")
# def yield_tokens(data_iter):
#     for _, text in data_iter:
#         yield tokenizer(text)
# train_tokens = []
# vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# vocab.set_default_index(vocab["<unk>"])
# test_tokens = []
# test_tokens = build_vocab_from_iterator(yield_tokens(test_iter), specials=["<unk>"])



Put the dataset into data loader, and check the shape, make sure it's how you want it.


In [27]:
# Create data loaders.
#train_dataloader = DataLoader(text_dataset, batch_size=batch_size,shuffle=True, num_workers=0)
#print(train_dataloader[0])
# for i_batch, sample_batched in enumerate(train_dataloader):
#   print(f"==>> i_batch: {i_batch}")
#   print(f"==>> sample_batched: {len(sample_batched)}")
      #print(i_batch, sample_batched['text'][0][0].item(),
        #sample_batched['category'])

# Display image and label.
#train_features, train_labels = next(iter(train_dataloader))
# print(train_features)
# print(train_labels)
# print(f"Feature batch shape: {train_features.size()}")
# print(f"Labels batch shape: {train_labels.size()}")
# img = train_features[0].squeeze()
# label = train_labels[0]
# plt.imshow(img, cmap="gray")
# plt.show()
# print(f"Label: {label}")

# Setup Model Architecture

1. Create your model architecture in your folder
2. Pick your loss function and optimizer

In [28]:
from NNDL.transformer.architecture import Transformer2
from NNDL.RNN.architecture import RNN,MyNetwork
from NNDL.Utils.solver import train,test
from NNDL.Utils.weight_tracker import ActivationMonitor
layer=nn.TransformerEncoderLayer(d_model=512,nhead=16)
#model = nn.TransformerEncoder(layer,num_layers=12).to(device)
#device="cpu"
#model = RNN(input_dim,hidden_dim,output_dim).to(device)
model = MyNetwork(input_dim,n_filters,filter_size,drop_frac,output_dim).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
activation_monitor = ActivationMonitor(model)



TypeError: MyNetwork.__init__() missing 1 required positional argument: 'num_classes'

# Train the model
This should work to train your model. We may have to make some edits for different optimizations, but we can figure it out.


In [None]:
for t in range(epochs):
    #with torch.autograd.detect_anomaly():
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
        test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
{'/musical instruments': 0}
{'/musical instruments': 0, '/books': 1}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2, '/toys & games': 3}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2, '/toys & games': 3, '/patio, lawn & garden': 4}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2, '/toys & games': 3, '/patio, lawn & garden': 4, '/arts, crafts & sewing': 5}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2, '/toys & games': 3, '/patio, lawn & garden': 4, '/arts, crafts & sewing': 5, '/grocery & gourmet food': 6}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2, '/toys & games': 3, '/patio, lawn & garden': 4, '/arts, crafts & sewing': 5, '/grocery & gourmet food': 6, '/clothing, shoes & jewelry': 7}
{'/musical instruments': 0, '/books': 1, '/sports & outdoors': 2, '/toys & games': 3, '/patio, lawn & g

KeyboardInterrupt: 

In [None]:
print("Number of layers:", len(activation_monitor.activations))
for i, activation in enumerate(activation_monitor.activations):
    print(f"Layer {i + 1}: {activation.shape}")


Number of layers: 0
