# Training Settings

In [1]:
import os

In [2]:
# --- Output location and model choice ---------------------------------------
# Folder handed to the Trainer and to save_vocab for the saved artifacts.
MODEL_DIR = os.path.join("models", "skipgram")
# Name used to look up the model class and to configure the dataloaders.
MODEL_NAME = "skipgram"

# --- Data --------------------------------------------------------------------
# Batch size and shuffle flag forwarded to both dataloaders.
BATCH_SIZE = 4
SHUFFLE = True
# Number of blog posts kept after shuffling the corpus.
DATA_SET_SIZE = 10_000

# --- Optimisation ------------------------------------------------------------
# Optimizer name resolved by get_optimizer_class, and its initial learning rate.
OPTIMIZER = "Adam"
LEARNING_RATE = 0.025
# Number of epochs given to both the LR scheduler and the Trainer.
EPOCHS = 32
# Forwarded to the Trainer as train_steps / val_steps.
# NOTE(review): None presumably means "no per-epoch step cap" - confirm in utils.trainer.
TRAIN_STEPS = None
VAL_STEPS = None

# Forwarded to the Trainer as checkpoint_frequency.
# NOTE(review): None presumably disables intermediate checkpoints - confirm in utils.trainer.
CHECKPOINT_FREQUENCY = None

# --- Warm start (optional) ---------------------------------------------------
# Point these at saved artifacts to resume from them, for example:
#   PRE_TRAINED_MODEL_PATH = os.path.join("weights", "skipgram_WikiText2", "model.pt")
#   PRE_TRAINED_VOCAB_PATH = os.path.join("weights", "skipgram_WikiText2", "vocab.pt")
# Left as None, no vocabulary or model weights are loaded.
PRE_TRAINED_MODEL_PATH = None
PRE_TRAINED_VOCAB_PATH = None

# --- Hardware ----------------------------------------------------------------
# Request the GPU; the device cell falls back to the CPU when CUDA is unavailable.
USE_CUDA = True

# Load Data

The corpus used for this training is the [Blog Authorship Corpus](https://www.kaggle.com/datasets/rtatman/blog-authorship-corpus).

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset
import torch.nn as nn

from utils.dataloader import get_custom_dataloader_and_vocab
from utils.trainer import Trainer
from utils.helper import (
    get_model_class,
    get_optimizer_class,
    get_lr_scheduler,
    save_vocab,
    load_vocab
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Dataset for blog text.
class BlogDataset(Dataset):
    """Dataset that serves the text of one blog post per index from a CSV file.

    Args:
        path: CSV file to load; forwarded unchanged to ``pandas.read_csv``.
        size: When greater than zero, the rows are shuffled and only the
            first ``size`` are kept (all rows if the file has fewer).
            With the default ``-1`` every row is kept in file order.
        text_column: Column that holds the post text, given as a zero-based
            position (``int``) or a header label (``str``). The default,
            ``6``, is the position this notebook has always read.
        random_state: Seed forwarded to ``DataFrame.sample`` for the
            shuffle. ``None`` (default) keeps the previous unseeded
            behaviour; pass an int to get a repeatable subset.

    Raises:
        KeyError: ``text_column`` is a label that is not in the header.
        IndexError: ``text_column`` is a position outside the columns.
    """

    def __init__(self, path, size=-1, text_column=6, random_state=None):
        blog_df = pd.read_csv(path)
        # Shuffle and take a subset of the data only when a size is requested.
        if size > 0:
            blog_df = blog_df.sample(frac=1, random_state=random_state)
            blog_df = blog_df.reset_index(drop=True).iloc[:size]
        self.blog_df = blog_df
        # Resolve the column once, so a wrong column fails here rather than
        # on the first item access somewhere inside a training loop.
        self._text_position = self._column_position(blog_df, text_column)

    @staticmethod
    def _column_position(frame, column):
        """Translate a column label or position into a validated position."""
        if isinstance(column, str):
            labels = list(frame.columns)
            if column not in labels:
                raise KeyError(f"column {column!r} not found in {labels}")
            return labels.index(column)
        n_columns = frame.shape[1]
        if not -n_columns <= column < n_columns:
            raise IndexError(
                f"text_column {column} is out of range for {n_columns} columns"
            )
        return column

    def __len__(self):
        """Return the number of rows kept."""
        return len(self.blog_df)

    def __getitem__(self, idx):
        """Return the text-column value of row ``idx``."""
        return self.blog_df.iloc[idx, self._text_position]

In [5]:
# Read in the dataset; BlogDataset shuffles the corpus and keeps DATA_SET_SIZE posts.
blog_dataset = BlogDataset(os.path.join("data", "blog.zip"), size=DATA_SET_SIZE)
# Split the dataset into train and validation sets (80% / 20%).
# A fixed seed makes the partition repeatable for a given dataset order, so
# validation losses from different runs are measured on the same kind of split.
# The subset drawn by BlogDataset above is still unseeded in this call.
SPLIT_SEED = 42
train_dataset, val_dataset = train_test_split(
    blog_dataset, test_size=0.2, random_state=SPLIT_SEED
)
# Get the size of train and validation sets.
len(train_dataset), len(val_dataset)

(8000, 2000)

# Get DataLoader

In [6]:
# Start from a saved vocabulary when a path is configured; otherwise pass None
# and use whatever vocabulary the training loader call returns.
vocab = load_vocab(PRE_TRAINED_VOCAB_PATH) if PRE_TRAINED_VOCAB_PATH else None

# Settings shared by the training and validation loaders.
loader_kwargs = dict(model_name=MODEL_NAME, batch_size=BATCH_SIZE, shuffle=SHUFFLE)

# The vocabulary returned with the training loader replaces `vocab` from here on.
train_loader, vocab = get_custom_dataloader_and_vocab(
    data_iter=train_dataset, vocab=vocab, **loader_kwargs
)
# The validation loader is given that same vocabulary; the one it returns is discarded.
test_loader, _ = get_custom_dataloader_and_vocab(
    data_iter=val_dataset, vocab=vocab, **loader_kwargs
)

# Number of entries in the string-to-index mapping; used to size the model.
vocab_size = len(vocab.get_stoi())
vocab_size

2562

In [7]:
# Pick the GPU only when it is both requested (USE_CUDA) and actually available;
# fall back to the CPU otherwise.
device = torch.device(
    "cuda" if (USE_CUDA and torch.cuda.is_available()) else "cpu"
)

device

device(type='cuda')

In [8]:
# Build the training components.
# Resolve the model class from its configured name and size it to the vocabulary.
model_class = get_model_class(MODEL_NAME)
model = model_class(vocab_size=vocab_size)
criterion = nn.CrossEntropyLoss()
# Resolve the optimizer class from its configured name and bind it to the
# model's parameters with the initial learning rate.
optimizer_class = get_optimizer_class(OPTIMIZER)
optimizer = optimizer_class(model.parameters(), lr=LEARNING_RATE)
# NOTE(review): the schedule is defined in utils.helper. The log recorded below
# this cell shows the rate dropping by a constant step each epoch, starting from
# LEARNING_RATE - presumably a linear decay over EPOCHS; confirm in the helper.
lr_scheduler = get_lr_scheduler(optimizer, EPOCHS, verbose=True)

# Hand every component and setting to the Trainer. The validation loader built
# earlier (named test_loader) is passed as val_dataloader.
trainer = Trainer(
    model=model,
    epochs=EPOCHS,
    train_dataloader=train_loader,
    train_steps=TRAIN_STEPS,
    val_dataloader=test_loader,
    val_steps=VAL_STEPS,
    criterion=criterion,
    optimizer=optimizer,
    checkpoint_frequency=CHECKPOINT_FREQUENCY,
    lr_scheduler=lr_scheduler,
    device=device,
    model_dir=MODEL_DIR,
    model_name=MODEL_NAME,
)

# Optional warm start: load saved weights before training when a path is configured.
if PRE_TRAINED_MODEL_PATH:
    trainer.load_model(PRE_TRAINED_MODEL_PATH)

trainer.train()
print("Training finished.")

# Persist the results: the model and loss history through the trainer, and the
# vocabulary into MODEL_DIR.
trainer.save_model()
trainer.save_loss()
save_vocab(vocab, MODEL_DIR)
print("Model artifacts saved to folder:", MODEL_DIR)

Adjusting learning rate of group 0 to 2.5000e-02.
Epoch: 1/32, Train Loss=5.37179, Val Loss=5.28488
Adjusting learning rate of group 0 to 2.4219e-02.
Epoch: 2/32, Train Loss=5.30688, Val Loss=5.33356
Adjusting learning rate of group 0 to 2.3438e-02.
Epoch: 3/32, Train Loss=5.30013, Val Loss=5.28112
Adjusting learning rate of group 0 to 2.2656e-02.
Epoch: 4/32, Train Loss=5.30280, Val Loss=5.28806
Adjusting learning rate of group 0 to 2.1875e-02.
Epoch: 5/32, Train Loss=5.29809, Val Loss=5.27838
Adjusting learning rate of group 0 to 2.1094e-02.
Epoch: 6/32, Train Loss=5.29697, Val Loss=5.28052
Adjusting learning rate of group 0 to 2.0313e-02.
Epoch: 7/32, Train Loss=5.29381, Val Loss=5.28589
Adjusting learning rate of group 0 to 1.9531e-02.
Epoch: 8/32, Train Loss=5.29285, Val Loss=5.27907
Adjusting learning rate of group 0 to 1.8750e-02.
Epoch: 9/32, Train Loss=5.28747, Val Loss=5.26872
Adjusting learning rate of group 0 to 1.7969e-02.
Epoch: 10/32, Train Loss=5.28192, Val Loss=5.26569