In [1]:
# Add word2vec to the python path.
import sys

WORD2VEC_DIR = "external/word2vec"
# Guard the append so re-running this cell does not pile up duplicate entries.
if WORD2VEC_DIR not in sys.path:
    sys.path.append(WORD2VEC_DIR)

In [2]:
import os
import argparse
import yaml
import zipfile

import pandas as pd
import torch
from torch.utils.data import Dataset
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from external.word2vec.train import train
from external.word2vec.utils.helper import (
    get_model_class,
    get_optimizer_class,
    get_lr_scheduler,
    save_vocab,
    load_vocab,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Path to the YAML file that holds the settings read by the cells below.
CONFIG_PATH = "config.yaml"

In [4]:
# Parse the YAML settings; explicit UTF-8 avoids the platform-dependent default encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)
# An empty file parses to None; fail here rather than on the first config[...] lookup.
if not isinstance(config, dict):
    raise ValueError(
        f"{CONFIG_PATH} must contain a YAML mapping, got {type(config).__name__}"
    )

# Load Data

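The corpus used for this training is [Twitter Financial News](https://www.kaggle.com/datasets/sulphatet/twitter-financial-news). **Note (review):** the cell below reads the emotion text data from `data/text-emotion.zip`, so confirm that this description and link match the corpus actually being loaded.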

In [5]:
# Read the emotion text data (pandas decompresses the zip archive itself).
emotion_df = pd.read_csv("data/text-emotion.zip")
# NOTE: `.size` is the total number of cells (rows x columns), not the row count.
emotion_df.size

42918

In [6]:
# Dataset serving the emotion text corpus.
class EmotionTextDataset(Dataset):
    """Map-style dataset exposing the first column of a DataFrame as text.

    Args:
        df: Source frame; column 0 is what ``__getitem__`` returns.
        size: Number of rows to keep after shuffling. ``None`` or any
            non-positive value keeps every row in its original order.
        seed: Optional seed forwarded to ``DataFrame.sample`` so the shuffled
            subset is reproducible. ``None`` (default) leaves the shuffle
            unseeded.
    """

    def __init__(self, df: pd.DataFrame, size = -1, seed = None):
        self.emotion_text = df
        # A null config value arrives as None, and ``None > 0`` raises
        # TypeError, so treat it the same as "no subsampling".
        if size is not None and size > 0:
            # Shuffle the whole frame, then keep the first ``size`` rows.
            shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
            self.emotion_text = shuffled[:size]

    def __len__(self):
        """Return the number of rows currently held."""
        return len(self.emotion_text)

    def __getitem__(self, idx):
        """Return the text at ``idx``.

        ``idx`` is forwarded to ``DataFrame.iloc``, so an integer yields a
        single value while a slice yields a Series of texts.
        """
        return self.emotion_text.iloc[idx, 0]

In [13]:
# Read in the dataset, subsampled to config["dataset_size"] rows when that value is positive.
emotion_dataset = EmotionTextDataset(emotion_df, size = config["dataset_size"])
# Preview the first ten texts; the slice is forwarded to `DataFrame.iloc`, so a Series comes back.
emotion_dataset[:10]

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
4                                 i am feeling grouchy
5    ive been feeling a little burdened lately wasn...
6    ive been taking or milligrams or times recomme...
7    i feel as confused about life as a teenager or...
8    i have been with petronas for years i feel tha...
9                                  i feel romantic too
Name: Text, dtype: object

# Load Vocab

In [8]:
# Start from the pretrained vocabulary when the config names one.
pretrained_vocab_path = config["pre_trained_vocab_path"]
if not pretrained_vocab_path:
    # No path configured: the vocab is built from the dataset later on.
    vocab = None
else:
    vocab: Vocab = load_vocab(pretrained_vocab_path)
    vocab_size = len(vocab.get_stoi())
    print(f"Pretrained vocab size: {vocab_size}")

Pretrained vocab size: 6630


In [9]:
# Tokenize with torchtext's "basic_english" tokenizer.
tokenizer = get_tokenizer("basic_english", language="en")
# Lazily tokenize every text in the dataset and build a vocab from the result.
token_stream = (tokenizer(text) for text in emotion_dataset)
extend_vocab = build_vocab_from_iterator(
    token_stream,
    specials=["<unk>"],
    min_freq=config["vocab_min_word_frequency"],
)
# Unknown tokens resolve to the <unk> index.
unk_index = extend_vocab["<unk>"]
extend_vocab.set_default_index(unk_index)
len(extend_vocab)

870

In [10]:
if vocab:
    # Tokens seen in the dataset that the pretrained vocab does not know yet.
    new_token = [word for word in extend_vocab.get_stoi() if word not in vocab]
    # Add all new tokens to the vocab.
    for token in new_token:
        vocab.append_token(token)
    print(f"{len(new_token)} new tokens added to the vocab.")
else:
    # No pretrained vocab: the dataset vocab is used as-is.
    vocab = extend_vocab
# Both paths report the final vocabulary size.
vocab_size = len(vocab.get_stoi())
print(f"Extended vocab size: {vocab_size}")

86 new tokens added to the vocab.
Extended vocab size: 6716


# Transfer Learning

In [11]:
# Load the pretrained model onto the CPU when the config names a checkpoint.
# NOTE(review): torch.load unpickles the file - only point the config at checkpoints you trust.
pretrained_model_path = config["pre_trained_model_path"]
pretrained_model = (
    torch.load(pretrained_model_path, map_location=torch.device("cpu"))
    if pretrained_model_path
    else None
)

In [12]:
# Arguments common to the transfer-learning and from-scratch runs.
train_kwargs = {
    "config": config,
    "data_iter": emotion_dataset,
    "vocab": vocab,
}
# Hand over the pretrained model only when one was loaded.
if pretrained_model:
    train_kwargs["transfer_model"] = pretrained_model
# The trailing semicolon keeps Jupyter from echoing train()'s return value.
train(**train_kwargs);

Using device: cuda
Vocabulary size: 6716
Adjusting learning rate of group 0 to 2.5000e-02.
Transfer learning enabled. Pre-trained model loaded.
Epoch: 1/16, Train Loss=6.19652, Val Loss=6.05199
Time elapsed: 0.16 min, average epoch time: 0.16 min, predicting finish time: 2.48 min
Adjusting learning rate of group 0 to 2.3438e-02.
Epoch: 2/16, Train Loss=5.96399, Val Loss=6.00583
Time elapsed: 0.30 min, average epoch time: 0.15 min, predicting finish time: 2.43 min
Adjusting learning rate of group 0 to 2.1875e-02.
Epoch: 3/16, Train Loss=5.90004, Val Loss=5.96273
Time elapsed: 0.46 min, average epoch time: 0.15 min, predicting finish time: 2.44 min
Adjusting learning rate of group 0 to 2.0313e-02.
Epoch: 4/16, Train Loss=5.84412, Val Loss=5.94971
Time elapsed: 0.61 min, average epoch time: 0.15 min, predicting finish time: 2.45 min
Adjusting learning rate of group 0 to 1.8750e-02.
Epoch: 5/16, Train Loss=5.80430, Val Loss=5.94428
Time elapsed: 0.76 min, average epoch time: 0.15 min, pred