In [None]:
# Add word2vec to the python path.
# The entry is a relative path, so it is resolved against the kernel's current
# working directory: run the notebook from the directory that contains "external/".
# NOTE(review): presumably needed so modules inside external/word2vec can resolve
# their own top-level imports — confirm.
import sys
sys.path.append("external/word2vec")

In [None]:
import os
import argparse
import yaml
import zipfile

import pandas as pd
import torch
from torch.utils.data import Dataset
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from external.word2vec.train import train
from external.word2vec.utils.helper import (
    get_model_class,
    get_optimizer_class,
    get_lr_scheduler,
    save_vocab,
    load_vocab,
)

In [None]:
# YAML file holding the training configuration.
CONFIG_PATH = "config.yaml"

# Number of dataset rows to keep; the dataset only subsamples when this is > 0.
DATA_SET_SIZE = -1

# Pretrained skip-gram checkpoint and its matching vocabulary file.
PRE_TRAINED_MODEL_PATH = os.path.join(
    "external", "word2vec", "models", "skipgram_blog", "best_val_model_5.67.pt"
)
PRE_TRAINED_VOCAB_PATH = os.path.join(
    "external", "word2vec", "models", "skipgram_blog", "vocab.pt"
)
# Alternative setting without pretrained artifacts:
# PRE_TRAINED_MODEL_PATH = None
# PRE_TRAINED_VOCAB_PATH = None

# Minimum token frequency used when building the dataset vocabulary.
VOCAB_MIN_WORD_FREQUENCY = 3

In [None]:
# Parse the YAML configuration file into `config` using the safe loader.
with open(CONFIG_PATH) as config_file:
    config = yaml.safe_load(config_file)

# Load Data

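The corpus used for this training is [Twitter Financial News](https://www.kaggle.com/datasets/sulphatet/twitter-financial-news). Note that the next cell reads `data/text-emotion.zip`; confirm that this archive is the corpus linked here.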

In [None]:
# Read the emotion text data. pandas infers the compression from the ".zip"
# extension and reads the CSV stored inside the archive.
emotion_df = pd.read_csv("data/text-emotion.zip")
# Display (rows, columns); `.size` would only report the total number of cells.
emotion_df.shape

In [None]:
class EmotionTextDataset(Dataset):
    """Map-style dataset that serves the first column of a DataFrame row by row.

    Args:
        df: Source frame. Each item is the value stored in the frame's first
            column (position 0) of the requested row.
        size: Number of rows to keep. When positive, the rows are shuffled and
            truncated to at most ``size`` rows. Any other value keeps the frame
            unchanged, in its original order.
        seed: Optional random state forwarded to ``DataFrame.sample`` so that the
            shuffled subset is reproducible across runs. ``None`` (the default)
            keeps the unseeded shuffle.
    """

    def __init__(self, df: pd.DataFrame, size = -1, seed = None):
        self.emotion_text = df
        # Shuffle and take a subset of the data.
        if size > 0:
            shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
            self.emotion_text = shuffled.iloc[:size]

    def __len__(self):
        """Return the number of rows kept in the dataset."""
        return len(self.emotion_text)

    def __getitem__(self, idx):
        """Return the first-column value of the row at position ``idx``.

        Positional lookup raises ``IndexError`` past the last row, which is what
        lets plain iteration over the dataset terminate.
        """
        return self.emotion_text.iloc[idx, 0]

In [None]:
# Wrap the loaded frame in a dataset, keeping DATA_SET_SIZE rows when that is positive.
emotion_dataset = EmotionTextDataset(df=emotion_df, size=DATA_SET_SIZE)

# Load Vocab

In [None]:
# Load the pretrained vocabulary when a path is configured; otherwise leave
# `vocab` as None.
if not PRE_TRAINED_VOCAB_PATH:
    vocab = None
else:
    vocab: Vocab = load_vocab(PRE_TRAINED_VOCAB_PATH)
    vocab_size = len(vocab.get_stoi())
    print(f"Pretrained vocab size: {vocab_size}")

In [None]:
# Basic English tokenizer from torchtext.
tokenizer = get_tokenizer("basic_english", language="en")
# Tokenize every dataset item lazily and build a vocabulary from the result,
# using VOCAB_MIN_WORD_FREQUENCY as the minimum token frequency.
extend_vocab = build_vocab_from_iterator(
    (tokenizer(text) for text in emotion_dataset),
    min_freq=VOCAB_MIN_WORD_FREQUENCY,
)
len(extend_vocab)

In [None]:
# Merge the dataset vocabulary into the pretrained one.
if vocab is None:
    # No pretrained vocab was loaded (PRE_TRAINED_VOCAB_PATH is unset), so there
    # is nothing to extend: the dataset vocabulary becomes the working vocab and
    # every one of its tokens counts as new.
    vocab = extend_vocab
    new_token = list(extend_vocab.get_stoi())
else:
    # Collect the missing tokens first, then append them, so the membership
    # checks all run against the vocab as it was before this cell.
    new_token = [word for word in extend_vocab.get_stoi() if word not in vocab]
    # Add all new tokens to the vocab.
    for token in new_token:
        vocab.append_token(token)
print(f"{len(new_token)} new tokens added to the vocab.")
vocab_size = len(vocab.get_stoi())
print(f"Extended vocab size: {vocab_size}")

# Transfer Learning

In [None]:
# Get the pretrained model, mapping its tensors onto the CPU so that loading
# does not require a GPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code —
# only load checkpoints from a trusted source.
if PRE_TRAINED_MODEL_PATH:
    pretrained_model = torch.load(PRE_TRAINED_MODEL_PATH, map_location=torch.device("cpu"))
else:
    # No checkpoint configured: leave the model unset instead of failing in torch.load.
    pretrained_model = None

In [None]:
# Launch training on the emotion dataset with the (extended) vocabulary.
# NOTE(review): `pretrained_model` loaded in the previous cell is not passed
# here — confirm how train() is expected to pick up the pretrained weights.
train(config=config, data_iter=emotion_dataset, vocab=vocab)