In [1]:
import os
import argparse
import yaml
import zipfile

import pandas as pd
import torch
from torch.utils.data import Dataset
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from train import train
from utils.helper import (
    get_model_class,
    get_optimizer_class,
    get_lr_scheduler,
    save_vocab,
    load_vocab,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path of the YAML file holding the training configuration.
CONFIG_PATH = "config.yaml"

# Row budget handed to the dataset as its `size` argument.
DATA_SET_SIZE = 1000

# Pretrained skip-gram checkpoint and its vocabulary.
PRE_TRAINED_MODEL_PATH = os.path.join("models", "skipgram_blog", "best_val_model_5.67.pt")
PRE_TRAINED_VOCAB_PATH = os.path.join("models", "skipgram_blog", "vocab.pt")
# Alternative setting without pretrained artifacts (kept for reference):
# PRE_TRAINED_MODEL_PATH = None
# PRE_TRAINED_VOCAB_PATH = None

# Minimum token count used as `min_freq` when building the corpus vocabulary.
VOCAB_MIN_WORD_FREQUENCY = 3

In [3]:
# Parse the YAML training configuration into `config`.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
with open(CONFIG_PATH, "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Load Data

The corpus used for this training is the [Twitter Financial News](https://www.kaggle.com/datasets/sulphatet/twitter-financial-news) dataset; its train and validation splits are combined into a single corpus below.

In [4]:
# Open the archive (and each member) in a context manager so every file
# handle is released as soon as the CSVs have been parsed.
with zipfile.ZipFile(os.path.join("data", "twitter_financial_news.zip")) as twitter_zip:
    # Read the train_data and valid_data into dataframes.
    with twitter_zip.open("train_data.csv") as train_file:
        train_df = pd.read_csv(train_file)
    with twitter_zip.open("valid_data.csv") as valid_file:
        valid_df = pd.read_csv(valid_file)
# Combine the dataframes. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating the labels of both inputs.
dataset_df = pd.concat([train_df, valid_df], ignore_index=True)
dataset_df.head()

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0


In [5]:
# Dataset for twitter financial news text.
class TwitterFinancialNewsDataset(Dataset):
    """Map-style dataset returning the first-column value of each row.

    In this notebook the first column of the frame is the tweet text.

    Args:
        df: Source frame. Rows are read by position, so index labels do not
            matter.
        size: When positive, the rows are shuffled and only the first
            ``size`` rows are kept. Any other value keeps every row in its
            original order.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffled subset can be reproduced. The default ``None`` keeps the
            earlier behaviour (a different subset on each run).
    """

    def __init__(self, df: pd.DataFrame, size: int = -1, random_state=None):
        self.twitter_financial_news_df = df
        # Shuffle and take a subset of the data.
        if size > 0:
            shuffled = df.sample(frac=1, random_state=random_state)
            self.twitter_financial_news_df = shuffled.reset_index(drop=True)[:size]

    def __len__(self):
        """Return the number of rows kept in the dataset."""
        return len(self.twitter_financial_news_df)

    def __getitem__(self, idx):
        """Return the value in column 0 of the row at position ``idx``."""
        return self.twitter_financial_news_df.iloc[idx, 0]

In [6]:
# Wrap the combined dataframe in the dataset, limited to DATA_SET_SIZE rows.
twitter_dataset = TwitterFinancialNewsDataset(
    df=dataset_df,
    size=DATA_SET_SIZE,
)

# Load Vocab

In [7]:
# Load the pretrained vocabulary when a path is configured; otherwise leave
# `vocab` as None.
if not PRE_TRAINED_VOCAB_PATH:
    vocab = None
else:
    vocab: Vocab = load_vocab(PRE_TRAINED_VOCAB_PATH)
    vocab_size = len(vocab.get_stoi())
    print(f"Pretrained vocab size: {vocab_size}")

Pretrained vocab size: 6630


In [8]:
# torchtext's "basic_english" tokenizer for English text.
tokenizer = get_tokenizer("basic_english", language="en")
# Vocabulary of the fine-tuning corpus: every item of the dataset is tokenized
# and tokens seen fewer than VOCAB_MIN_WORD_FREQUENCY times are dropped.
extend_vocab = build_vocab_from_iterator(
    (tokenizer(tweet) for tweet in twitter_dataset),
    min_freq=VOCAB_MIN_WORD_FREQUENCY,
)
len(extend_vocab)

1207

In [9]:
# Extend the pretrained vocab with the corpus tokens it does not know yet.
# When no pretrained vocab was loaded (`vocab` is None) there is nothing to
# extend, so the step is skipped instead of failing on `word in vocab`.
if vocab is None:
    new_token = []
    print("No pretrained vocab loaded; skipping vocab extension.")
else:
    # Collect first, append afterwards, so membership is checked against the
    # vocab as it was before this cell modified it.
    new_token = [word for word in extend_vocab.get_stoi() if word not in vocab]
    # Add all new tokens to the vocab.
    for token in new_token:
        vocab.append_token(token)
    print(f"{len(new_token)} new tokens added to the vocab.")
    vocab_size = len(vocab.get_stoi())
    print(f"Extended vocab size: {vocab_size}")

349 new tokens added to the vocab.
Extended vocab size: 6979


# Transfer Learning

In [10]:
# Get the pretrained model (mapped onto the CPU), or None when no checkpoint
# path is configured -- mirrors how the pretrained vocab is handled.
# NOTE(security): torch.load unpickles the file; only load checkpoints from a
# trusted source.
# NOTE(review): `pretrained_model` is not referenced by the train(...) call in
# this notebook -- confirm how the pretrained weights are meant to reach the
# training run.
if PRE_TRAINED_MODEL_PATH:
    pretrained_model = torch.load(PRE_TRAINED_MODEL_PATH, map_location=torch.device("cpu"))
else:
    pretrained_model = None

In [11]:
# Launch training on the tweet dataset with the loaded config and the vocab
# prepared above.
# NOTE(review): the loaded pretrained model is not passed in here -- confirm
# whether train() is expected to pick it up some other way.
train(config=config, data_iter=twitter_dataset, vocab=vocab)

Using device: cpu
Vocabulary size: 6979
Adjusting learning rate of group 0 to 2.5000e-02.
Epoch: 1/16, Train Loss=6.72336, Val Loss=6.28910
Time elapsed: 0.18 min, average epoch time: 0.18 min, predicting finish time: 2.84 min
Adjusting learning rate of group 0 to 2.3438e-02.
Epoch: 2/16, Train Loss=5.69461, Val Loss=6.10489
Time elapsed: 0.36 min, average epoch time: 0.18 min, predicting finish time: 2.84 min
Adjusting learning rate of group 0 to 2.1875e-02.
Epoch: 3/16, Train Loss=5.41644, Val Loss=5.96714
Time elapsed: 0.53 min, average epoch time: 0.18 min, predicting finish time: 2.84 min
Adjusting learning rate of group 0 to 2.0313e-02.
Epoch: 4/16, Train Loss=5.18913, Val Loss=6.01272
Time elapsed: 0.71 min, average epoch time: 0.18 min, predicting finish time: 2.83 min
Adjusting learning rate of group 0 to 1.8750e-02.
Epoch: 5/16, Train Loss=5.07249, Val Loss=5.94625
Time elapsed: 0.89 min, average epoch time: 0.18 min, predicting finish time: 2.85 min
Adjusting learning rate o