# Models

## Common Code

Common functions for training and fine-tuning the BERT models.

In [2]:
import os
import sys
# Absolute path of the parent of the current working directory
module_path = os.path.abspath('..')
print("INIT module_path: ", module_path)
# Make that directory importable (only add it once)
if module_path not in sys.path:
    sys.path.append(module_path)

# Folders used for datasets and for saved model checkpoints
DATA_DIR = f"{module_path}/data"
MODEL_DIR = f"{module_path}/model"

# Create both folders up front; existing folders are left alone
for data_dir in (DATA_DIR, MODEL_DIR):
    os.makedirs(data_dir, exist_ok=True)

INIT module_path:  c:\Users\Alan\Desktop\Open_Source\BERT-TLSA-paper


In [None]:
from transformers import BertTokenizerFast, BertForMaskedLM, BertModel, BertConfig
import torch
import enum
from typing import cast

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class BERTSentimentClassifier(torch.nn.Module):
    """
    BERT encoder with a small head that predicts a normalized rating.

    The encoder's pooled output is passed through a linear layer and a
    sigmoid, so every prediction lies between 0 and 1.
    """
    def __init__(self, bert: "BertModel"):
        """
        bert: encoder whose ``config.hidden_size`` sizes the head and whose
            output exposes ``pooler_output``.
        """
        # Must run before any sub-module is assigned; otherwise torch.nn.Module
        # raises "cannot assign module before Module.__init__() call"
        super().__init__()
        self.bert = bert
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(bert.config.hidden_size, 1), # Convert from the hidden state to a single output
            torch.nn.Sigmoid() # Constrain output between 0 and 1
        )
        # The head emits a single value per sample. Cross entropy over a single
        # output unit is degenerate (the softmax of one value is always 1, so
        # the loss is always 0 and nothing is learned), hence a regression loss
        # against the normalized rating is used instead.
        self.loss_fn = torch.nn.MSELoss()

    def forward(self, expected_rating: torch.FloatTensor, *args, **kwargs) -> dict:
        """
        Run the encoder and the head, and score the prediction.

        expected_rating: target normalized ratings, one per sample. Any shape
            holding ``batch_size`` values is accepted.
        *args, **kwargs: forwarded unchanged to the wrapped BERT model.

        Returns a dict with:
            "norm_rating": predictions of shape (batch_size, 1)
            "loss": scalar loss between predictions and ``expected_rating``
        """
        bert_output = self.bert(*args, **kwargs)
        # https://huggingface.co/transformers/v3.2.0/model_doc/bert.html
        # Pooler output is last layer of hidden state for [CLS] token, whose
        # output is fed through a linear layer and a tanh function
        #
        # Shape of (batch_size, hidden_size)
        output: torch.FloatTensor = self.linear(bert_output.pooler_output)
        # Align the target with the (batch_size, 1) prediction so the loss is
        # computed element-wise rather than being broadcast
        target = expected_rating.to(device=output.device, dtype=output.dtype).reshape(output.shape)
        loss = self.loss_fn(output, target)
        return {
            "norm_rating": output,
            "loss": loss,
        }
    

def bucketize_norm_ratings(norm_ratings: torch.FloatTensor, max_score: int) -> torch.Tensor:
    """
    Convert normalized ratings (0 to 1) into whole-number ratings (0 to max_score).

    norm_ratings: tensor of normalized ratings.
    max_score: highest possible rating; a number or a tensor that broadcasts
        against ``norm_ratings``.

    Returns a floating point tensor holding whole numbers: each rating is
    scaled by ``max_score`` and rounded down.
    """
    # Scale-then-floor instead of floor-dividing by (1 / max_score): the
    # reciprocal is usually not exactly representable, which made the floor
    # division land one bucket too low (e.g. 1.0 with max_score=10 gave 9).
    return torch.floor(norm_ratings * max_score)


class TrainingConfig:
    """
    Bundles a tokenizer, a model, its optimizer and the stopping criteria
    used by the training loops.
    """
    class StopMode(enum.Enum):
        """How a training loop decides when to stop."""
        # NOTE: no trailing commas here -- `EPOCH = 0,` would make the value the tuple (0,)
        # Stop after a fixed number of epochs
        EPOCH = 0
        # Stop once the average change in validation loss falls to stop_delta_loss or below
        DELTA_LOSS = 1

    def __init__(self, tokenizer: "BertTokenizerFast", model: "BertForMaskedLM", mode: StopMode = StopMode.EPOCH, epochs: int = 20, stop_delta_loss: float = 0.01, epoch_window: int = 5, name: str = "model"):
        """
        tokenizer: tokenizer paired with ``model``.
        model: model to train; its parameters are handed to the optimizer.
        mode: stopping criterion (see ``StopMode``).
        epochs: number of epochs to run in ``StopMode.EPOCH``.
        stop_delta_loss: threshold used in ``StopMode.DELTA_LOSS``.
        epoch_window: number of epochs the delta loss is averaged over.
        name: label for this configuration.
        """
        self.tokenizer = tokenizer
        self.model = model
        self.reset_optim()
        self.epochs = epochs
        self.name = name
        self.mode = mode
        # We stop when the average delta loss <= stop_delta_loss
        # This average is taken over a window of size epoch_window
        self.stop_delta_loss = stop_delta_loss
        # Number of epochs from now to the past used to calculate the average delta loss
        self.epoch_window = epoch_window
        # The final classifier model, after finetuning (None until finetuning has run)
        self.finetuned_classifier_model: "BERTSentimentClassifier | None" = None

    def reset_optim(self):
        """Replace ``self.optim`` with a fresh AdamW optimizer over the model's parameters."""
        self.optim = torch.optim.AdamW(self.model.parameters(), lr=1e-5)

    def __str__(self):
        # Dump every attribute; handy when displaying the config in a notebook
        return str(self.__dict__)

    def __repr__(self):
        return str(self)


def build_training_config(pretrained_model: bool = False, pretrained_model_name: str = "bert-base-uncased", mode: TrainingConfig.StopMode = TrainingConfig.StopMode.EPOCH, epochs: int = 20, stop_delta_loss: float = 0.01, epoch_window: int = 5, name: str = "model") -> TrainingConfig:
    """
    Create a tokenizer and a masked-language model and wrap them in a TrainingConfig.

    pretrained_model: when True, load the weights of ``pretrained_model_name``;
        otherwise build a small, randomly initialized BERT.
    pretrained_model_name: checkpoint name used for the tokenizer (always) and
        for the model (only when ``pretrained_model`` is True).
    mode, epochs, stop_delta_loss, epoch_window, name: passed straight through
        to ``TrainingConfig``.
    """
    # NOTE: The tokenizer is always the pretrained one; re-training a tokenizer
    # is out of the scope of this experiment
    tokenizer: BertTokenizerFast = BertTokenizerFast.from_pretrained(pretrained_model_name)

    model: BertForMaskedLM
    if not pretrained_model:
        # Fresh, untrained model whose vocabulary matches the tokenizer
        scratch_config = BertConfig(
            vocab_size=tokenizer.vocab_size,
            max_position_embeddings=512,
            hidden_size=256,
            num_hidden_layers=4,
            num_attention_heads=4,
            type_vocab_size=2,
        )
        model = BertForMaskedLM(scratch_config)
    else:
        # Already trained model matching the tokenizer
        model = BertForMaskedLM.from_pretrained(pretrained_model_name)

    # Move the model to the device we specified
    #   Ideally use CUDA (GPU) if available
    model.to(device)

    return TrainingConfig(
        tokenizer=tokenizer,
        model=model,
        mode=mode,
        epochs=epochs,
        stop_delta_loss=stop_delta_loss,
        epoch_window=epoch_window,
        name=name,
    )

# Show which device was selected so it is visible in the notebook output
print(f"Training on device: {device}")

Training on device: cuda


In [None]:
from transformers import BertTokenizerFast
import pandas as pd
import torch
from typing import TypedDict, cast


class TokenizedInputs(TypedDict):
    """Tensor fields of a tokenized batch of text, as used by the datasets in this file."""
    # Token ids of the text
    input_ids: torch.IntTensor
    # Presumably the segment ids returned by the tokenizer -- not read by the code in this file
    token_type_ids: torch.IntTensor
    # Passed to the models as `attention_mask`
    attention_mask: torch.IntTensor
    # Target token ids; get_masked_dataset fills this with a copy of the unmasked input_ids
    labels: torch.IntTensor


class MaskedTextDatasetItem(TokenizedInputs):
    """One row of a MaskedTextDataset: the tokenized fields plus the source text."""
    # The text the row was tokenized from (only set when the dataset was given the texts)
    original_text: str


class MaskedTextDataset(torch.utils.data.Dataset[MaskedTextDatasetItem]):
    """
    Dataset of masked text.

    Wraps the tokenized encodings (one row per sentence) and, optionally,
    the texts they were produced from.
    """
    def __init__(self, encodings: TokenizedInputs, original_text: list[str] = None):
        self.original_text = original_text
        self.encodings = encodings

    def __getitem__(self, index: int) -> MaskedTextDatasetItem:
        # Slice every encoding field down to the requested row (sentence)
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[index]
        # Attach the source text when the dataset has it
        if self.original_text:
            item["original_text"] = self.original_text[index]
        return item

    def __len__(self):
        # One row per tokenized sentence
        return len(self.encodings["input_ids"])

    def to(self, device: torch.device):
        """Move every tensor field of the encodings to ``device`` (in place)."""
        for key in self.encodings:
            value = self.encodings[key]
            if not isinstance(value, torch.Tensor):
                continue
            self.encodings[key] = value.to(device=device)


class ReviewsDatasetItem(TokenizedInputs):
    """One row of a ReviewsDataset: the tokenized fields plus the review's rating data."""
    # The review text the row was tokenized from (only set when the dataset was given the texts)
    original_text: str
    # rating divided by max_score
    norm_rating: float
    # The rating as stored in the dataset
    rating: int
    # The highest possible rating; the same value for every row of a dataset
    max_score: int


class ReviewsDataset(torch.utils.data.Dataset[ReviewsDatasetItem]):
    """
    Dataset of reviews and their normalized ratings (decimal number from 0 to 1).

    Each row carries the tokenized review, the stored rating, the maximum
    score and the rating divided by that maximum.
    """
    def __init__(self, encodings: TokenizedInputs, ratings: list[int], max_score: int, original_text: list[str] = None):
        self.original_text = original_text
        self.max_score = max_score
        self.ratings = ratings
        self.encodings = encodings

    def __getitem__(self, index: int) -> ReviewsDatasetItem:
        # Slice every encoding field down to the requested row (sentence)
        item = {}
        for field, values in self.encodings.items():
            item[field] = values[index]
        # Attach the source text when the dataset has it
        if self.original_text:
            item["original_text"] = self.original_text[index]
        rating = self.ratings[index]
        item["norm_rating"] = float(rating) / self.max_score
        item["rating"] = rating
        item["max_score"] = self.max_score
        return item

    def __len__(self):
        # One row per tokenized review
        return len(self.encodings["input_ids"])

    def to(self, device: torch.device):
        """Move every tensor field of the encodings to ``device`` (in place)."""
        for key in self.encodings:
            value = self.encodings[key]
            if not isinstance(value, torch.Tensor):
                continue
            self.encodings[key] = value.to(device=device)


def read_review_texts(data_file: str = "myanimelist_reviews.csv") -> list[str]:
    """
    Read the "review" column of a CSV file in DATA_DIR.

    data_file: file name inside DATA_DIR.

    Returns every value of the column converted to ``str``.
    """
    csv_path = f"{DATA_DIR}/{data_file}"
    reviews = pd.read_csv(csv_path)["review"]
    return reviews.astype(str).to_list()


def get_masked_dataset(tokenizer: BertTokenizerFast, review_texts: list[str], percent_masked: float = 0.15, dataset_file: str = "") -> MaskedTextDataset:
    """
    Build (or load) a masked-language-modelling dataset from review texts.

    tokenizer: tokenizer used to encode the texts; its special-token ids decide
        which tokens may be masked and which id is used as the mask.
    review_texts: texts to tokenize.
    percent_masked: probability with which each maskable token is replaced by
        the mask token.
    dataset_file: optional file name inside DATA_DIR. When given, an existing
        file is loaded instead of rebuilding; otherwise the newly built
        dataset is saved there.

    Returns a MaskedTextDataset whose "labels" hold the unmasked token ids and
    whose "input_ids" hold the masked ones.
    """
    print("get_masked_dataset:")
    if dataset_file:
        print(f"    Loading existing dataset file @ {dataset_file}...")
        # Check if dataset file exists — if so, then load from file
        try:
            # Disable weights_only since we are loading arbitrary python classes
            # NOTE(security): this unpickles the file, which can execute arbitrary
            # code -- only load dataset files that were produced locally.
            masked_text_dataset: MaskedTextDataset = torch.load(f"{DATA_DIR}/{dataset_file}", weights_only=False)
            return masked_text_dataset
        except Exception as e:
            # Best-effort cache: a missing or unreadable file just means we rebuild below
            print(f"    Error loading dataset file: {e}")
    print(f"    Creating new dataset file @ {dataset_file}...")
    encodings: TokenizedInputs = tokenizer(review_texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    input_ids = encodings["input_ids"]
    # Create a new field labels that is a clone of input_ids (taken before masking)
    encodings["labels"] = input_ids.detach().clone()
    # In BERT paper, each token has 15% chance of being masked
    # First, create random vector that spans all of the input_ids (spans all the tokens)
    rand = torch.rand(input_ids.shape)
    # (rand < percent_masked) -> any token whose random value falls below the threshold is masked
    mask_arr = rand < percent_masked
    # Never mask the [CLS] / [SEP] / padding tokens. The ids come from the tokenizer
    # instead of being hard-coded, so this also works for vocabularies that number
    # their special tokens differently from bert-base-uncased (101 / 102 / 0).
    for special_id in (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id):
        if special_id is not None:
            mask_arr &= input_ids != special_id
    # Boolean-mask assignment replaces every selected token in one step; this
    # modifies encodings["input_ids"] in place
    input_ids[mask_arr] = tokenizer.mask_token_id
    masked_text_dataset = MaskedTextDataset(encodings, review_texts)
    if dataset_file:
        torch.save(masked_text_dataset, f"{DATA_DIR}/{dataset_file}")
    return masked_text_dataset


def get_reviews_dataset(tokenizer: BertTokenizerFast, norm_ratings: list[float], review_texts: list[str], dataset_file: str = "", max_score: int = 1) -> ReviewsDataset:
    """
    Build (or load) a dataset of tokenized reviews and their ratings.

    tokenizer: tokenizer used to encode the texts.
    norm_ratings: one rating per review. With the default ``max_score`` of 1
        these are taken to be already normalized (0 to 1).
    review_texts: the review texts, in the same order as ``norm_ratings``.
    dataset_file: optional file name inside DATA_DIR. When given, an existing
        file is loaded instead of rebuilding; otherwise the newly built
        dataset is saved there.
    max_score: value the ratings are divided by to normalize them. Pass the
        real maximum together with raw ratings to keep whole-number ratings
        available on each item.

    Raises ValueError when the number of ratings and texts differ.
    """
    print("get_reviews_dataset:")
    if dataset_file:
        print(f"    Loading existing dataset file @ {dataset_file}...")
        # Check if dataset file exists — if so, then load from file
        try:
            # Disable weights_only since we are loading arbitrary python classes
            # NOTE(security): this unpickles the file, which can execute arbitrary
            # code -- only load dataset files that were produced locally.
            reviews_dataset: ReviewsDataset = torch.load(f"{DATA_DIR}/{dataset_file}", weights_only=False)
            return reviews_dataset
        except Exception as e:
            # Best-effort cache: a missing or unreadable file just means we rebuild below
            print(f"    Error loading dataset file: {e}")
    if len(norm_ratings) != len(review_texts):
        raise ValueError(f"Expected one rating per review, got {len(norm_ratings)} ratings for {len(review_texts)} reviews")
    encodings: TokenizedInputs = tokenizer(review_texts, return_tensors="pt", max_length=512, truncation=True, padding="max_length")
    # ReviewsDataset takes (encodings, ratings, max_score, original_text). The texts
    # were previously passed in the max_score slot, which broke item access.
    reviews_dataset = ReviewsDataset(encodings, norm_ratings, max_score, original_text=review_texts)
    print(f"    Creating new dataset file @ {dataset_file}...")
    if dataset_file:
        torch.save(reviews_dataset, f"{DATA_DIR}/{dataset_file}")
    return reviews_dataset

In [None]:
from IPython.display import clear_output
from tqdm import tqdm
from transformers.modeling_outputs import MaskedLMOutput
from typing import cast, Iterator
import matplotlib.pyplot as plt
import itertools
from collections import deque
from typing import TypeVar, Generic
import torch.utils.data as tdata
import sklearn.model_selection as skms
import math
import copy
from typing import Callable
import numpy as np


# The name given to TypeVar must match the variable it is assigned to
TDataset = TypeVar('TDataset', bound=tdata.Dataset)
class TrainDatasetSplit(Generic[TDataset]):
    """Holds the train / test / validation subsets of one dataset."""
    def __init__(self, train: tdata.Subset[TDataset], test: tdata.Subset[TDataset], valid: tdata.Subset[TDataset]):
        self.train = train
        self.test = test
        self.valid = valid

    def get_dataloaders(self, batch_size: int = None, shuffle: bool = None, drop_last: bool = False) -> tuple[tdata.DataLoader, tdata.DataLoader, tdata.DataLoader]:
        """
        Wrap each subset in a DataLoader.

        batch_size, shuffle, drop_last: forwarded unchanged to every DataLoader,
            so the same settings apply to all three subsets.

        Returns the loaders in the order (train, test, valid).
        """
        train_loader, test_loader, valid_loader = (
            tdata.DataLoader(subset, shuffle=shuffle, batch_size=batch_size, drop_last=drop_last)
            for subset in (self.train, self.test, self.valid)
        )
        return (train_loader, test_loader, valid_loader)


def train_split_dataset(dataset: TDataset, train_percent: float = 0.7, test_percent: float = 0.2, validate_percent: float = 0.1, random_state: int = None) -> TrainDatasetSplit[TDataset]:
    """
    Randomly split a dataset into train, test and validation subsets.

    train_percent, test_percent, validate_percent: fractions of the whole
        dataset; they must add up to 1.0.
    random_state: forwarded to both underlying splits.
    """
    assert math.isclose(train_percent + test_percent + validate_percent, 1.0), "Expected train_percent + test_percent + validate_percent = 1.0!"
    all_indices = list(range(len(dataset)))
    # First carve the validation share off the whole dataset
    remaining_indices, valid_indices = skms.train_test_split(all_indices, test_size=validate_percent, random_state=random_state)
    # test_percent is a fraction of the whole dataset, so rescale it to a
    # fraction of what is left after the validation rows were removed
    relative_test_size = test_percent/(train_percent + test_percent)
    train_indices, test_indices = skms.train_test_split(remaining_indices, test_size=relative_test_size, random_state=random_state)
    return TrainDatasetSplit(
        train=tdata.Subset(dataset, train_indices),
        test=tdata.Subset(dataset, test_indices),
        valid=tdata.Subset(dataset, valid_indices),
    )


def get_newest_model_file(prefix: str) -> str:
    """
    Find the highest consecutively numbered "<prefix><n>.pt" file.

    Numbering is probed from 0 upwards and stops at the first missing file.
    Returns that file's path, or "" when "<prefix>0.pt" does not exist.
    """
    newest = ""
    idx = 0
    while os.path.exists(candidate := f"{prefix}{idx}.pt"):
        newest = candidate
        idx += 1
    return newest


def pretraining_loop(train_dataloader: tdata.DataLoader[MaskedTextDatasetItem], valid_dataloader: tdata.DataLoader[MaskedTextDatasetItem], config: TrainingConfig):
    """
    Pretrain ``config.model`` on masked text, checkpointing after every epoch.

    train_dataloader: batches used for the optimization steps.
    valid_dataloader: batches used to measure the validation loss.
    config: supplies the model, the optimizer, the stop mode and its settings,
        and the name of the checkpoint folder inside MODEL_DIR.

    If checkpoints already exist for ``config.name``, training resumes after
    the newest one. In ``StopMode.EPOCH`` the loop runs up to ``config.epochs``;
    in ``StopMode.DELTA_LOSS`` it runs until the validation loss, averaged over
    the last ``config.epoch_window`` epochs, improves by ``config.stop_delta_loss``
    or less per epoch.
    """
    config.reset_optim()
    model, optim = config.model, config.optim
    log_data: list[dict] = []
    init_epoch = 0
    # Sliding window over the most recent epoch-to-epoch changes in validation
    # loss; maxlen drops the oldest entry automatically
    delta_loss_queue: deque[float] = deque(maxlen=config.epoch_window)
    delta_loss_moving_avg: float = 0
    prev_average_valid_loss = None

    model_dir = f"{MODEL_DIR}/{config.name}/"
    os.makedirs(model_dir, exist_ok=True)

    existing_model_file = get_newest_model_file(model_dir + "model_pretrain_")
    if existing_model_file:
        # If model file exist, then try to continue off of where it left off
        # weights_only=False because the checkpoint also carries optimizer state and
        # the python log data.
        # NOTE(security): this unpickles the file -- only load locally written checkpoints.
        data = torch.load(existing_model_file, weights_only=False)
        model_state, optim_state = data["model"], data["optim"]
        # Older checkpoints stored the bound `state_dict` methods instead of their
        # result; calling them recovers the actual state
        if callable(model_state):
            model_state = model_state()
        if callable(optim_state):
            optim_state = optim_state()
        model.load_state_dict(model_state)
        optim.load_state_dict(optim_state)
        log_data = data["log_data"]
        # A checkpoint is written after its epoch finished, so resume with the next one
        last_epoch = data["epoch"] if "epoch" in data else log_data[-1]["epoch"]
        init_epoch = last_epoch + 1
        # Rebuild the delta-loss window from the logged validation losses so the
        # stopping criterion continues where it left off
        valid_losses = [x["valid_loss"] for x in log_data]
        delta_loss_queue.extend(curr - prev for prev, curr in zip(valid_losses, valid_losses[1:]))
        delta_loss_moving_avg = sum(delta_loss_queue) / config.epoch_window
        prev_average_valid_loss = valid_losses[-1]
        print(f"Resuming existing model at epoch: {init_epoch}, valid loss: {valid_losses[-1]}, delta_loss: {-delta_loss_moving_avg} <= {config.stop_delta_loss}")
        if config.mode == TrainingConfig.StopMode.DELTA_LOSS and len(delta_loss_queue) >= config.epoch_window and -delta_loss_moving_avg <= config.stop_delta_loss:
            # We hit the cutoff, return early
            return
        # Otherwise, continue training

    print(f"Pretraining start for '{config.name}'...")
    if config.mode == TrainingConfig.StopMode.DELTA_LOSS:
        # Open-ended; the delta-loss check at the bottom of the loop breaks out
        epoch_loop = itertools.count(start=init_epoch, step=1)
    else:
        epoch_loop = range(init_epoch, config.epochs)
    for epoch in epoch_loop:
        # Training loop
        loop = tqdm(cast(Iterator[MaskedTextDatasetItem], train_dataloader), leave=True)
        total_train_loss: float = 0
        model.train()
        for data in loop:
            # Reset gradient
            optim.zero_grad()

            outputs: MaskedLMOutput = model(data["input_ids"], attention_mask=data["attention_mask"], labels=data["labels"])
            loss = outputs.loss

            # Apply backward propagation
            loss.backward()
            optim.step()

            # Set info in tqdm progress bar
            loop.set_description(f"Epoch: {epoch}")
            loop.set_postfix(loss=loss.item())
            total_train_loss += loss.item()
        average_train_loss = total_train_loss / len(train_dataloader)

        print(f"Average training loss: {average_train_loss}")

        # Validation loop
        loop = tqdm(cast(Iterator[MaskedTextDatasetItem], valid_dataloader), leave=True)
        total_valid_loss: float = 0
        model.eval()
        with torch.no_grad():
            for data in loop:
                outputs: MaskedLMOutput = model(data["input_ids"], attention_mask=data["attention_mask"], labels=data["labels"])
                loss = outputs.loss

                # Set info in tqdm progress bar
                loop.set_postfix(loss=loss.item())
                total_valid_loss += loss.item()
        average_valid_loss = total_valid_loss / len(valid_dataloader)

        print(f"Average validation loss: {average_valid_loss}")

        # Update moving average of validation loss
        if prev_average_valid_loss is not None:
            delta_loss_queue.append(average_valid_loss - prev_average_valid_loss)
            # Always divided by the full window size; the value is only acted on
            # once the window is full, at which point it is the true average
            delta_loss_moving_avg = sum(delta_loss_queue) / config.epoch_window
        prev_average_valid_loss = average_valid_loss

        # Make plot
        log_data.append({
            "epoch": epoch,
            "train_loss": average_train_loss,
            "valid_loss": average_valid_loss
        })
        epochs = [x["epoch"] for x in log_data]
        for key in log_data[0]:
            if key.startswith("_") or key == "epoch":
                continue
            plt.plot(epochs, [item[key] for item in log_data], label=key)
        plt.xlabel("epoch")
        plt.legend()
        clear_output(wait=True)
        plt.show()
        # state_dict must be *called*; "epoch" is stored so a later run can resume
        torch.save({
            "model": model.state_dict(),
            "optim": optim.state_dict(),
            "epoch": epoch,
            "log_data": log_data,
        }, model_dir + f"model_pretrain_{epoch}.pt")

        # Break if our moving validation delta loss average is smaller than our stop_delta_loss
        if config.mode == TrainingConfig.StopMode.DELTA_LOSS:
            print(f"Average delta loss ({config.epoch_window} epoch window): {-delta_loss_moving_avg} <= {config.stop_delta_loss}")
            if len(delta_loss_queue) >= config.epoch_window:
                # Only start trying to exit once delta_loss_queue is filled, such that we have enough datapoints to
                # calculate an average using a full window of epochs
                #
                # Delta loss is negative if we are decreasing -> this is what we want
                if -delta_loss_moving_avg <= config.stop_delta_loss:
                    break


def finetuning_loop(train_dataloader: tdata.DataLoader[ReviewsDatasetItem], valid_dataloader: tdata.DataLoader[ReviewsDatasetItem], config: TrainingConfig):
    """
    Finetune a rating classifier built on a copy of ``config.model.bert``.

    train_dataloader: batches of review items used for the optimization steps.
    valid_dataloader: batches of review items used to measure the validation
        loss and accuracy.
    config: supplies the pretrained model, the stop mode and its settings, and
        the name of the checkpoint folder inside MODEL_DIR.

    A checkpoint is written after every epoch, and training resumes after the
    newest existing one. The finetuned classifier is stored on
    ``config.finetuned_classifier_model``; ``config.model`` itself is left
    untouched because the encoder is deep-copied.
    """
    bert_copy = copy.deepcopy(config.model.bert)
    model = BERTSentimentClassifier(bert_copy)
    # The freshly created head lives on the CPU; move the whole classifier to
    # wherever the copied encoder already is
    model_device = next(bert_copy.parameters()).device
    model.to(model_device)
    # The optimizer must own the classifier's parameters. config.optim is bound to
    # config.model, whose parameters never receive gradients in this loop.
    # (Same optimizer and learning rate as TrainingConfig.reset_optim.)
    optim = torch.optim.AdamW(model.parameters(), lr=1e-5)
    init_epoch = 0
    log_data: list[dict] = []
    # Sliding window over the most recent epoch-to-epoch changes in validation
    # loss; maxlen drops the oldest entry automatically
    delta_loss_queue: deque[float] = deque(maxlen=config.epoch_window)
    delta_loss_moving_avg: float = 0
    prev_average_valid_loss = None

    def run_batch(batch) -> dict:
        # The classifier takes the target rating first, followed by the arguments
        # for the wrapped BERT model. The target is shaped (batch, 1) to match the
        # classifier's output.
        expected = batch["norm_rating"].to(device=model_device, dtype=torch.float32).reshape(-1, 1)
        return model(expected, batch["input_ids"].to(model_device), attention_mask=batch["attention_mask"].to(model_device))

    model_dir = f"{MODEL_DIR}/{config.name}/"
    os.makedirs(model_dir, exist_ok=True)

    existing_model_file = get_newest_model_file(model_dir + "model_finetune_")
    if existing_model_file:
        # If model file exist, then try to continue off of where it left off
        # weights_only=False because the checkpoint also carries optimizer state and
        # the python log data.
        # NOTE(security): this unpickles the file -- only load locally written checkpoints.
        data = torch.load(existing_model_file, weights_only=False)
        model_state, optim_state = data["model"], data["optim"]
        # Older checkpoints stored the bound `state_dict` methods instead of their
        # result; calling them recovers the actual state
        if callable(model_state):
            model_state = model_state()
        if callable(optim_state):
            optim_state = optim_state()
        model.load_state_dict(model_state)
        optim.load_state_dict(optim_state)
        log_data = data["log_data"]
        # A checkpoint is written after its epoch finished, so resume with the next one
        last_epoch = data["epoch"] if "epoch" in data else log_data[-1]["epoch"]
        init_epoch = last_epoch + 1
        # Rebuild the delta-loss window from the logged validation losses so the
        # stopping criterion continues where it left off
        valid_losses = [x["valid_loss"] for x in log_data]
        delta_loss_queue.extend(curr - prev for prev, curr in zip(valid_losses, valid_losses[1:]))
        delta_loss_moving_avg = sum(delta_loss_queue) / config.epoch_window
        prev_average_valid_loss = valid_losses[-1]
        print(f"Resuming existing model at epoch: {init_epoch}, valid loss: {valid_losses[-1]}, delta_loss: {-delta_loss_moving_avg} <= {config.stop_delta_loss}")
        if config.mode == TrainingConfig.StopMode.DELTA_LOSS and len(delta_loss_queue) >= config.epoch_window and -delta_loss_moving_avg <= config.stop_delta_loss:
            # We hit the cutoff, return early (the restored model is the result)
            config.finetuned_classifier_model = model
            return
        # Otherwise, continue training

    print(f"Finetuning start for '{config.name}'...")
    if config.mode == TrainingConfig.StopMode.DELTA_LOSS:
        # Open-ended; the delta-loss check at the bottom of the loop breaks out
        epoch_loop = itertools.count(start=init_epoch, step=1)
    else:
        epoch_loop = range(init_epoch, config.epochs)
    for epoch in epoch_loop:
        # Training loop
        loop = tqdm(cast(Iterator[ReviewsDatasetItem], train_dataloader), leave=True)
        model.train()
        total_train_loss: float = 0
        for data in loop:
            # Reset gradient
            optim.zero_grad()

            output = run_batch(data)
            loss = output["loss"]

            # Apply backward propagation
            loss.backward()
            optim.step()

            # Set info in tqdm progress bar
            loop.set_description(f"Epoch: {epoch}")
            loop.set_postfix(loss=loss.item())
            total_train_loss += loss.item()
        average_train_loss = total_train_loss / len(train_dataloader)

        print(f"Average training loss: {average_train_loss}")

        # Validation loop
        loop = tqdm(cast(Iterator[ReviewsDatasetItem], valid_dataloader), leave=True)
        total_valid_loss: float = 0
        total_correct: int = 0
        model.eval()
        with torch.no_grad():
            for data in loop:
                output = run_batch(data)
                # Flatten (batch, 1) -> (batch,) so it lines up with the per-item ratings
                norm_ratings = output["norm_rating"].reshape(-1)
                # max_score and the true rating come from the batch, not from the model output
                max_score = data["max_score"].to(model_device)
                ratings = data["rating"].to(model_device)
                pred_rating = bucketize_norm_ratings(norm_ratings, max_score)
                # A prediction is correct when its bucket equals the true rating.
                # Converted to a python int so the logged accuracy is a plain number.
                total_correct += int(torch.count_nonzero(pred_rating == ratings))

                loss = output["loss"]

                # Set info in tqdm progress bar
                loop.set_postfix(loss=loss.item())
                total_valid_loss += loss.item()
        average_valid_loss = total_valid_loss / len(valid_dataloader)
        accuracy = total_correct / len(valid_dataloader.dataset)

        print(f"Average validation loss: {average_valid_loss}, Accuracy: {accuracy:.4f}")

        # Update moving average of validation loss
        if prev_average_valid_loss is not None:
            delta_loss_queue.append(average_valid_loss - prev_average_valid_loss)
            # Always divided by the full window size; the value is only acted on
            # once the window is full, at which point it is the true average
            delta_loss_moving_avg = sum(delta_loss_queue) / config.epoch_window
        prev_average_valid_loss = average_valid_loss

        # Make plot
        log_data.append({
            "epoch": epoch,
            "train_loss": average_train_loss,
            "valid_loss": average_valid_loss,
            "accuracy": accuracy
        })
        epochs = [x["epoch"] for x in log_data]
        for key in log_data[0]:
            if key.startswith("_") or key == "epoch":
                continue
            plt.plot(epochs, [item[key] for item in log_data], label=key)
        plt.xlabel("epoch")
        plt.legend()
        clear_output(wait=True)
        plt.show()
        # state_dict must be *called*; "epoch" is stored so a later run can resume
        torch.save({
            "model": model.state_dict(),
            "optim": optim.state_dict(),
            "epoch": epoch,
            "log_data": log_data,
        }, model_dir + f"model_finetune_{epoch}.pt")

        # Break if our moving validation delta loss average is smaller than our stop_delta_loss
        if config.mode == TrainingConfig.StopMode.DELTA_LOSS:
            print(f"Average delta loss ({config.epoch_window} epoch window): {-delta_loss_moving_avg} <= {config.stop_delta_loss}")
            if len(delta_loss_queue) >= config.epoch_window:
                # Only start trying to exit once delta_loss_queue is filled, such that we have enough datapoints to
                # calculate an average using a full window of epochs
                #
                # Delta loss is negative if we are decreasing -> this is what we want
                if -delta_loss_moving_avg <= config.stop_delta_loss:
                    break

    # Store the result on the config itself; assigning it to config.model would
    # register the classifier as a sub-module of the pretrained model
    config.finetuned_classifier_model = model


def dataset_train_loop(dataset_name: str = "myanimelist") -> None:
    """
    Run pretraining for one review dataset.

    dataset_name: prefix of the dataset's files; reviews are read from
        "<dataset_name>_reviews.csv" and the masked dataset is cached as
        "<dataset_name>_masked_data.dt". It is also used as the config name.

    Only the pretraining loop is run here; nothing is returned.
    """
    review_texts = read_review_texts(f"{dataset_name}_reviews.csv")
    # `display` is not imported in this file -- presumably the notebook built-in
    display(review_texts[:10])
    
    # Stop on the validation delta loss (threshold 0.05 over a 5 epoch window)
    config = build_training_config(name=dataset_name, mode=TrainingConfig.StopMode.DELTA_LOSS, stop_delta_loss=0.05, epoch_window=5)
    display(config)

    masked_dataset = get_masked_dataset(config.tokenizer, review_texts, dataset_file=f"{dataset_name}_masked_data.dt")
    # Move the dataset's tensors to the training device once, up front
    masked_dataset.to(device)

    split = train_split_dataset(masked_dataset, train_percent=0.7, test_percent=0.2, validate_percent=0.1)
    train_dataloader, test_dataloader, valid_dataloader = split.get_dataloaders(batch_size=32, shuffle=True)
    print(f"train_dataloader: ({len(train_dataloader)} batches)\ntest_dataloader: ({len(test_dataloader)} batches)\nvalid_dataloader: ({len(valid_dataloader)} batches)")
    # Show one batch as a sanity check
    display(next(iter(train_dataloader)))

    # The test loader is only reported above; pretraining uses train + validation
    pretraining_loop(train_dataloader, valid_dataloader, config)
 

## MyAnimeList

Training BERT on MyAnimeList only.

In [None]:
# Build (or load) the masked dataset for "myanimelist" and run the pretraining loop on it
dataset_train_loop("myanimelist")

## Steam

Training BERT on Steam only.

In [None]:
# Build (or load) the masked dataset for "steam" and run the pretraining loop on it
dataset_train_loop("steam")

["Amazing game. Easily 30-40 hours of game play. I hope the dev continues to add more depth in the future! I'd love to see,- More zones, cities, villages etc- Additional starting scenarios (in debt, being a dealer for someone else, certain supplies/drugs not available)- Setups (undercover cops/narcs)- Raids/counter-raids (Police/cartel)- Thief's stealing supplies (if door left open, or employees turning against you)- Mixing drugs with other drugs (weed w/ cocaine)- Turf wars w/ opponent drug-lord organisation charts that you can war with, slowly discover their hierarchy and order hits or do them yourself- High volume deals (shipping supplies off to other regions)",
 'day time comes: pickpocket the whole city night time comes: go gambling with the boys (drugs sold 0) 10/10 game',
 'Made a type of weed that the game called Tokyo Shart that gave the smoker pitch black skin and makes them bald.',
 "A woman came up to me and asked for weed, didn't like the price I offered, proceeded to stab

## Metacritic

Training BERT on Metacritic only.

In [None]:
# Build (or load) the masked dataset for "metacritic" and run the pretraining loop on it
dataset_train_loop("metacritic")

['Boring game that was soulless even in the beginning. Its contemporaries Roadblocks (why is this censored lol) and Blockland do everything better. Easily the most overrated game of all time.',
 'the best, the only, the unique game in its genre.\nThis game gave me my childhood, THIS IS THE BEST GAME',
 'Not my cup of tea due to so many choices. Still pretty great. Memorable style of world. Diverse & expansive gameplay. Major impact on the industry.',
 'Film He was Very average More I gave Credit By the References And also For the Nostalgia The jokes They are not Funny One Except Put Some well Specific More on al He was Good More or less',
 'Minecraft é muito criativo, dá muita liberdade para os jogadores, possui um vasto lore próprio, muitos detalhes, investimento e desenvolvimento contínuo ao longo de anos após o lançamento, meus amigo sempre gostaram, sempre foi muito popular no youtube e na twitch...\n\nE mesmo assim eu nunca gostei tanto. Talvez pelo gênero de sobrevivência que não

## Rotten Tomatoes

Training BERT on Rotten Tomatoes only.

In [None]:
# Build (or load) the masked dataset for "rotten_tomatoes" and run the pretraining loop on it
dataset_train_loop("rotten_tomatoes")

['Una obra de arte dirigida por Denis Villeneuve. Esta película es una de las mejores que he visto en este género. Sin necesidad de leer los libros, comprenderás a la perfección todo el contexto. El director busca deleitarnos con sus tomas cinematográficas y su espectacular sonido ambiental. Una obra de arte digna de un Oscar. El único problema que le veo, y por el que no le doy un 10, es el comienzo. Puede ser muy tedioso, pero se entiende perfectamente por qué.\n\nConclusión: Esta es una película que definitivamente tienes que ver en algún momento. El comienzo puede ser lento porque te da mucho contexto. No pierdas la oportunidad de verla.\n\nCalificación: 8.5/10',
 'Took me a while to get into it, but as someone who has not read the Dune books, it truly feels like a book. I can see the influence and see how the book must have been like, I definitely plan to read the books at some point. Some of the decisions are a bit sub-par though. They should 100% have had a different cast. Yes, 