For each batch, we need the proportion of pseudo labels it contains so that the loss can be adjusted accordingly.

In [15]:
from __future__ import annotations

import numpy as np
import torch

from typing import Callable, Tuple
from torch import Tensor

from typing import Iterator


class IncDataLoader:
    """
    Data loader that gradually increases the amount of pseudo data in the training data.

    Every call returns all labeled samples followed by a random subset of the
    pseudo-labeled samples. The size of that subset is given by a ratio function
    of the current step (advanced with ``update``); the default schedule is the
    linear ``step / n_epochs``.
    """
    # Dataset built by the most recent call; not set until the loader has been called once.
    current_data: Tensor
    current_labels: Tensor

    def __init__(self, labeled_data: Tensor, labels: Tensor, pseudo_data: Tensor, pseudo_labels: Tensor, n_epochs: int) -> None:
        """
        Store the labeled and pseudo-labeled pools and start at step 0.

        Args:
            labeled_data: Labeled samples, always returned in full.
            labels: Labels belonging to ``labeled_data``.
            pseudo_data: Pseudo-labeled samples from which a subset is drawn.
            pseudo_labels: Labels belonging to ``pseudo_data``.
            n_epochs: Value made available to the ratio function; the default
                schedule divides the step by it.
        """
        self.lab_data = labeled_data
        self.labels = labels
        self.ps_data = pseudo_data
        self.ps_labels = pseudo_labels
        self.n_epochs = n_epochs

        # Default schedule: linear growth that reaches the full pseudo pool at step == n_epochs.
        self.ratio_func = lambda step, loader: step / loader.n_epochs
        self.step = 0

    def __call__(self) -> Tuple[Tensor, Tensor]:
        """ Returns labeled data/labels enriched with a number of samples from the pseudo data/labels. """

        # number of pseudo samples to use (already limited to the size of the pseudo pool)
        n_pseudo = self.get_pseudo_number()

        # indexes of those random samples; a new permutation is drawn on every call
        pseudo_idx = torch.randperm(len(self.ps_data))[:n_pseudo]

        # subset of pseudo data and their labels
        ps_sub_data, ps_sub_labels = self.ps_data[pseudo_idx], self.ps_labels[pseudo_idx]

        # concatenate labeled data and the subset of pseudo data
        self.current_data = torch.cat((self.lab_data, ps_sub_data), 0)
        self.current_labels = torch.cat((self.labels, ps_sub_labels))

        return self.current_data, self.current_labels

    def get_pseudo_ratio(self) -> float:
        """ Returns the raw value of the ratio function for the current step. """
        return self.ratio_func(self.step, self)

    def get_pseudo_number(self) -> int:
        """
        Returns the number of pseudo samples used at the current step.

        The value is limited to ``[0, len(pseudo_data)]`` so that a ratio function
        returning less than 0 or more than 1 can never request a negative count
        or more samples than the pseudo pool holds.
        """
        n_available = len(self.ps_data)
        n_requested = round(n_available * self.get_pseudo_ratio())
        return max(0, min(n_requested, n_available))

    def get_total_pseudo_ratio(self) -> float:
        """
        Returns the proportion of pseudo samples in the dataset for the current step.

        Computed from the sample counts rather than from the last returned tensors,
        so it can be queried before the loader has been called. Returns 0.0 when
        the dataset would be empty.
        """
        n_pseudo = self.get_pseudo_number()
        n_total = len(self.lab_data) + n_pseudo
        if n_total == 0:
            return 0.0
        return n_pseudo / n_total

    def set_ratio_func(self, func: Callable[[int, "IncDataLoader"], float]) -> None:
        """
        Set a function of time that will be used to calculate the proportion of pseudo data.

        ``func`` is called as ``func(step, loader)`` and should return the fraction
        of the pseudo pool to include.
        """
        self.ratio_func = func

    def update(self) -> None:
        """ Call this after each epoch to update the amount of pseudo data. """
        self.step += 1

In [47]:
# generate 5 encoded labeled sentences, each sentence consisting of 5 tokens (values 0-4)
labeled_data = torch.randint(low=0, high=5, size=(5, 5))
labels = torch.randint(low=0, high=5, size=(5,))

# generate 5 encoded pseudo sentences (values 6-9, so they never overlap with the labeled values)
pseudo_data = torch.randint(low=6, high=10, size=(5, 5))
pseudo_labels = torch.randint(low=6, high=10, size=(5,))

In [48]:
# Sanity check of the labeled data: print its shape, then end the cell with the
# bare tensor so the notebook displays its contents.
print(labeled_data.shape)
labeled_data

torch.Size([5, 5])


tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0]])

In [52]:
# Demo of the default (linear) schedule: build a loader over the tensors created
# above and step it through 11 epochs (steps 0 to 10).
data_loader = IncDataLoader(
    labeled_data=labeled_data,
    labels=labels,
    pseudo_data=pseudo_data,
    pseudo_labels=pseudo_labels,
    n_epochs=10  # with the default linear function we should see 10% increase of pseudo data each epoch
)

for epoch in range(11):
    # Build the dataset for the current step: all labeled rows plus a random pseudo subset.
    data, labs = data_loader()
    print("Epoch", epoch)
    print("No. pseudo samples:", data_loader.get_pseudo_number())
    print("theoretical ratio of pseudo data included:", data_loader.get_pseudo_ratio())
    # NOTE(review): the label says "%", but the value printed here is a fraction
    # (e.g. 0.1666...), not a percentage.
    print("% of pseudo data in the dataset:", data_loader.get_total_pseudo_ratio())
    print("Dataset size:", len(data))
    print(data)
    print()
    # Advance the step so the next iteration is computed with the next ratio.
    data_loader.update()

Epoch 0
No. pseudo samples: 0
theoretical ratio of pseudo data included: 0.0
% of pseudo data in the dataset: 0.0
Dataset size: 5
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0]])

Epoch 1
No. pseudo samples: 0
theoretical ratio of pseudo data included: 0.1
% of pseudo data in the dataset: 0.0
Dataset size: 5
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0]])

Epoch 2
No. pseudo samples: 1
theoretical ratio of pseudo data included: 0.2
% of pseudo data in the dataset: 0.16666666666666666
Dataset size: 6
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0],
        [8, 9, 9, 7, 9]])

Epoch 3
No. pseudo samples: 2
theoretical ratio of pseudo data included: 0.3
% of pseudo data in the dataset: 0.2857142857142857
Dataset size: 7
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 

In [63]:
# Second loader over the same tensors; it is created with the default schedule,
# which is replaced by a custom ratio function further down in this cell.
data_loader_v2 = IncDataLoader(
    labeled_data=labeled_data,
    labels=labels,
    pseudo_data=pseudo_data,
    pseudo_labels=pseudo_labels,
    n_epochs=10
)

def exponential(step, loader):
    """
    Exponential growth of the pseudo ratio, capped at 1.

    Returns 0 at step 0. From step 1 onward the ratio starts at
    ``1 / loader.n_epochs`` and doubles with every step. The result is capped
    at 1.0, because a ratio above 1 would ask for more pseudo samples than exist.
    """
    if step == 0:
        return 0
    return min(1.0, 2 ** (step - 1) / loader.n_epochs)

# Replace the default schedule with the custom ratio function defined above.
data_loader_v2.set_ratio_func(exponential)

for epoch in range(11):
    # Build the dataset for the current step: all labeled rows plus a random pseudo subset.
    data, labs = data_loader_v2()
    print("Epoch", epoch)
    print("No. pseudo samples:", data_loader_v2.get_pseudo_number())
    # Both ratios are multiplied by 100 so they are printed as percentages.
    print("Theoretical % of pseudo data utilized:", data_loader_v2.get_pseudo_ratio() * 100)
    print("% of pseudo data in the dataset:", data_loader_v2.get_total_pseudo_ratio() * 100)
    print("Dataset size:", len(data))
    print(data)
    print()
    # Advance the step so the next iteration is computed with the next ratio.
    data_loader_v2.update()

Epoch 0
No. pseudo samples: 0
Theoretical % of pseudo data utilized: 0
% of pseudo data in the dataset: 0.0
Dataset size: 5
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0]])

Epoch 1
No. pseudo samples: 0
Theoretical % of pseudo data utilized: 10.0
% of pseudo data in the dataset: 0.0
Dataset size: 5
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0]])

Epoch 2
No. pseudo samples: 1
Theoretical % of pseudo data utilized: 20.0
% of pseudo data in the dataset: 16.666666666666664
Dataset size: 6
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0, 4, 0, 0, 1],
        [4, 2, 4, 2, 3],
        [1, 3, 3, 1, 0],
        [9, 7, 7, 7, 9]])

Epoch 3
No. pseudo samples: 2
Theoretical % of pseudo data utilized: 40.0
% of pseudo data in the dataset: 28.57142857142857
Dataset size: 7
tensor([[3, 4, 1, 3, 3],
        [3, 0, 0, 3, 4],
        [0

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b2f14aee-af04-4db5-af55-57a3a58b9f40' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>