In [8]:
!pip install transformers
!pip install torchsummary
!pip install datasets



In [9]:
from google.colab import drive
drive.mount('/content/drive')

# Create a folder
!mkdir -p "/content/drive/My Drive/LLM-projects-models"

Mounted at /content/drive


In [10]:
from transformers import ElectraModel, ElectraConfig, ElectraForMaskedLM, ElectraTokenizer, PretrainedConfig, ElectraForTokenClassification
import numpy as np
from torchsummary import summary
import torch
from torch import nn
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
from tqdm import tqdm
import matplotlib.pyplot as plt

# Some tests, you can directly go to next part

In [11]:
config = ElectraConfig()
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')


discriminator = ElectraModel.from_pretrained('google/electra-small-discriminator')
inputs = tokenizer("hello my dog is cute", return_tensors="pt")
#print(inputs)
outputs = discriminator(**inputs, )
last_hidden_states = outputs.last_hidden_state
#print(last_hidden_states, last_hidden_states.shape)

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
labels = tokenizer("The capital of France is Paris.", return_tensors="pt")
print(inputs, labels)
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
outputs = model(**inputs, labels=labels["input_ids"])
loss = outputs.loss
logits = outputs.logits
print(torch.argmax(logits, dim = -1))
print(tokenizer.decode(torch.squeeze(torch.argmax(logits, dim = -1)).tolist()))

{'input_ids': tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])} {'input_ids': tensor([[ 101, 1996, 3007, 1997, 2605, 2003, 3000, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[1012, 1996, 3007, 1997, 2605, 2003, 3000, 1012, 1012]])
. the capital of france is paris..


In [12]:
print(tokenizer.mask_token_id)

103


In [13]:
print(discriminator)

ElectraModel(
  (embeddings): ElectraEmbeddings(
    (word_embeddings): Embedding(30522, 128, padding_idx=0)
    (position_embeddings): Embedding(512, 128)
    (token_type_embeddings): Embedding(2, 128)
    (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
  (encoder): ElectraEncoder(
    (layer): ModuleList(
      (0-11): 12 x ElectraLayer(
        (attention): ElectraAttention(
          (self): ElectraSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ElectraSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((

In [14]:
"""generator_config = {
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": True,
  "transformers_version": "4.36.2",
  "type_vocab_size": 2,
  "use_cache": True,
  "vocab_size": 30522
}

discriminator_config = {
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 1024,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "electra",
  "num_attention_heads": 4,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "summary_activation": "gelu",
  "summary_last_dropout": 0.1,
  "summary_type": "first",
  "summary_use_proj": True,
  "transformers_version": "4.36.2",
  "type_vocab_size": 2,
  "use_cache": True,
  "vocab_size": 30522
}"""

'generator_config = {\n  "attention_probs_dropout_prob": 0.1,\n  "embedding_size": 128,\n  "hidden_act": "gelu",\n  "hidden_dropout_prob": 0.1,\n  "hidden_size": 256,\n  "initializer_range": 0.02,\n  "intermediate_size": 1024,\n  "layer_norm_eps": 1e-12,\n  "max_position_embeddings": 512,\n  "model_type": "electra",\n  "num_attention_heads": 4,\n  "num_hidden_layers": 12,\n  "pad_token_id": 0,\n  "position_embedding_type": "absolute",\n  "summary_activation": "gelu",\n  "summary_last_dropout": 0.1,\n  "summary_type": "first",\n  "summary_use_proj": True,\n  "transformers_version": "4.36.2",\n  "type_vocab_size": 2,\n  "use_cache": True,\n  "vocab_size": 30522\n}\n\ndiscriminator_config = {\n  "attention_probs_dropout_prob": 0.1,\n  "embedding_size": 128,\n  "hidden_act": "gelu",\n  "hidden_dropout_prob": 0.1,\n  "hidden_size": 256,\n  "initializer_range": 0.02,\n  "intermediate_size": 1024,\n  "layer_norm_eps": 1e-12,\n  "max_position_embeddings": 512,\n  "model_type": "electra",\n  "n

# Load dataset

In [32]:
#Load IMDB dataset

#Input = masked data
#label = real data

#input = prediction of generator
#output = binary output

#We will masked 20% of the input

#103 is the encoding for [MASK]

def masking(x , mask_token = 103):
    k = int(len(x)*1/10)
    ids = random.sample(range(1, len(x)-1), k)
    for i in ids:
        x[i] = mask_token
    return x, ids


tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
dataset = load_dataset("scikit-learn/imdb", split="train")

def preprocessing_fn(x, tokenizer):
    generator_labels = tokenizer.encode(
        x["review"],
        add_special_tokens=False,
        truncation=True,
        max_length=128,
        padding=False,
        return_attention_mask=False,
    )
    x["input_ids"], masked_ids = masking(generator_labels, 103)

    attention_mask = torch.ones((len(x["input_ids"])))
    x["labels"] = generator_labels
    x["attention_mask"] = attention_mask
    x["masked_ids"] = masked_ids
    return x

print(len(dataset))

n_samples = 1000

# We first shuffle the data !
dataset = dataset.shuffle()

# Select n_samples
splitted_dataset = dataset.select(range(n_samples))

# Tokenize the dataset
splitted_dataset = splitted_dataset.map(
    preprocessing_fn, fn_kwargs={"tokenizer": tokenizer}
)


# Remove useless columns
splitted_dataset = splitted_dataset.select_columns(["input_ids", "attention_mask", "labels", "masked_ids"])

# Split the train and validation
splitted_dataset = splitted_dataset.train_test_split(test_size=0.2)

train_set = splitted_dataset["train"]
valid_set = splitted_dataset["test"]


50000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [33]:
class DataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        max_label_length = max(len(feature["input_ids"]) for feature in batch)
        max_length_ids = max(len(feature["masked_ids"]) for feature in batch)
        for feature in batch:
            current_length = len(feature["input_ids"])
            current_length_mask = len(feature["masked_ids"])
            remainder = [tokenizer.pad_token_id] * (max_label_length - current_length)

            feature["input_ids"] = feature["input_ids"] + remainder
            feature["labels"] = feature["input_ids"][1:]
            feature["labels"] = [
                feature["labels"][i] if i < current_length - 1 else -100
                for i in range(max_label_length - 1)
            ]
            feature["input_ids"] = feature["input_ids"][:-1]
            feature["attention_mask"] = [
                1 if x < current_length else 0 for x in range(max_label_length - 1)
            ]
            feature["masked_ids"] = [
                feature["masked_ids"][i] if i < current_length_mask - 1 else -1
                for i in range(max_length_ids - 1)
            ]
        features = {
            "input_ids": torch.tensor([f["input_ids"] for f in batch]),
            "attention_mask": torch.tensor([f["attention_mask"] for f in batch]),
            "labels": torch.tensor([f["labels"] for f in batch]),
            "masked_ids": torch.tensor([f["masked_ids"] for f in batch]),
        }
        return features

print(train_set[0].keys())
print(len(train_set[0]["input_ids"]), len(train_set[0]["attention_mask"]), len(train_set[0]["labels"]))
print(len(train_set[1]["input_ids"]), len(train_set[1]["attention_mask"]), len(train_set[1]["labels"]))

data_collator = DataCollator(tokenizer)

batch_size = 32

train_dataloader = DataLoader(
    train_set, batch_size=batch_size, collate_fn=data_collator
)
valid_dataloader = DataLoader(
    valid_set, batch_size=batch_size, collate_fn=data_collator
)
n_valid = len(valid_set)
n_train = len(train_set)

batch = next(iter(train_dataloader))
print(batch["input_ids"].shape, batch["labels"].shape, batch["attention_mask"].shape)

dict_keys(['input_ids', 'attention_mask', 'labels', 'masked_ids'])
128 128 128
128 128 128
torch.Size([32, 127]) torch.Size([32, 127]) torch.Size([32, 127])


# Model instantiation

In [34]:
class GeneratorHead(nn.Module):
    """Prediction module for the generator, made up of two dense layers."""
    def __init__(self, config):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.dense2 = nn.Linear(config.embedding_size, config.vocab_size)

    def forward(self, generator_hidden_states):
        hidden_states = self.dense(generator_hidden_states)
        hidden_states = torch.nn.GELU()(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.dense2(hidden_states)
        #hidden_states = torch.nn.Softmax(dim = -1)(hidden_states)

        return hidden_states

class Generator(nn.Module):
    """Complete generator module"""
    def __init__(self, generator_body, generator_head):
        super().__init__()
        self.generator_body = generator_body
        self.generator_head = generator_head

    def forward(self, input):
      output = self.generator_body(input_ids=input["input_ids"], attention_mask=input["attention_mask"]).last_hidden_state
      output = self.generator_head(output)
      return output

class DiscriminatorHead(nn.Module):
    """Discriminator module for the generator, made up of two dense layers."""
    def __init__(self, config):
        super().__init__()

        self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps)
        self.dense = nn.Linear(config.hidden_size, config.embedding_size)
        self.dense2 = nn.Linear(config.embedding_size, 2)

    def forward(self, discriminator_hidden_states):
        hidden_states = self.dense(discriminator_hidden_states)
        hidden_states = torch.nn.GELU()(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        hidden_states = self.dense2(hidden_states)

        return hidden_states

class Discriminator(nn.Module):
    """Complete Discriminator"""
    def __init__(self, discriminator_body, discriminator_head):
        super().__init__()
        self.discriminator_body = discriminator_body
        self.discriminator_head = discriminator_head

    def forward(self, input):
      output = self.discriminator_body(input_ids=input["input_ids"], attention_mask=input["attention_mask"]).last_hidden_state
      output = self.discriminator_head(output)
      return output

class ELECTRALoss():
  def __init__(self, loss_weights=(1.0, 50.0)):
    self.loss_weights = loss_weights
    self.gen_loss_fc = nn.CrossEntropyLoss(ignore_index = -100)
    self.disc_loss_fc = nn.BCEWithLogitsLoss()

  def __call__(self, generator_pred, discriminator_pred, generator_labels, masked_ids):
    list_gen_pred = []
    list_gen_labels = []
    for i in range(masked_ids.shape[0]):
      for j in range(masked_ids.shape[1]):
        masked_index = masked_ids[i,j]
        list_gen_pred.append(torch.unsqueeze(generator_pred[i,masked_index,:],dim =0))
        list_gen_labels.append(torch.unsqueeze(generator_labels[i,masked_index], dim = 0))
    list_gen_pred = torch.concat(list_gen_pred, axis = 0)
    list_gen_labels = torch.concat(list_gen_labels, axis = 0)
    gen_loss = self.gen_loss_fc(list_gen_pred, list_gen_labels)
    gen_acc = torch.sum(torch.argmax(list_gen_pred, dim = -1) == list_gen_labels)/ len(list_gen_labels)*100

    generated_tokens = torch.argmax(generator_pred, dim = -1)

    discriminator_labels =torch.ones((discriminator_pred.shape[0], discriminator_pred.shape[1])).cuda()
    for i in range(masked_ids.shape[0]):
      for j in range(masked_ids.shape[1]):
        masked_index = masked_ids[i,j]
        if generated_tokens[i,masked_index] != generator_labels[i,masked_index]:
          discriminator_labels[i,masked_index] = 0
    discriminator_labels_one_hot =  nn.functional.one_hot(discriminator_labels.to(torch.int64).view(discriminator_labels.shape[0]*discriminator_labels.shape[1])).float()
    discriminator_pred = discriminator_pred.view(discriminator_pred.shape[0]*discriminator_pred.shape[1],discriminator_pred.shape[2])
    disc_loss = self.disc_loss_fc(discriminator_pred, discriminator_labels_one_hot)
    disc_acc = torch.sum(torch.argmax(discriminator_pred.view(-1), dim = -1) == discriminator_labels)/ len(discriminator_labels)*100
    return gen_loss * self.loss_weights[0] + disc_loss * self.loss_weights[1], gen_acc, disc_acc

In [38]:
generator_config = ElectraConfig(vocab_size = tokenizer.vocab_size, embedding_size= 64, hidden_size = 128)
generator_body = ElectraModel(generator_config)
generator_head = GeneratorHead(generator_config)
generator = Generator(generator_body, generator_head)

discriminator_config = ElectraConfig(vocab_size = tokenizer.vocab_size)
discriminator_body = ElectraModel(discriminator_config)
discriminator_head = DiscriminatorHead(discriminator_config)
discriminator = Discriminator(discriminator_body,discriminator_head)

loss = ELECTRALoss()

# Training

In [39]:
def training(generator, discriminator, n_epochs, train_dataloader, val_dataloader, loss, tokenizer, lr=5e-5):

    #trained_gen = ElectraForMaskedLM.from_pretrained('google/electra-small-generator').cuda()
    optimizer_gen = torch.optim.AdamW(
        generator.parameters(),
        lr=lr,
        eps=1e-08,
    )
    optimizer_disc = torch.optim.AdamW(
        discriminator.parameters(),
        lr=lr,
        eps=1e-08,
    )
    list_train_loss = []
    list_val_loss = []
    best_val_loss = 10**50
    list_train_gen_acc = []
    list_train_disc_acc = []
    list_val_gen_acc = []
    list_val_disc_acc = []
    generator.cuda()
    discriminator.cuda()
    for e in range(n_epochs):
        # ========== Training ==========

        # Set model to training mode
        generator.train()
        discriminator.train()

        # Tracking variables
        train_loss = 0
        train_gen_acc = 0
        train_disc_acc = 0
        for batch in tqdm(train_dataloader):
            batch = {k: v.cuda() for k, v in batch.items()}

            input_ids, attention_mask, labels, masked_ids =(
                batch["input_ids"],
                batch["attention_mask"],
                batch["labels"],
                batch["masked_ids"]
            )
            batch.pop("labels")
            batch.pop("masked_ids")
            optimizer_gen.zero_grad()
            optimizer_disc.zero_grad()
            # Forward pass
            generator_pred = generator(batch)
            #generator_pred = trained_gen(**batch).logits
            predicted_tokens = torch.argmax(generator_pred, dim = -1)
            batch_discriminator = batch

            input_ids_for_discriminator = labels
            for i in range(len(batch)):
              for id in masked_ids[i,:]:
                if id != -1:
                  input_ids_for_discriminator[i][id] = predicted_tokens[i][id]
            input_ids_for_discriminator[input_ids_for_discriminator == -100] = tokenizer.pad_token_id
            batch_discriminator["input_ids"] = input_ids_for_discriminator

            batch_discriminator = {k: v.cuda() for k, v in batch_discriminator.items()}
            discriminator_pred = discriminator(batch_discriminator)

            # Backward pass
            loss_value, train_gen_acc_value, train_disc_acc_value = loss(generator_pred, discriminator_pred,labels, masked_ids)
            loss_value.backward()
            optimizer_gen.step()
            optimizer_disc.step()
            train_loss += loss_value.detach().cpu().item()
            train_gen_acc += train_gen_acc_value
            train_disc_acc += train_disc_acc_value
            print("Batch training loss is", loss_value)
        list_train_loss.append(train_loss / len(train_dataloader))
        list_train_gen_acc.append(train_gen_acc / len(train_dataloader))
        list_train_disc_acc.append(train_disc_acc / len(train_dataloader))
        # ========== Validation ==========
        if e%1 == 0:
          generator.eval()
          discriminator.eval()
          valid_loss = 0
          val_gen_acc = 0
          val_disc_acc = 0
          for batch in tqdm(val_dataloader):
              batch = {k: v.cuda() for k, v in batch.items()}

              input_ids, attention_mask, labels, masked_ids =(
                  batch["input_ids"],
                  batch["attention_mask"],
                  batch["labels"],
                  batch["masked_ids"]
              )
              batch.pop("labels")
              batch.pop("masked_ids")

              # Forward pass
              generator_pred = generator(batch)
              predicted_tokens = torch.argmax(generator_pred, dim = -1)
              batch_discriminator = batch

              input_ids_for_discriminator = labels
              for i in range(len(batch)):
                for id in masked_ids[i,:]:
                  if id != -1:
                    input_ids_for_discriminator[i][id] = predicted_tokens[i][id]
              input_ids_for_discriminator[input_ids_for_discriminator == -100] = tokenizer.pad_token_id
              batch_discriminator["input_ids"] = input_ids_for_discriminator

              batch_discriminator = {k: v.cuda() for k, v in batch_discriminator.items()}
              discriminator_pred = discriminator(batch_discriminator)

              # Backward pass
              loss_value, val_gen_acc_value, val_disc_acc_value = loss(generator_pred, discriminator_pred,labels, masked_ids)

              valid_loss += loss_value.detach().cpu().item()
              val_gen_acc += val_gen_acc_value
              val_disc_acc += val_disc_acc_value
        list_val_loss.append(valid_loss / len(val_dataloader))
        list_val_gen_acc.append(val_gen_acc_value / len(val_dataloader))
        list_val_disc_acc.append(val_disc_acc / len(val_dataloader))
        if list_val_loss[-1] < best_val_loss:
          best_val_loss = list_val_loss[-1]
          generator_path = f"/content/drive/My Drive/LLM-projects-models/generator_epoch{e}_lr{lr}"
          discriminator_path =  f"/content/drive/My Drive/LLM-projects-models/discriminator_epoch{e}_lr{lr}"
          torch.save(generator.state_dict(), generator_path)
          torch.save(discriminator.state_dict(), discriminator_path)


        print(
            e,
            "\n\t - Train loss: {:.4f}".format(list_train_loss[-1]),
            "\n\t - Val loss: {:.4f}".format(list_val_loss[-1]),
            "\n\t - Train gen acc: {:.4f}".format(list_train_gen_acc[-1]),
            "\n\t - Train disc acc: {:.4f}".format(list_train_disc_acc[-1]),
            "\n\t - Val gen acc: {:.4f}".format(list_val_gen_acc[-1]),
            "\n\t - Val disc acc: {:.4f}".format(list_val_disc_acc[-1]),
        )
    return list_train_loss, list_val_loss, list_train_gen_acc, list_train_disc_acc, list_val_gen_acc, list_val_disc_acc

In [None]:
list_train_loss, list_val_loss, list_train_gen_acc, list_train_disc_acc, list_val_gen_acc, list_val_disc_acc = training(generator, discriminator, 10, train_dataloader, valid_dataloader, loss, tokenizer, lr=5e-2)

  4%|▍         | 1/25 [00:03<01:22,  3.44s/it]

Batch training loss is tensor(42.5251, device='cuda:0', grad_fn=<AddBackward0>)


  8%|▊         | 2/25 [00:06<01:16,  3.31s/it]

Batch training loss is tensor(25.6422, device='cuda:0', grad_fn=<AddBackward0>)


 12%|█▏        | 3/25 [00:09<01:12,  3.28s/it]

Batch training loss is tensor(123.6281, device='cuda:0', grad_fn=<AddBackward0>)


 16%|█▌        | 4/25 [00:13<01:08,  3.26s/it]

Batch training loss is tensor(28.2587, device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 5/25 [00:16<01:04,  3.24s/it]

Batch training loss is tensor(149.5104, device='cuda:0', grad_fn=<AddBackward0>)


 24%|██▍       | 6/25 [00:19<01:01,  3.24s/it]

Batch training loss is tensor(31.5858, device='cuda:0', grad_fn=<AddBackward0>)


 28%|██▊       | 7/25 [00:22<00:58,  3.24s/it]

Batch training loss is tensor(37.0925, device='cuda:0', grad_fn=<AddBackward0>)


 32%|███▏      | 8/25 [00:26<00:55,  3.24s/it]

Batch training loss is tensor(35.6216, device='cuda:0', grad_fn=<AddBackward0>)


 36%|███▌      | 9/25 [00:29<00:51,  3.23s/it]

Batch training loss is tensor(31.1903, device='cuda:0', grad_fn=<AddBackward0>)


 40%|████      | 10/25 [00:32<00:48,  3.23s/it]

Batch training loss is tensor(26.1196, device='cuda:0', grad_fn=<AddBackward0>)


 44%|████▍     | 11/25 [00:35<00:45,  3.22s/it]

Batch training loss is tensor(20.8131, device='cuda:0', grad_fn=<AddBackward0>)


 48%|████▊     | 12/25 [00:38<00:41,  3.23s/it]

Batch training loss is tensor(25.1059, device='cuda:0', grad_fn=<AddBackward0>)


 52%|█████▏    | 13/25 [00:42<00:38,  3.22s/it]

Batch training loss is tensor(27.9345, device='cuda:0', grad_fn=<AddBackward0>)


 56%|█████▌    | 14/25 [00:45<00:35,  3.22s/it]

Batch training loss is tensor(22.8789, device='cuda:0', grad_fn=<AddBackward0>)


 60%|██████    | 15/25 [00:48<00:32,  3.21s/it]

Batch training loss is tensor(20.2255, device='cuda:0', grad_fn=<AddBackward0>)


 64%|██████▍   | 16/25 [00:51<00:28,  3.21s/it]

Batch training loss is tensor(21.4555, device='cuda:0', grad_fn=<AddBackward0>)


 68%|██████▊   | 17/25 [00:55<00:25,  3.22s/it]

Batch training loss is tensor(22.3020, device='cuda:0', grad_fn=<AddBackward0>)


 72%|███████▏  | 18/25 [00:58<00:22,  3.22s/it]

Batch training loss is tensor(23.5743, device='cuda:0', grad_fn=<AddBackward0>)


 76%|███████▌  | 19/25 [01:01<00:19,  3.21s/it]

Batch training loss is tensor(24.0664, device='cuda:0', grad_fn=<AddBackward0>)


 80%|████████  | 20/25 [01:04<00:16,  3.21s/it]

Batch training loss is tensor(22.7715, device='cuda:0', grad_fn=<AddBackward0>)


 84%|████████▍ | 21/25 [01:07<00:12,  3.22s/it]

Batch training loss is tensor(22.5859, device='cuda:0', grad_fn=<AddBackward0>)


 88%|████████▊ | 22/25 [01:11<00:09,  3.22s/it]

Batch training loss is tensor(20.3885, device='cuda:0', grad_fn=<AddBackward0>)


 92%|█████████▏| 23/25 [01:14<00:06,  3.22s/it]

Batch training loss is tensor(19.7525, device='cuda:0', grad_fn=<AddBackward0>)


 96%|█████████▌| 24/25 [01:17<00:03,  3.21s/it]

Batch training loss is tensor(20.9310, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 25/25 [01:20<00:00,  3.23s/it]


Batch training loss is tensor(21.5125, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 7/7 [00:00<00:00,  7.03it/s]


0 
	 - Train loss: 34.6989 
	 - Val loss: 20.4537 
	 - Train gen acc: 12.0000 
	 - Train disc acc: 0.0000 
	 - Val gen acc: 5.1948 
	 - Val disc acc: 0.0000


  4%|▍         | 1/25 [00:03<01:17,  3.25s/it]

Batch training loss is tensor(19.8805, device='cuda:0', grad_fn=<AddBackward0>)


  8%|▊         | 2/25 [00:06<01:14,  3.22s/it]

Batch training loss is tensor(19.0612, device='cuda:0', grad_fn=<AddBackward0>)


 12%|█▏        | 3/25 [00:09<01:10,  3.22s/it]

Batch training loss is tensor(18.7545, device='cuda:0', grad_fn=<AddBackward0>)


 16%|█▌        | 4/25 [00:12<01:07,  3.21s/it]

Batch training loss is tensor(20.1560, device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 5/25 [00:16<01:04,  3.22s/it]

Batch training loss is tensor(20.4644, device='cuda:0', grad_fn=<AddBackward0>)


 24%|██▍       | 6/25 [00:19<01:01,  3.22s/it]

Batch training loss is tensor(21.2383, device='cuda:0', grad_fn=<AddBackward0>)


 28%|██▊       | 7/25 [00:22<00:57,  3.21s/it]

Batch training loss is tensor(19.7409, device='cuda:0', grad_fn=<AddBackward0>)


 32%|███▏      | 8/25 [00:25<00:54,  3.21s/it]

Batch training loss is tensor(20.1856, device='cuda:0', grad_fn=<AddBackward0>)


 36%|███▌      | 9/25 [00:28<00:51,  3.22s/it]

Batch training loss is tensor(19.7089, device='cuda:0', grad_fn=<AddBackward0>)


 40%|████      | 10/25 [00:32<00:48,  3.23s/it]

Batch training loss is tensor(19.6779, device='cuda:0', grad_fn=<AddBackward0>)


 44%|████▍     | 11/25 [00:35<00:45,  3.22s/it]

Batch training loss is tensor(19.6365, device='cuda:0', grad_fn=<AddBackward0>)


 48%|████▊     | 12/25 [00:38<00:41,  3.22s/it]

Batch training loss is tensor(19.8504, device='cuda:0', grad_fn=<AddBackward0>)


 52%|█████▏    | 13/25 [00:41<00:38,  3.21s/it]

Batch training loss is tensor(19.3805, device='cuda:0', grad_fn=<AddBackward0>)


 56%|█████▌    | 14/25 [00:45<00:35,  3.22s/it]

Batch training loss is tensor(19.3060, device='cuda:0', grad_fn=<AddBackward0>)


 60%|██████    | 15/25 [00:48<00:32,  3.21s/it]

Batch training loss is tensor(19.3683, device='cuda:0', grad_fn=<AddBackward0>)


 64%|██████▍   | 16/25 [00:51<00:28,  3.21s/it]

Batch training loss is tensor(19.4697, device='cuda:0', grad_fn=<AddBackward0>)


 68%|██████▊   | 17/25 [00:54<00:25,  3.21s/it]

Batch training loss is tensor(19.1860, device='cuda:0', grad_fn=<AddBackward0>)


 72%|███████▏  | 18/25 [00:57<00:22,  3.22s/it]

Batch training loss is tensor(19.3401, device='cuda:0', grad_fn=<AddBackward0>)


 76%|███████▌  | 19/25 [01:01<00:19,  3.23s/it]

Batch training loss is tensor(19.5649, device='cuda:0', grad_fn=<AddBackward0>)


 80%|████████  | 20/25 [01:04<00:16,  3.22s/it]

Batch training loss is tensor(19.1676, device='cuda:0', grad_fn=<AddBackward0>)


 84%|████████▍ | 21/25 [01:07<00:12,  3.22s/it]

Batch training loss is tensor(19.9355, device='cuda:0', grad_fn=<AddBackward0>)


 88%|████████▊ | 22/25 [01:10<00:09,  3.21s/it]

Batch training loss is tensor(19.4291, device='cuda:0', grad_fn=<AddBackward0>)


 92%|█████████▏| 23/25 [01:14<00:06,  3.22s/it]

Batch training loss is tensor(19.3024, device='cuda:0', grad_fn=<AddBackward0>)


 96%|█████████▌| 24/25 [01:17<00:03,  3.22s/it]

Batch training loss is tensor(19.5564, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 25/25 [01:20<00:00,  3.22s/it]


Batch training loss is tensor(18.9245, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 7/7 [00:00<00:00,  8.73it/s]


1 
	 - Train loss: 19.6114 
	 - Val loss: 19.1809 
	 - Train gen acc: 13.3295 
	 - Train disc acc: 0.0000 
	 - Val gen acc: 5.1948 
	 - Val disc acc: 0.0000


  4%|▍         | 1/25 [00:03<01:17,  3.21s/it]

Batch training loss is tensor(18.8982, device='cuda:0', grad_fn=<AddBackward0>)


  8%|▊         | 2/25 [00:06<01:14,  3.23s/it]

Batch training loss is tensor(19.9092, device='cuda:0', grad_fn=<AddBackward0>)


 12%|█▏        | 3/25 [00:09<01:11,  3.24s/it]

Batch training loss is tensor(19.0770, device='cuda:0', grad_fn=<AddBackward0>)


 16%|█▌        | 4/25 [00:12<01:07,  3.23s/it]

Batch training loss is tensor(19.4441, device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 5/25 [00:16<01:04,  3.22s/it]

Batch training loss is tensor(19.0158, device='cuda:0', grad_fn=<AddBackward0>)


 24%|██▍       | 6/25 [00:19<01:01,  3.22s/it]

Batch training loss is tensor(19.3734, device='cuda:0', grad_fn=<AddBackward0>)


 28%|██▊       | 7/25 [00:22<00:58,  3.22s/it]

Batch training loss is tensor(18.5436, device='cuda:0', grad_fn=<AddBackward0>)


 32%|███▏      | 8/25 [00:25<00:54,  3.22s/it]

Batch training loss is tensor(19.1703, device='cuda:0', grad_fn=<AddBackward0>)


 36%|███▌      | 9/25 [00:28<00:51,  3.21s/it]

Batch training loss is tensor(19.1292, device='cuda:0', grad_fn=<AddBackward0>)


 40%|████      | 10/25 [00:32<00:48,  3.21s/it]

Batch training loss is tensor(18.9275, device='cuda:0', grad_fn=<AddBackward0>)


 44%|████▍     | 11/25 [00:35<00:45,  3.22s/it]

Batch training loss is tensor(18.5513, device='cuda:0', grad_fn=<AddBackward0>)


 48%|████▊     | 12/25 [00:38<00:41,  3.23s/it]

Batch training loss is tensor(19.0601, device='cuda:0', grad_fn=<AddBackward0>)


 52%|█████▏    | 13/25 [00:41<00:38,  3.22s/it]

Batch training loss is tensor(19.2129, device='cuda:0', grad_fn=<AddBackward0>)


 56%|█████▌    | 14/25 [00:45<00:35,  3.22s/it]

Batch training loss is tensor(19.3990, device='cuda:0', grad_fn=<AddBackward0>)


 60%|██████    | 15/25 [00:48<00:32,  3.22s/it]

Batch training loss is tensor(19.2679, device='cuda:0', grad_fn=<AddBackward0>)


 64%|██████▍   | 16/25 [00:51<00:28,  3.22s/it]

Batch training loss is tensor(19.4892, device='cuda:0', grad_fn=<AddBackward0>)


 68%|██████▊   | 17/25 [00:54<00:25,  3.22s/it]

Batch training loss is tensor(18.9923, device='cuda:0', grad_fn=<AddBackward0>)


 72%|███████▏  | 18/25 [00:57<00:22,  3.21s/it]

Batch training loss is tensor(19.1769, device='cuda:0', grad_fn=<AddBackward0>)


 76%|███████▌  | 19/25 [01:01<00:19,  3.21s/it]

Batch training loss is tensor(19.3913, device='cuda:0', grad_fn=<AddBackward0>)


 80%|████████  | 20/25 [01:04<00:16,  3.23s/it]

Batch training loss is tensor(18.8002, device='cuda:0', grad_fn=<AddBackward0>)


 84%|████████▍ | 21/25 [01:07<00:12,  3.23s/it]

Batch training loss is tensor(19.5295, device='cuda:0', grad_fn=<AddBackward0>)


 88%|████████▊ | 22/25 [01:10<00:09,  3.22s/it]

Batch training loss is tensor(18.7804, device='cuda:0', grad_fn=<AddBackward0>)


 92%|█████████▏| 23/25 [01:14<00:06,  3.22s/it]

Batch training loss is tensor(18.7714, device='cuda:0', grad_fn=<AddBackward0>)


 96%|█████████▌| 24/25 [01:17<00:03,  3.22s/it]

Batch training loss is tensor(19.0982, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 25/25 [01:20<00:00,  3.22s/it]


Batch training loss is tensor(18.5606, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 7/7 [00:00<00:00,  8.27it/s]


2 
	 - Train loss: 19.1028 
	 - Val loss: 19.1319 
	 - Train gen acc: 13.3295 
	 - Train disc acc: 0.0000 
	 - Val gen acc: 5.1948 
	 - Val disc acc: 1684.3751


  4%|▍         | 1/25 [00:03<01:16,  3.21s/it]

Batch training loss is tensor(19.0533, device='cuda:0', grad_fn=<AddBackward0>)


  8%|▊         | 2/25 [00:06<01:13,  3.21s/it]

Batch training loss is tensor(19.1744, device='cuda:0', grad_fn=<AddBackward0>)


 12%|█▏        | 3/25 [00:09<01:10,  3.21s/it]

Batch training loss is tensor(18.9882, device='cuda:0', grad_fn=<AddBackward0>)


 16%|█▌        | 4/25 [00:12<01:07,  3.22s/it]

Batch training loss is tensor(19.3847, device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 5/25 [00:16<01:04,  3.22s/it]

Batch training loss is tensor(18.7647, device='cuda:0', grad_fn=<AddBackward0>)


 24%|██▍       | 6/25 [00:19<01:01,  3.22s/it]

Batch training loss is tensor(19.5236, device='cuda:0', grad_fn=<AddBackward0>)


 28%|██▊       | 7/25 [00:22<00:57,  3.22s/it]

Batch training loss is tensor(18.5406, device='cuda:0', grad_fn=<AddBackward0>)


 32%|███▏      | 8/25 [00:25<00:54,  3.21s/it]

Batch training loss is tensor(19.2597, device='cuda:0', grad_fn=<AddBackward0>)


 36%|███▌      | 9/25 [00:28<00:51,  3.22s/it]

Batch training loss is tensor(19.2286, device='cuda:0', grad_fn=<AddBackward0>)


 40%|████      | 10/25 [00:32<00:48,  3.22s/it]

Batch training loss is tensor(19.0169, device='cuda:0', grad_fn=<AddBackward0>)


 44%|████▍     | 11/25 [00:35<00:44,  3.21s/it]

Batch training loss is tensor(18.5606, device='cuda:0', grad_fn=<AddBackward0>)


 48%|████▊     | 12/25 [00:38<00:41,  3.21s/it]

Batch training loss is tensor(18.9272, device='cuda:0', grad_fn=<AddBackward0>)


 52%|█████▏    | 13/25 [00:41<00:38,  3.22s/it]

Batch training loss is tensor(19.0291, device='cuda:0', grad_fn=<AddBackward0>)


 56%|█████▌    | 14/25 [00:45<00:35,  3.23s/it]

Batch training loss is tensor(19.2791, device='cuda:0', grad_fn=<AddBackward0>)


 60%|██████    | 15/25 [00:48<00:32,  3.22s/it]

Batch training loss is tensor(19.0343, device='cuda:0', grad_fn=<AddBackward0>)


 64%|██████▍   | 16/25 [00:51<00:28,  3.22s/it]

Batch training loss is tensor(19.0518, device='cuda:0', grad_fn=<AddBackward0>)


 68%|██████▊   | 17/25 [00:54<00:25,  3.21s/it]

Batch training loss is tensor(18.5833, device='cuda:0', grad_fn=<AddBackward0>)


 72%|███████▏  | 18/25 [00:57<00:22,  3.23s/it]

Batch training loss is tensor(18.8093, device='cuda:0', grad_fn=<AddBackward0>)


 76%|███████▌  | 19/25 [01:01<00:19,  3.22s/it]

Batch training loss is tensor(19.0916, device='cuda:0', grad_fn=<AddBackward0>)


 80%|████████  | 20/25 [01:04<00:16,  3.22s/it]

Batch training loss is tensor(18.6285, device='cuda:0', grad_fn=<AddBackward0>)


 84%|████████▍ | 21/25 [01:07<00:12,  3.21s/it]

Batch training loss is tensor(19.3718, device='cuda:0', grad_fn=<AddBackward0>)


 88%|████████▊ | 22/25 [01:10<00:09,  3.21s/it]

Batch training loss is tensor(18.8421, device='cuda:0', grad_fn=<AddBackward0>)


 92%|█████████▏| 23/25 [01:14<00:06,  3.22s/it]

Batch training loss is tensor(18.9032, device='cuda:0', grad_fn=<AddBackward0>)


 96%|█████████▌| 24/25 [01:17<00:03,  3.22s/it]

Batch training loss is tensor(19.3168, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 25/25 [01:20<00:00,  3.22s/it]


Batch training loss is tensor(18.8312, device='cuda:0', grad_fn=<AddBackward0>)


100%|██████████| 7/7 [00:00<00:00,  8.56it/s]


3 
	 - Train loss: 19.0078 
	 - Val loss: 18.9445 
	 - Train gen acc: 13.3295 
	 - Train disc acc: 0.0000 
	 - Val gen acc: 5.1948 
	 - Val disc acc: 0.0000


  4%|▍         | 1/25 [00:03<01:17,  3.22s/it]

Batch training loss is tensor(18.5450, device='cuda:0', grad_fn=<AddBackward0>)


  8%|▊         | 2/25 [00:06<01:14,  3.23s/it]

Batch training loss is tensor(19.2960, device='cuda:0', grad_fn=<AddBackward0>)


 12%|█▏        | 3/25 [00:09<01:10,  3.22s/it]

Batch training loss is tensor(18.8023, device='cuda:0', grad_fn=<AddBackward0>)


 16%|█▌        | 4/25 [00:12<01:07,  3.22s/it]

Batch training loss is tensor(19.0524, device='cuda:0', grad_fn=<AddBackward0>)


 20%|██        | 5/25 [00:16<01:04,  3.21s/it]

Batch training loss is tensor(18.6884, device='cuda:0', grad_fn=<AddBackward0>)


 24%|██▍       | 6/25 [00:19<01:01,  3.22s/it]

In [None]:
plt.figure()
x = [i for i in range(len(list_train_loss))]
plt.plot(x, list_train_loss)
plt.plot(x, list_val_loss)
plt.show()

x = [i for i in range(len(list_train_loss))]
plt.plot(x, list_train_gen_acc, label="train gen acc")
plt.plot(x, list_train_disc_acc, label = "train disc acc")
plt.plot(x, list_val_gen_acc, label ="Valid gen acc" )
plt.plot(x, list_val_loss, label ="Valid disc acc" )
plt.legend()
plt.show()
list_train_gen_acc, list_train_disc_acc, list_val_gen_acc, list_val_gen_acc

In [27]:
# Example of target with class indice
loss = nn.CrossEntropyLoss(reduction = "mean")
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
print(input, target,output)

# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input, target)
print(input, target,output)

tensor([[ 0.8779,  0.9025, -0.1308, -0.1728,  0.3057],
        [ 0.5910,  1.8592, -1.4776,  1.7094, -0.2206],
        [ 1.2816,  0.5546,  0.1915, -0.2074, -1.3647]], requires_grad=True) tensor([3, 0, 0]) tensor(1.6992, grad_fn=<NllLossBackward0>)
tensor([[ 2.0994, -0.0871,  0.5965, -1.5593, -0.1185],
        [-0.9349,  1.0487, -0.9134, -1.3678, -0.3626],
        [ 0.6379, -1.1598,  0.7783, -1.7011,  1.0480]], requires_grad=True) tensor([[0.2759, 0.2942, 0.1516, 0.1326, 0.1457],
        [0.1417, 0.0869, 0.0849, 0.4153, 0.2712],
        [0.1582, 0.0491, 0.0480, 0.5533, 0.1914]]) tensor(2.3461, grad_fn=<DivBackward1>)
