# Deep Learning Project - Part3
<div style="text-align: center">
<h1 style = "color: red"> Sharif University Of Technology</h1>
<h2 style = "color: green"> DR. Fatemizadeh </h2>
<h3 style = "color: cyan"> Authors: Amirreza Velaee - Hessam Hosseini - Amirabbas Afzali - Mahshad Moradi<h3>
</div>

In [1]:
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
import json
from tqdm import tqdm, trange
from sklearn.metrics import precision_recall_fscore_support, matthews_corrcoef
import pickle
from torch.utils.data import random_split
from torch.utils.data import Dataset, Subset
from torch.utils.data import DataLoader,Dataset
from torch.nn.modules import ReLU,Linear,Dropout
import time
import math
import datetime
import torch.nn.functional as F
from collections import OrderedDict


# Set random seed for reproducibility
manualSeed = 42
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)

ngpu = 1

Random Seed:  42


In [2]:
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
ngpu = torch.cuda.device_count()

There are 2 GPU(s) available.
We will use the GPU: Tesla T4


## load the dataset:

In [3]:
!pip install gdown 
import gdown 



In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

# id = "11YeloR2eTXcTzdwI04Z-M2QVvIeQAU6-"
id = "1G-XttJCGvkAVkU9N_W_PxR0Cx099qbUa"
gdown.download_folder(id=id, quiet=True, use_cookies=False)

['/kaggle/working/SubtaskB/subtaskB_dev.jsonl',
 '/kaggle/working/SubtaskB/subtaskB_train.jsonl']

**Subtask B:**

An object of the JSON has the following format:


-  **id** -> identifier of the example,
- **label** -> label (human: 0, chatGPT: 1, cohere: 2, davinci: 3, bloomz: 4, dolly: 5),
- **text** -> text generated by machine or written by human,
- **model** -> model name that generated data,
- **source** -> source (Wikipedia, Wikihow, Peerread, Reddit, Arxiv) on English


In [5]:
# content/drive/My Drive/Project
with open('/kaggle/working/SubtaskB/subtaskB_train.jsonl', 'r') as file:
    
    lines = file.readlines()

# Parse each line as a JSON object
train_objects = [json.loads(line) for line in lines]

In [6]:
with open('/kaggle/working/SubtaskB/subtaskB_dev.jsonl', 'r') as file:
    lines = file.readlines()

dev_objects = [json.loads(line) for line in lines]

In [7]:
len(train_objects), len(dev_objects) 

(71027, 3000)

an example:

In [8]:
train_objects[100].keys()

dict_keys(['text', 'model', 'source', 'label', 'id'])

In [9]:
train_objects[100]['model'], train_objects[100]['source'],train_objects[100]['label'],train_objects[100]['id']

('chatGPT', 'wikihow', 1, 100)

In [10]:
print(train_objects[104]['text'])

If you're looking to set up a home PC with multiple modems and phone lines, you're in luck! With a little bit of know-how, you can have a faster, more reliable internet connection than ever before. Here's how to get started.

Part 1: Setting Up Your Modems

Step 1: Find out if your local ISP supports Multi-Link accounts. Some ISPs offer plans that allow you to connect multiple modems to your computer to boost your internet speeds. Check with your provider to see if this is an option for you.

Step 2: Get a second modem. Most computers come with one modem installed, but if you need a second one, you can easily purchase one online or at a tech store. Make sure it's compatible with your operating system before you buy.

Step 3: Use the dial-up creation dialog if you're using Windows XP. Navigate to the control panel and click "network and internet connections." From there, select "create a new connection" and then choose "connect to the internet."

Step 4: Click next, and it will ask for 

More details about the dataset and the Exploratory Data Analysis have been reported in `EDA.ipynb`.

## load the pretrain `RoBERTa` from $huggingface$:

In [11]:
!pip install transformers
!pip install sentencepiece



In [12]:
from transformers import RobertaTokenizer, RobertaModel
from transformers import DistilBertTokenizer, DistilBertModel

import sentencepiece
from transformers import get_constant_schedule_with_warmup

In [13]:
# Load pre-trained BERT model and tokenizer
# model_name = 'roberta-large'
# tokenizer = RobertaTokenizer.from_pretrained(model_name)
# bert_model = RobertaModel.from_pretrained(model_name)

model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
bert_model = DistilBertModel.from_pretrained(model_name)

In [14]:
class BERT_Embedder(nn.Module):
    def __init__(self, bert_modele):
        super(BERT_Embedder, self).__init__()
        self.bert = bert_modele

    def forward(self, encoded_ids,attention_mask):
        outputs = self.bert(encoded_ids,attention_mask)
        last_hidden_states = outputs.last_hidden_state[:,0]  # return embedding of 'CLS' token for classification.

        return last_hidden_states

Now we can define **Discriminator** and **Generator** completely:

In [15]:
# custom weights initialization
# def weights_init(m):
#     classname = m.__class__.__name__
#     if classname.find('Conv') != -1:
#         nn.init.normal_(m.weight.data, 0.0, 0.02)
#     elif classname.find('BatchNorm') != -1:
#         nn.init.normal_(m.weight.data, 1.0, 0.02)
#         nn.init.constant_(m.bias.data, 0)


import torch.nn.init as init

def custom_weights_init(m):
    if isinstance(m, nn.Linear):
        init.xavier_normal_(m.weight.data)
        if m.bias is not None:
            init.constant_(m.bias.data, 0)


In [16]:
class Discriminator(nn.Module):
    def __init__(self, input_size, num_classes,dropout_rate=0.2,relu_slop=0.2):
        super(Discriminator, self).__init__()
        self.input_size = input_size
        self.num_classes = num_classes

        self.main = torch.nn.Sequential(
            Dropout(p=dropout_rate),
            
            Linear(in_features=self.input_size, out_features=512, bias=True),
            nn.LeakyReLU(relu_slop, inplace=True),
            Dropout(p=dropout_rate),
            
            Linear(in_features=512, out_features=256, bias=True),
            nn.LeakyReLU(relu_slop, inplace=True),
            Dropout(p=dropout_rate),
            
            Linear(in_features=256, out_features=256, bias=True),
            nn.LeakyReLU(relu_slop, inplace=True),
            Dropout(p=dropout_rate),
        )

        self.logit = nn.Linear(256,self.num_classes+1)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input):
        last_rep = self.main(input)  # for do 'feature matching'
        logits = self.logit(last_rep)
        probs = self.softmax(logits)
        return last_rep, logits, probs


In [17]:
class Generator1(nn.Module):
    def __init__(self, input_size, output_size,dropout_rate=0.2,relu_slop=0.2):
        super(Generator1, self).__init__()

        self.input_size = input_size
        self.output_size = output_size

        self.main = torch.nn.Sequential(
            Linear(in_features=self.input_size, out_features=256, bias=True),
            nn.LeakyReLU(relu_slop, inplace=True),
            Dropout(p=dropout_rate, inplace=False),
            Linear(in_features=256, out_features=self.output_size, bias=True),
        )

    def forward(self, input):
        return self.main(input)


Set Hyperparameter :

In [18]:
num_classes = 6
input_size = 768
noise_size = 100
label_list = list(range(6))

### Define the Dataset class:

In [19]:

class SemEval_Dataset(Dataset):
    def __init__(self, json_file,label_list,label_masks,
                 max_seq_length, tokenizer,dtype=torch.long):

        self.json_file = json_file
        self.dtype = dtype
        self.label_list = label_list # [0, 1, 2, 3, 4, 5]
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.label_masks = label_masks

    def __len__(self):
        return len(self.json_file)

    def feature_extractor(self, text, label=None):
        features = []
        tokenized_text = tokenizer(text,padding='max_length', truncation=True,
                                   max_length=self.max_seq_length,
                                   return_tensors="pt")

        input_ids = tokenized_text['input_ids']
        input_mask = tokenized_text['attention_mask']

        if len(input_ids) > self.max_seq_length:
            input_ids = input_ids[0:(self.max_seq_length)]   # crop long sentences
            input_mask = input_mask[0:(self.max_seq_length)]

        assert len(input_ids[0]) == self.max_seq_length
        assert len(input_mask[0]) == self.max_seq_length

        if label != None:
            return input_ids, input_mask, label
        else:
            return input_ids, input_mask

    def __getitem__(self, idx):
        data = self.json_file[idx]
        input_ids, input_mask, label_id = self.feature_extractor(data['text'], label=data['label'])

        return input_ids.squeeze(0), input_mask.squeeze(0), data['label'], self.label_masks[idx]

Create Dataset and Dataloader :

In [20]:
max_seq_length = 256
batch_size = 128

In [21]:
unlabeled_examples = True
labeled_ratio = 0.5                  # 0.01, 0.1 ,0.05 ,0.5 
train_dataset_size_labeled = int(labeled_ratio* len(train_objects))

#The labeled (train) dataset is assigned with a mask set to True
train_label_masks = torch.ones(train_dataset_size_labeled, dtype=bool)
#If unlabel examples are available
if unlabeled_examples:
  #The unlabeled (train) dataset is assigned with a mask set to False
    tmp_masks = torch.zeros(len(train_objects)- train_dataset_size_labeled , dtype=bool)
    train_label_masks = torch.concatenate([train_label_masks,tmp_masks])
    idx = torch.randperm(train_label_masks.shape[0])
    train_label_masks = train_label_masks[idx].view(train_label_masks.size())

assert train_label_masks.shape[0] == len(train_objects)
train_dataset = SemEval_Dataset(train_objects, label_list, train_label_masks,max_seq_length, tokenizer)
# train_dataset = torch.utils.data.Subset(train_dataset, [i for i in range(train_dataset_size)])

In [22]:
train_size = int(0.8 * len(train_objects))  # 80% for training
val_size = len(train_objects) - train_size  # Remaining 20% for validation

train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])
test_label_masks = torch.ones(len(dev_objects), dtype=bool)
test_dataset = SemEval_Dataset(dev_objects, label_list, test_label_masks,max_seq_length, tokenizer)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, num_workers=os.cpu_count(),shuffle=True, drop_last=False)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, num_workers=os.cpu_count(),shuffle=True, drop_last=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, num_workers=os.cpu_count(),shuffle=True, drop_last=False)

In [23]:
print('Number of train samples: ', len(train_dataset))
print('Number of validation samples: ', len(val_dataset))
print('Number of test samples: ', len(test_dataset))

Number of train samples:  56821
Number of validation samples:  14206
Number of test samples:  3000


In [24]:
# test train_dataset
betch = next(iter(train_dataloader))
print(f'input_ids shape: {betch[0].shape}, \ninput_mask shape: {betch[1].shape}, \
        \nlabel_ids shape: {betch[2].shape},\nlabel_mask shape: {betch[3].shape}')

input_ids shape: torch.Size([128, 256]), 
input_mask shape: torch.Size([128, 256]),         
label_ids shape: torch.Size([128]),
label_mask shape: torch.Size([128])


In [25]:
for batch in train_dataloader:
    print(batch[2][batch[3]])
    print()
#     print(batch[3])
    break

tensor([1, 1, 5, 5, 5, 2, 4, 2, 4, 0, 2, 3, 1, 2, 2, 2, 5, 2, 5, 3, 0, 1, 3, 2,
        0, 1, 0, 3, 3, 3, 2, 2, 5, 4, 2, 0, 3, 4, 0, 5, 3, 2, 2, 3, 1, 4, 2, 2,
        5, 3, 0, 4, 5, 3, 1, 4, 5, 2, 1, 1, 5, 3, 1, 0, 3, 5, 5])



## GAN-BERT

In [27]:
import matplotlib.pyplot as plt

img = plt.imread('/content/drive/My Drive/Project/GAN-BERT.png')
plt.imshow(img);
plt.axis('off');

Hyperparameters:

In [28]:
epoch_num = 8

learning_rate = 5e-4
noise_size = 100
epsilon = 1e-8
warmup_proportion = 0.1  #TODO

In [29]:
# Create the Discriminator and Generator
discriminator = Discriminator(input_size,num_classes).to(device)
generator1 = Generator1(noise_size,input_size).to(device)
bert = BERT_Embedder(bert_model).to(device)


# Handle multi-GPU if desired
if (device.type == 'cuda') and (ngpu > 1):
    discriminator = nn.DataParallel(discriminator, list(range(ngpu)))
    generator = nn.DataParallel(generator1, list(range(ngpu)))    
    bert = nn.DataParallel(bert, list(range(ngpu)))

# weights initialization   # TODO : Xavier weight initialization
discriminator.apply(custom_weights_init)
generator1.apply(custom_weights_init)

# print(discriminator)
# print()
# print(generator1)
# print()
# print(bert)

Generator1(
  (main): Sequential(
    (0): Linear(in_features=100, out_features=256, bias=True)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=768, bias=True)
  )
)

### Define the Optimizers and Scheduler:

In [30]:
gen_optimizer = torch.optim.AdamW(generator1.parameters(), lr=learning_rate)
dis_optimizer = torch.optim.AdamW(list(bert.parameters()) + list(discriminator.parameters()), lr=learning_rate)

#scheduler
num_train_examples = len(train_dataset)
num_train_steps = int(num_train_examples / batch_size * epoch_num) 
num_warmup_steps = int(num_train_steps * warmup_proportion)

scheduler_d = get_constant_schedule_with_warmup(dis_optimizer, 
                                       num_warmup_steps = num_warmup_steps)
scheduler_g = get_constant_schedule_with_warmup(gen_optimizer, 
                                       num_warmup_steps = num_warmup_steps) 

 The loss function of $Discriminator$ is defined as:  $$\quad L_{\mathcal{D}}=L_{\mathcal{D}_{\text {sup. }}}+L_{\mathcal{D}_{\text {unsup. }}}$$
 where:
$$
\begin{aligned}
L_{\mathcal{D}_{\text {sup. }}} & =-\mathbb{E}_{x, y \sim p_d} \log \left[p_{\mathrm{m}}(\hat{y}=y \mid x, y \in(1, \ldots, k))\right] \\
L_{\mathcal{D}_{\text {unsup. }}} & =-\mathbb{E}_{x \sim p_d} \log \left[1-p_{\mathrm{m}}(\hat{y}=y \mid x, y=k+1)\right] -\mathbb{E}_{x \sim \mathcal{G}} \log \left[p_{\mathrm{m}}(\hat{y}=y \mid x, y=k+1)\right] \\
\rightarrow  L_{\mathcal{D}_{\text {unsup. }}} & =-\mathbb{E}_{x \sim p_d} [\log (\mathcal{D}(x))] -\mathbb{E}_{x \sim \mathcal{G}} 
[\log (1-\mathcal{D}(x))]
\end{aligned}
$$

And loss function of $Generator$ is defined as: $$\quad L_{\mathcal{G}}=L_{\mathcal{G}_{\text {feature matching }}}+L_{\mathcal{G}_{\text {unsup. }}}$$ 
where:

$$L_{\mathcal{G}_{\text {unsup. }}}=-\mathbb{E}_{x \sim \mathcal{G}} 
\log \left[1-p_m(\hat{y}=y \mid x, y=k+1)\right]$$

$$ L_{\mathcal{G}_{\text {feature matching }}} = ||\mathbb{E}_{x \sim p_d} f(x) 
- \mathbb{E}_{x \sim \mathcal{G}} f(x) ||_2^2$$


In [178]:
class GANBERT():
    def __init__(self, discriminator, generator, bert,gen_optimizer, dis_optimizer,
                scheduler_d,scheduler_g, path): 

        self.discriminator = discriminator
        self.generator = generator
        self.bert = bert
        self.gen_optimizer = gen_optimizer
        self.dis_optimizer = dis_optimizer
        self.scheduler_g = scheduler_g
        self.scheduler_d = scheduler_d
        self.nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1 , label_smoothing=0.005) # which one
        self.path = path
 
    def trainer(self, epoch_num, label_list,labeled_ratio,
               train_dataloader, val_dataloader=None,report=True):
        
        best_score = 1e-5
    
        def format_time(elapsed):
            '''
            Takes a time in seconds and returns a string hh:mm:ss
            '''
            # Round to the nearest second.
            elapsed_rounded = int(round((elapsed)))
            # Format as hh:mm:ss
            return str(datetime.timedelta(seconds=elapsed_rounded))
        
        results = []
        print(f'With labeled_ratio : {labeled_ratio}\n')
        for epoch in range(epoch_num):
            # Measure how long the each epoch takes.
            t0 = time.time()
            
            self.bert.train()
            self.generator.train()
            self.discriminator.train()

            tr_g_loss = 0
            tr_d_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0

            print(f'Epoch {epoch+1}/{epoch_num} :')
            for step, batch in enumerate(train_dataloader):

                src_input_ids, src_input_mask, label_ids, b_label_mask = batch # unpacking
                src_input_ids = src_input_ids.to(device)
                src_input_mask = src_input_mask.to(device)
                label_ids = label_ids.to(device)
                b_label_mask = b_label_mask.to(device)

                self.bert.zero_grad()
                self.discriminator.zero_grad()

                # Real representations
                embedding = self.bert(src_input_ids, attention_mask=src_input_mask)
                D_real_features, D_real_logits, D_real_probs = self.discriminator(embedding)

                # Random noise
                noise = torch.zeros(src_input_ids.shape[0],noise_size, device=device).uniform_(0, 1)#.requires_grad_(True)
                gen_rep = self.generator(noise)
                
                ############################
                # Update Generator network: minimize -E[log(D(G(z)))] + feature_matching LOSS
                ###########################
                D_fake_features, D_fake_logits, D_fake_probs = self.discriminator(gen_rep) # .detach()

                g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + epsilon))
                g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2))
                g_loss = g_loss_d + g_feat_reg

                ############################
                #  Update Discriminator network: minimize -E[log(D(x)) + log(1 - D(G(z)))]
                ###########################
                logits = D_real_logits[:,0:-1]
                log_probs = F.log_softmax(logits, dim=-1)
                # The discriminator provides an output for labeled and unlabeled real data
                # so the loss evaluated for unlabeled data is ignored (masked)
                label2one_hot = torch.nn.functional.one_hot(label_ids, len(label_list))
                per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
                per_example_loss = torch.masked_select(per_example_loss, b_label_mask.to(device))
                labeled_example_count = per_example_loss.type(torch.float32).numel()

                # It may be the case that a batch does not contain labeled examples,
                # so the "supervised loss" in this case is not evaluated
                if labeled_example_count == 0:
                    D_L_Supervised = 0
                else:
                    D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)

                D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
                D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
                d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

                #---------------------------------
                #  OPTIMIZATION
                #---------------------------------
                self.gen_optimizer.zero_grad()
                self.dis_optimizer.zero_grad()

                # Calculate weigth updates
                # retain_graph=True is required since the underlying graph will be deleted after backward
                g_loss.backward(retain_graph=True)
                d_loss.backward()

                # Apply modifications
                self.gen_optimizer.step()
                self.dis_optimizer.step()

                # Save the losses to print them later
                tr_g_loss += g_loss.item()
                tr_d_loss += d_loss.item()

            # Output training stats
                if report:
                    if step % 100 == 0:
                        print('''\n[Epoch %d/%d][iter %d/%d]\ttotal Loss_D: %.4f\ttotal Loss_G: %.4f,\n
                        details of Loss_D:  Loss_D_sup: %.4f,\t-E[log(D(x))]: %.4f,\t-E[log(1-D(G(z)))]: %.4f,\n
                        details of Loss_G:  -E[log(D(G(z)))]: %.4f,\tLoss_G_feat: %.4f\n
                        D(x): %.4f\tD(G(z)): %.4f'''
                          %(epoch+1, epoch_num, step, len(train_dataloader),
                            d_loss.mean().item(), g_loss.mean().item(), 
                            D_L_Supervised, D_L_unsupervised1U, D_L_unsupervised2U,
                            g_loss_d, g_feat_reg,
                            torch.mean(D_real_probs[:, -1]).item(), 
                              torch.mean(D_fake_probs[:, -1]).item() ))
                        
                        # save checkpoints
                        self.save_checkpoint(epoch)

            # Update the learning rate with the scheduler
            self.scheduler_d.step()
            self.scheduler_g.step()

            # Calculate the average loss over all of the batches.
            avg_train_loss_g = tr_g_loss / len(train_dataloader)
            avg_train_loss_d = tr_d_loss / len(train_dataloader)

            # Measure how long this epoch took.
            epoch_time = format_time(time.time() - t0)

            print("")
            print(f' Training stats at epoch {epoch+1}: ')
            print(f' G_loss = {tr_g_loss}, D_loss = {tr_d_loss} \n')
            print(" Training epcoh took: {:}".format(epoch_time))
            
            if val_dataloader != None:
                self.bert.eval()
                self.discriminator.eval() 

                all_preds = np.array([])
                all_label_ids = np.array([])
                eval_loss = 0
                nb_eval_steps = 0
                for val_step, batch in enumerate(val_dataloader):
                    src_input_ids, src_input_mask, label_ids, _ = batch # unpacking
                    src_input_ids = src_input_ids.to(device)
                    src_input_mask = src_input_mask.to(device)
                    label_ids = label_ids.to(device)


                    with torch.no_grad():
                        doc_rep = self.bert(src_input_ids, attention_mask=src_input_mask)
                        _, logits, _ = self.discriminator(doc_rep)
#                         probs = torch.nn.functional.softmax(logits[:,0:-1], dim=-1)
                        probs = logits[:,0:-1]
                        tmp_eval_loss = self.nll_loss(probs, label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()

                    probs = probs.detach().cpu().numpy()
                    label_ids = label_ids.to('cpu').numpy()
                    all_preds = np.append(all_preds, np.argmax(probs, axis=1))
                    all_label_ids = np.append(all_label_ids, label_ids)

                    nb_eval_steps += 1

                eval_loss = eval_loss / nb_eval_steps
#                 precision, recall, f1, _ = precision_recall_fscore_support(all_label_ids, all_preds, average="micro",
#                                                                          labels=list(range(0,len(label_list))))
                mcc = matthews_corrcoef(all_preds, all_label_ids)
                acc = (all_preds == all_label_ids).sum().item() / all_label_ids.shape[0]


                # Output validation stats
                print(f'Validation stats: ')
                print('Loss: %.4f,\tAccuracy: %.4f,\tmcc: %.4f,'
                  %(eval_loss,acc,mcc))
                
            result = {
                'epoch': epoch_time,
                "gen_loss": tr_g_loss,
                "dis_loss": tr_d_loss,
                "eval_loss": eval_loss,
                "mcc": mcc,
                "acc": acc,
                'epoch_time': epoch_time}
#                 "precision_micro": precision,
#                 "recall_micro": recall,
#                 "f1_micro": f1,
                

            results.append(result)
            # save checkpoints
            self.save_checkpoint(epoch,results)
            
            # seva best model
            if acc > best_score:
                best_score = acc 
                self.save_checkpoint(epoch ,result,best=True)
            
    def save_checkpoint(self,epoch,results=None,best=False):
        checkpoint = {
            'epoch': epoch + 1,
            'bert_state_dict': self.bert.state_dict(),
            'disc_state_dict': self.discriminator.state_dict(),
            'gen_state_dict': self.generator.state_dict(),
            'disc_optimizer_state_dict': self.dis_optimizer.state_dict(),
            'gen_optimizer_state_dict': self.gen_optimizer.state_dict(),
            }
        # for colab : /content/drive/My Drive/Project/checkpoints
        if best:
            torch.save(checkpoint, f'{self.path}/GAN_BERT_checkpoint_BEST.pth')
            if results!= None:
                with open(f'{self.path}/results_BEST.pickle', 'wb') as file:
                    pickle.dump(results, file)
            
        else:
            torch.save(checkpoint, f'{self.path}/GAN_BERT_checkpoint{epoch+1}.pth')
            if results!= None:
                with open(f'{self.path}/results.pickle', 'wb') as file:
                    pickle.dump(results, file)
    
    def test(self, test_dataloader):
        self.bert.eval()
        self.discriminator.eval()

        all_preds = np.array([])
        all_label_ids = np.array([])
        eval_loss = 0
        nb_eval_steps = 0
        nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)
        
        for val_step, batch in enumerate(test_dataloader):
            src_input_ids, src_input_mask, label_ids, _ = batch # unpacking
            src_input_ids = src_input_ids.to(device)
            src_input_mask = src_input_mask.to(device)
            label_ids = label_ids.to(device)


            with torch.no_grad():
                doc_rep = self.bert(src_input_ids, attention_mask=src_input_mask)
                _, logits, _ = self.discriminator(doc_rep)
            # probs = torch.nn.functional.softmax(logits[:,0:-1], dim=-1)
            probs = logits[:,0:-1]    
            tmp_eval_loss = nll_loss(probs, label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()

            probs = probs.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            
            all_preds = np.append(all_preds, np.argmax(probs, axis=1))
            all_label_ids = np.append(all_label_ids, label_ids)

            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps


        mcc = matthews_corrcoef(all_preds, all_label_ids)
        acc = (all_preds == all_label_ids).sum().item() / all_label_ids.shape[0]
        
        # Output validation stats
        print(f'Test stats: ')
        print('Total loss: %.4f,\tAccuracy: %.4f,\tmcc: %.4f,'
          %(eval_loss,acc,mcc) ) 
        return all_preds

    @staticmethod
    def rename_keys(original_ordered_dict):
        new_keys_mapping = dict()
        for a in list(original_ordered_dict.keys()):
            new_keys_mapping[a] = a.split('module.')[-1] 

        return OrderedDict((new_keys_mapping.get(k, k), v) for k, v in original_ordered_dict.items())

    
    def load_checkpoint(self,checkpoint_path):
        state_dict = torch.load(checkpoint_path)
        
        if (device.type == 'cuda') and (ngpu > 1):
            # Load the state dictionary into the model
            self.bert.load_state_dict(state_dict['bert_state_dict'])
            self.discriminator.load_state_dict(state_dict['disc_state_dict'])
            self.generator.load_state_dict(state_dict['gen_state_dict'])
            self.dis_optimizer.load_state_dict(state_dict['disc_optimizer_state_dict'])
            self.gen_optimizer.load_state_dict(state_dict['gen_optimizer_state_dict'])
            
        else: 
            self.bert.load_state_dict(self.rename_keys(state_dict['bert_state_dict']))
            self.discriminator.load_state_dict(self.rename_keys(state_dict['disc_state_dict']))
            self.generator.load_state_dict(self.rename_keys(state_dict['gen_state_dict']))
            self.dis_optimizer.load_state_dict(state_dict['disc_optimizer_state_dict'])
            self.gen_optimizer.load_state_dict(state_dict['gen_optimizer_state_dict'])

        print('Loaded !')
            
    def plot_results():
        pass
            
    def show_tensorboard():
        pass

In [None]:
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()
# torch.cuda.empty_cache()  

In [32]:
# !ls
!mkdir part3
!ls

SubtaskB  part3


In [33]:
ganbert = GANBERT(discriminator, generator1, bert,gen_optimizer, dis_optimizer,
                scheduler_d,scheduler_g, path='/kaggle/working/part3') 

In [34]:
ganbert.trainer(epoch_num,label_list,labeled_ratio,train_dataloader, val_dataloader,report=True)

With labeled_ratio : 0.5

Epoch 1/8 :

[Epoch 1/8][iter 0/444]	total Loss_D: 4.0050	total Loss_G: 0.1567,

                        details of Loss_D:  Loss_D_sup: 1.7802,	-E[log(D(x))]: 0.1595,	-E[log(1-D(G(z)))]: 2.0653,

                        details of Loss_G:  -E[log(D(G(z)))]: 0.1381,	Loss_G_feat: 0.0186

                        D(x): 0.1464	D(G(z)): 0.1287

[Epoch 1/8][iter 100/444]	total Loss_D: 4.1242	total Loss_G: 0.1571,

                        details of Loss_D:  Loss_D_sup: 1.8994,	-E[log(D(x))]: 0.1696,	-E[log(1-D(G(z)))]: 2.0551,

                        details of Loss_G:  -E[log(D(G(z)))]: 0.1400,	Loss_G_feat: 0.0171

                        D(x): 0.1549	D(G(z)): 0.1303

[Epoch 1/8][iter 200/444]	total Loss_D: 4.1256	total Loss_G: 0.1593,

                        details of Loss_D:  Loss_D_sup: 1.9195,	-E[log(D(x))]: 0.1614,	-E[log(1-D(G(z)))]: 2.0447,

                        details of Loss_G:  -E[log(D(G(z)))]: 0.1412,	Loss_G_feat: 0.0181

                        

In [35]:
test_res = ganbert.test(test_dataloader) 

Test stats: 
Total loss: 2.1642,	Accuracy: 0.5450,	mcc: 0.4876,


The `Matthews correlation coefficient` , is a measure of the quality of classifications in machine learning. It takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. It's defined in the range from -1 to 1, with 1 being a perfect prediction, 0 being the result of a random prediction, and -1 indicating total disagreement between prediction and observation.

---
### Load the best model

In [179]:
discriminator = Discriminator(input_size,num_classes).to(device)
generator1 = Generator1(noise_size,input_size).to(device)
bert = BERT_Embedder(bert_model).to(device)

if (device.type == 'cuda') and (ngpu > 1):
    discriminator = nn.DataParallel(discriminator, list(range(ngpu)))
    generator = nn.DataParallel(generator1, list(range(ngpu)))    
    bert = nn.DataParallel(bert, list(range(ngpu)))
    
gen_optimizer = torch.optim.AdamW(generator1.parameters(), lr=learning_rate)
dis_optimizer = torch.optim.AdamW(list(bert.parameters()) + list(discriminator.parameters()), lr=learning_rate)

#scheduler
num_train_examples = len(train_dataset)
num_train_steps = int(num_train_examples / batch_size * epoch_num) 
num_warmup_steps = int(num_train_steps * warmup_proportion)

scheduler_d = get_constant_schedule_with_warmup(dis_optimizer, 
                                       num_warmup_steps = num_warmup_steps)
scheduler_g = get_constant_schedule_with_warmup(gen_optimizer, 
                                       num_warmup_steps = num_warmup_steps) 

ganbert_best = GANBERT(discriminator, generator1, bert,gen_optimizer, dis_optimizer,
                scheduler_d,scheduler_g, path='/kaggle/working/part3') 

In [180]:
path = '/kaggle/working/part3/GAN_BERT_checkpoint_BEST.pth'
ganbert_best.load_checkpoint(path)

Loaded !


In [182]:
test_res = ganbert_best.test(test_dataloader) 

Test stats: 
Total loss: 2.1513,	Accuracy: 0.5450,	mcc: 0.4876,


In [None]:
plt.figure(figsize=(10,5))
plt.title("Generator and Discriminator Loss During Training")
plt.plot(G_losses,label="G")
plt.plot(D_losses,label="D")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()